@ls-stack/agent-eval 0.32.0 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Dc6vvHRL.mjs → app-DOE-crd0.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-BJpxc61J.css +1 -0
- package/dist/apps/web/dist/assets/index-ol64metU.js +140 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-huuJbDNb.mjs → cli-CG66f0GN.mjs} +3 -3
- package/dist/index.d.mts +62 -62
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-ZpN7xty_.mjs → runOrchestration-CYhn4DJ0.mjs} +19 -55
- package/dist/{runner-BPXPvinB.mjs → runner-D5i7HLcE.mjs} +1 -1
- package/dist/{runner-Dkol2ukD.mjs → runner-j9b6g0h0.mjs} +2 -2
- package/dist/src-B0b1vgk0.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +7 -4
- package/dist/apps/web/dist/assets/index-BNQnbfi0.js +0 -118
- package/dist/apps/web/dist/assets/index-BPMMRktE.css +0 -1
- package/dist/src-1Qvuh0NH.mjs +0 -3
|
@@ -1688,7 +1688,7 @@ function createTraceCache(generateSpanId) {
|
|
|
1688
1688
|
const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
|
|
1689
1689
|
if (hit) {
|
|
1690
1690
|
const storedAt = hit.storedAt;
|
|
1691
|
-
const age =
|
|
1691
|
+
const age = getRealDateNowMs() - new Date(storedAt).getTime();
|
|
1692
1692
|
recordCacheRef(scope, activeSpan, {
|
|
1693
1693
|
type: "value",
|
|
1694
1694
|
name: info.name,
|
|
@@ -2149,7 +2149,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
2149
2149
|
mergeSpanAttributes(spanRecord, {
|
|
2150
2150
|
"cache.status": "hit",
|
|
2151
2151
|
"cache.storedAt": storedAt,
|
|
2152
|
-
"cache.age":
|
|
2152
|
+
"cache.age": getRealDateNowMs() - new Date(storedAt).getTime()
|
|
2153
2153
|
});
|
|
2154
2154
|
const recording = deserializeCacheRecording(hit.recording);
|
|
2155
2155
|
replayRecording(scope, spanRecord, recording, { generateSpanId });
|
|
@@ -5025,30 +5025,6 @@ function mergeOverrides(base, override) {
|
|
|
5025
5025
|
};
|
|
5026
5026
|
}
|
|
5027
5027
|
/**
|
|
5028
|
-
* Populate `target` with `ColumnDef` entries for any keys in `columns`
|
|
5029
|
-
* that aren't already present, applying user-supplied `overrides` and
|
|
5030
|
-
* flagging score columns declared via `scores`.
|
|
5031
|
-
*/
|
|
5032
|
-
function mergeColumnDefs(target, columns, overrides, scores, manualScores) {
|
|
5033
|
-
const scoreKeys = new Set(Object.keys(scores ?? {}));
|
|
5034
|
-
const manualScoreKeys = new Set(Object.keys(manualScores ?? {}));
|
|
5035
|
-
const overrideMap = overrides ?? {};
|
|
5036
|
-
for (const [key, value] of Object.entries(columns)) {
|
|
5037
|
-
if (target.has(key)) continue;
|
|
5038
|
-
const override = mergeOverrides(getScoreOverride(scores?.[key]) ?? manualScores?.[key], overrideMap[key]);
|
|
5039
|
-
const isScore = scoreKeys.has(key) || manualScoreKeys.has(key);
|
|
5040
|
-
target.set(key, createColumnDef({
|
|
5041
|
-
key,
|
|
5042
|
-
override,
|
|
5043
|
-
scoreDef: scores?.[key],
|
|
5044
|
-
manualScoreDef: manualScores?.[key],
|
|
5045
|
-
inferredKind: isScore ? "number" : inferKind(value),
|
|
5046
|
-
isScore,
|
|
5047
|
-
isManualScore: manualScoreKeys.has(key)
|
|
5048
|
-
}));
|
|
5049
|
-
}
|
|
5050
|
-
}
|
|
5051
|
-
/**
|
|
5052
5028
|
* Build the column definitions declared directly on an eval before any runtime
|
|
5053
5029
|
* output values exist. This lets discovery metadata describe authored rich
|
|
5054
5030
|
* output columns even for runs created by another process.
|
|
@@ -5092,30 +5068,30 @@ function buildDeclaredColumnDefs(overrides, scores, manualScores) {
|
|
|
5092
5068
|
}
|
|
5093
5069
|
return [...declaredDefs.values()];
|
|
5094
5070
|
}
|
|
5095
|
-
/** Infer a `ColumnKind` from a runtime value when no override is set. */
|
|
5096
|
-
function inferKind(value) {
|
|
5097
|
-
if (typeof value === "number") return "number";
|
|
5098
|
-
if (typeof value === "boolean") return "boolean";
|
|
5099
|
-
return "string";
|
|
5100
|
-
}
|
|
5101
5071
|
/**
|
|
5102
5072
|
* Coerce an arbitrary runtime value into a serializable `CellValue`.
|
|
5103
|
-
*
|
|
5073
|
+
* JSON-safe objects and arrays stay structured so saved run artifacts preserve
|
|
5074
|
+
* the authored output shape. Rich runtime values fall back to `JSON.stringify`.
|
|
5104
5075
|
*/
|
|
5105
|
-
function toCellValue(value, override) {
|
|
5076
|
+
function toCellValue(value) {
|
|
5106
5077
|
if (value === null) return null;
|
|
5107
5078
|
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return value;
|
|
5108
5079
|
if (value === void 0) return void 0;
|
|
5109
|
-
|
|
5110
|
-
|
|
5111
|
-
|
|
5112
|
-
}
|
|
5113
|
-
if (override?.format === "json") {
|
|
5080
|
+
const fileRef = fileRefSchema.safeParse(value);
|
|
5081
|
+
if (fileRef.success) return fileRef.data;
|
|
5082
|
+
if (isPlainJsonContainer(value)) {
|
|
5114
5083
|
const parsed = jsonCellSchema.safeParse(value);
|
|
5115
5084
|
if (parsed.success) return parsed.data;
|
|
5116
5085
|
}
|
|
5086
|
+
if (value instanceof Date) return value.toISOString();
|
|
5117
5087
|
return JSON.stringify(value);
|
|
5118
5088
|
}
|
|
5089
|
+
function isPlainJsonContainer(value) {
|
|
5090
|
+
if (Array.isArray(value)) return true;
|
|
5091
|
+
if (typeof value !== "object" || value === null) return false;
|
|
5092
|
+
const prototype = Object.getPrototypeOf(value);
|
|
5093
|
+
return prototype === Object.prototype || prototype === null;
|
|
5094
|
+
}
|
|
5119
5095
|
function inferKindFromFormat(format) {
|
|
5120
5096
|
if (format === "boolean") return "boolean";
|
|
5121
5097
|
if (format === "duration" || format === "percent" || format === "number" || format === "passFail" || format === "stars") return "number";
|
|
@@ -6586,7 +6562,7 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6586
6562
|
}
|
|
6587
6563
|
}
|
|
6588
6564
|
async function runCase(params) {
|
|
6589
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay,
|
|
6565
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6590
6566
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
6591
6567
|
evalId,
|
|
6592
6568
|
evalFilePath,
|
|
@@ -6739,12 +6715,6 @@ async function runCase(params) {
|
|
|
6739
6715
|
const status = nonAssertError ? "error" : passed ? "pass" : "fail";
|
|
6740
6716
|
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
6741
6717
|
const columns = {};
|
|
6742
|
-
const columnOverrides = mergeDefaultColumns({
|
|
6743
|
-
globalColumns,
|
|
6744
|
-
columns: evalDef.columns,
|
|
6745
|
-
globalRemove: globalRemoveDefaultConfig,
|
|
6746
|
-
evalRemove: evalDef.removeDefaultConfig
|
|
6747
|
-
});
|
|
6748
6718
|
for (const [key, value] of Object.entries(scope.outputs)) {
|
|
6749
6719
|
const cell = isBlob(value) ? await persistInlineArtifact({
|
|
6750
6720
|
artifactDir,
|
|
@@ -6753,7 +6723,7 @@ async function runCase(params) {
|
|
|
6753
6723
|
outputKey: key,
|
|
6754
6724
|
trial,
|
|
6755
6725
|
value
|
|
6756
|
-
}) : toCellValue(value
|
|
6726
|
+
}) : toCellValue(value);
|
|
6757
6727
|
if (cell !== void 0) columns[key] = cell;
|
|
6758
6728
|
}
|
|
6759
6729
|
for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
|
|
@@ -6989,7 +6959,6 @@ async function finalizePreparedCase(params) {
|
|
|
6989
6959
|
const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
|
|
6990
6960
|
runState.cases.push(winningTrial.caseRow);
|
|
6991
6961
|
runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
|
|
6992
|
-
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
6993
6962
|
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
6994
6963
|
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
6995
6964
|
else runState.summary.failedCases++;
|
|
@@ -7106,13 +7075,13 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7106
7075
|
globalRemove: config.removeDefaultConfig
|
|
7107
7076
|
});
|
|
7108
7077
|
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
7109
|
-
const accumulatedColumns = new Map(declaredColumnDefs.map((def) => [def.key, def]));
|
|
7110
7078
|
const validatedCharts = validateCharts({
|
|
7111
7079
|
charts: defaultConfig.charts,
|
|
7112
7080
|
columnDefs: declaredColumnDefs,
|
|
7113
7081
|
evalId: evalMeta.id
|
|
7114
7082
|
});
|
|
7115
7083
|
for (const warning of validatedCharts.warnings) console.warn(warning);
|
|
7084
|
+
evalMeta.columnDefs = declaredColumnDefs;
|
|
7116
7085
|
evalMeta.stats = defaultConfig.stats;
|
|
7117
7086
|
evalMeta.charts = validatedCharts.charts;
|
|
7118
7087
|
const evalCaseRows = [];
|
|
@@ -7121,13 +7090,9 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7121
7090
|
const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
|
|
7122
7091
|
const preparedEval = {
|
|
7123
7092
|
evalMeta,
|
|
7124
|
-
accumulatedColumns,
|
|
7125
7093
|
evalCaseRows,
|
|
7126
7094
|
preparedCases,
|
|
7127
|
-
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
7128
|
-
mergeColumns: (columns) => {
|
|
7129
|
-
mergeColumnDefs(accumulatedColumns, columns, defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
7130
|
-
}
|
|
7095
|
+
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys])
|
|
7131
7096
|
};
|
|
7132
7097
|
preparedEvals.push(preparedEval);
|
|
7133
7098
|
for (const evalCase of cases) {
|
|
@@ -7236,7 +7201,6 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7236
7201
|
onCaseFinished,
|
|
7237
7202
|
emitEvent
|
|
7238
7203
|
});
|
|
7239
|
-
preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
|
|
7240
7204
|
lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
|
|
7241
7205
|
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
|
|
7242
7206
|
latestRunInfoMap.set(preparedEval.evalMeta.key, {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-Dkol2ukD.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-j9b6g0h0.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-huuJbDNb.mjs";
|
|
2
|
-
import "./src-1Qvuh0NH.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-CG66f0GN.mjs";
|
|
2
|
+
import "./src-B0b1vgk0.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.32.0",
|
|
3
|
+
"version": "0.34.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -31,9 +31,9 @@
|
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@types/node": "^24.7.2",
|
|
33
33
|
"typescript": "^5.9.2",
|
|
34
|
-
"@agent-evals/sdk": "0.0.1",
|
|
35
34
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/shared": "0.0.1"
|
|
35
|
+
"@agent-evals/shared": "0.0.1",
|
|
36
|
+
"@agent-evals/sdk": "0.0.1"
|
|
37
37
|
},
|
|
38
38
|
"scripts": {
|
|
39
39
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -299,10 +299,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
299
299
|
|
|
300
300
|
- `setEvalOutput(key, value)` writes reviewable data for the case. Values are
|
|
301
301
|
plain data (strings, numbers, booleans, JSON-safe objects) plus native
|
|
302
|
-
`Blob`/`File` or `FileRef` variants for media columns.
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
302
|
+
`Blob`/`File` or `FileRef` variants for media columns. Stored output values
|
|
303
|
+
are not coerced by `columns.format`; column formats only control
|
|
304
|
+
visualization. Native `Blob`/`File` values are copied to run artifacts
|
|
305
|
+
because saved run files are JSON. Inside `execute`, prefer the context
|
|
306
|
+
`setOutput(key, value)` helper when writing schema-backed outputs; it is
|
|
307
|
+
typed from the eval's outputs generic. Keep `setEvalOutput` for shared
|
|
308
|
+
workflow code that does not receive the execute context.
|
|
306
309
|
- Use `incrementEvalOutput(key, delta)` for numeric totals,
|
|
307
310
|
`appendToEvalOutput(key, value)` for arrays that preserve existing scalar
|
|
308
311
|
values, and `mergeEvalOutput(key, patch)` for shallow object updates.
|