@ls-stack/agent-eval 0.32.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1688,7 +1688,7 @@ function createTraceCache(generateSpanId) {
1688
1688
  const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
1689
1689
  if (hit) {
1690
1690
  const storedAt = hit.storedAt;
1691
- const age = Date.now() - new Date(storedAt).getTime();
1691
+ const age = getRealDateNowMs() - new Date(storedAt).getTime();
1692
1692
  recordCacheRef(scope, activeSpan, {
1693
1693
  type: "value",
1694
1694
  name: info.name,
@@ -2149,7 +2149,7 @@ async function traceSpanInternal(info, fn) {
2149
2149
  mergeSpanAttributes(spanRecord, {
2150
2150
  "cache.status": "hit",
2151
2151
  "cache.storedAt": storedAt,
2152
- "cache.age": Date.now() - new Date(storedAt).getTime()
2152
+ "cache.age": getRealDateNowMs() - new Date(storedAt).getTime()
2153
2153
  });
2154
2154
  const recording = deserializeCacheRecording(hit.recording);
2155
2155
  replayRecording(scope, spanRecord, recording, { generateSpanId });
@@ -5025,30 +5025,6 @@ function mergeOverrides(base, override) {
5025
5025
  };
5026
5026
  }
5027
5027
  /**
5028
- * Populate `target` with `ColumnDef` entries for any keys in `columns`
5029
- * that aren't already present, applying user-supplied `overrides` and
5030
- * flagging score columns declared via `scores`.
5031
- */
5032
- function mergeColumnDefs(target, columns, overrides, scores, manualScores) {
5033
- const scoreKeys = new Set(Object.keys(scores ?? {}));
5034
- const manualScoreKeys = new Set(Object.keys(manualScores ?? {}));
5035
- const overrideMap = overrides ?? {};
5036
- for (const [key, value] of Object.entries(columns)) {
5037
- if (target.has(key)) continue;
5038
- const override = mergeOverrides(getScoreOverride(scores?.[key]) ?? manualScores?.[key], overrideMap[key]);
5039
- const isScore = scoreKeys.has(key) || manualScoreKeys.has(key);
5040
- target.set(key, createColumnDef({
5041
- key,
5042
- override,
5043
- scoreDef: scores?.[key],
5044
- manualScoreDef: manualScores?.[key],
5045
- inferredKind: isScore ? "number" : inferKind(value),
5046
- isScore,
5047
- isManualScore: manualScoreKeys.has(key)
5048
- }));
5049
- }
5050
- }
5051
- /**
5052
5028
  * Build the column definitions declared directly on an eval before any runtime
5053
5029
  * output values exist. This lets discovery metadata describe authored rich
5054
5030
  * output columns even for runs created by another process.
@@ -5092,30 +5068,30 @@ function buildDeclaredColumnDefs(overrides, scores, manualScores) {
5092
5068
  }
5093
5069
  return [...declaredDefs.values()];
5094
5070
  }
5095
- /** Infer a `ColumnKind` from a runtime value when no override is set. */
5096
- function inferKind(value) {
5097
- if (typeof value === "number") return "number";
5098
- if (typeof value === "boolean") return "boolean";
5099
- return "string";
5100
- }
5101
5071
  /**
5102
5072
  * Coerce an arbitrary runtime value into a serializable `CellValue`.
5103
- * Non-primitive values fall back to `JSON.stringify`.
5073
+ * JSON-safe objects and arrays stay structured so saved run artifacts preserve
5074
+ * the authored output shape. Rich runtime values fall back to `JSON.stringify`.
5104
5075
  */
5105
- function toCellValue(value, override = void 0) {
5076
+ function toCellValue(value) {
5106
5077
  if (value === null) return null;
5107
5078
  if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return value;
5108
5079
  if (value === void 0) return void 0;
5109
- if (override?.format === "image" || override?.format === "audio" || override?.format === "video" || override?.format === "file") {
5110
- const parsed = fileRefSchema.safeParse(value);
5111
- if (parsed.success) return parsed.data;
5112
- }
5113
- if (override?.format === "json") {
5080
+ const fileRef = fileRefSchema.safeParse(value);
5081
+ if (fileRef.success) return fileRef.data;
5082
+ if (isPlainJsonContainer(value)) {
5114
5083
  const parsed = jsonCellSchema.safeParse(value);
5115
5084
  if (parsed.success) return parsed.data;
5116
5085
  }
5086
+ if (value instanceof Date) return value.toISOString();
5117
5087
  return JSON.stringify(value);
5118
5088
  }
5089
+ function isPlainJsonContainer(value) {
5090
+ if (Array.isArray(value)) return true;
5091
+ if (typeof value !== "object" || value === null) return false;
5092
+ const prototype = Object.getPrototypeOf(value);
5093
+ return prototype === Object.prototype || prototype === null;
5094
+ }
5119
5095
  function inferKindFromFormat(format) {
5120
5096
  if (format === "boolean") return "boolean";
5121
5097
  if (format === "duration" || format === "percent" || format === "number" || format === "passFail" || format === "stars") return "number";
@@ -6586,7 +6562,7 @@ async function runDeriveFromTracingConfig(params) {
6586
6562
  }
6587
6563
  }
6588
6564
  async function runCase(params) {
6589
- const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6565
+ const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6590
6566
  const scopedIdPrefix = buildScopedEvalIdPrefix({
6591
6567
  evalId,
6592
6568
  evalFilePath,
@@ -6739,12 +6715,6 @@ async function runCase(params) {
6739
6715
  const status = nonAssertError ? "error" : passed ? "pass" : "fail";
6740
6716
  const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
6741
6717
  const columns = {};
6742
- const columnOverrides = mergeDefaultColumns({
6743
- globalColumns,
6744
- columns: evalDef.columns,
6745
- globalRemove: globalRemoveDefaultConfig,
6746
- evalRemove: evalDef.removeDefaultConfig
6747
- });
6748
6718
  for (const [key, value] of Object.entries(scope.outputs)) {
6749
6719
  const cell = isBlob(value) ? await persistInlineArtifact({
6750
6720
  artifactDir,
@@ -6753,7 +6723,7 @@ async function runCase(params) {
6753
6723
  outputKey: key,
6754
6724
  trial,
6755
6725
  value
6756
- }) : toCellValue(value, columnOverrides?.[key]);
6726
+ }) : toCellValue(value);
6757
6727
  if (cell !== void 0) columns[key] = cell;
6758
6728
  }
6759
6729
  for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
@@ -6989,7 +6959,6 @@ async function finalizePreparedCase(params) {
6989
6959
  const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
6990
6960
  runState.cases.push(winningTrial.caseRow);
6991
6961
  runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
6992
- preparedEval.mergeColumns(winningTrial.caseDetail.columns);
6993
6962
  if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
6994
6963
  else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
6995
6964
  else runState.summary.failedCases++;
@@ -7106,13 +7075,13 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7106
7075
  globalRemove: config.removeDefaultConfig
7107
7076
  });
7108
7077
  const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
7109
- const accumulatedColumns = new Map(declaredColumnDefs.map((def) => [def.key, def]));
7110
7078
  const validatedCharts = validateCharts({
7111
7079
  charts: defaultConfig.charts,
7112
7080
  columnDefs: declaredColumnDefs,
7113
7081
  evalId: evalMeta.id
7114
7082
  });
7115
7083
  for (const warning of validatedCharts.warnings) console.warn(warning);
7084
+ evalMeta.columnDefs = declaredColumnDefs;
7116
7085
  evalMeta.stats = defaultConfig.stats;
7117
7086
  evalMeta.charts = validatedCharts.charts;
7118
7087
  const evalCaseRows = [];
@@ -7121,13 +7090,9 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7121
7090
  const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
7122
7091
  const preparedEval = {
7123
7092
  evalMeta,
7124
- accumulatedColumns,
7125
7093
  evalCaseRows,
7126
7094
  preparedCases,
7127
- scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
7128
- mergeColumns: (columns) => {
7129
- mergeColumnDefs(accumulatedColumns, columns, defaultConfig.columns, evalDef.scores, evalDef.manualScores);
7130
- }
7095
+ scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys])
7131
7096
  };
7132
7097
  preparedEvals.push(preparedEval);
7133
7098
  for (const evalCase of cases) {
@@ -7236,7 +7201,6 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7236
7201
  onCaseFinished,
7237
7202
  emitEvent
7238
7203
  });
7239
- preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
7240
7204
  lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
7241
7205
  const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
7242
7206
  latestRunInfoMap.set(preparedEval.evalMeta.key, {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-Dkol2ukD.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-j9b6g0h0.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-huuJbDNb.mjs";
2
- import "./src-1Qvuh0NH.mjs";
1
+ import { n as createRunner } from "./cli-CG66f0GN.mjs";
2
+ import "./src-B0b1vgk0.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-CYhn4DJ0.mjs";
2
+ import "./cli-CG66f0GN.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.32.0",
3
+ "version": "0.34.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -31,9 +31,9 @@
31
31
  "devDependencies": {
32
32
  "@types/node": "^24.7.2",
33
33
  "typescript": "^5.9.2",
34
- "@agent-evals/sdk": "0.0.1",
35
34
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/shared": "0.0.1"
35
+ "@agent-evals/shared": "0.0.1",
36
+ "@agent-evals/sdk": "0.0.1"
37
37
  },
38
38
  "scripts": {
39
39
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -299,10 +299,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
299
299
 
300
300
  - `setEvalOutput(key, value)` writes reviewable data for the case. Values are
301
301
  plain data (strings, numbers, booleans, JSON-safe objects) plus native
302
- `Blob`/`File` or `FileRef` variants for media columns. Inside `execute`,
303
- prefer the context `setOutput(key, value)` helper when writing schema-backed
304
- outputs; it is typed from the eval's outputs generic. Keep `setEvalOutput`
305
- for shared workflow code that does not receive the execute context.
302
+ `Blob`/`File` or `FileRef` variants for media columns. Stored output values
303
+ are not coerced by `columns.format`; column formats only control
304
+ visualization. Native `Blob`/`File` values are copied to run artifacts
305
+ because saved run files are JSON. Inside `execute`, prefer the context
306
+ `setOutput(key, value)` helper when writing schema-backed outputs; it is
307
+ typed from the eval's outputs generic. Keep `setEvalOutput` for shared
308
+ workflow code that does not receive the execute context.
306
309
  - Use `incrementEvalOutput(key, delta)` for numeric totals,
307
310
  `appendToEvalOutput(key, value)` for arrays that preserve existing scalar
308
311
  values, and `mergeEvalOutput(key, patch)` for shallow object updates.