@ls-stack/agent-eval 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1330,7 +1330,7 @@ const errorCoreFields = new Set([
1330
1330
  "stack",
1331
1331
  "capturedAt"
1332
1332
  ]);
1333
- function isRecord$4(value) {
1333
+ function isRecord$5(value) {
1334
1334
  return typeof value === "object" && value !== null && !Array.isArray(value);
1335
1335
  }
1336
1336
  function formatUnknownErrorMessage(error) {
@@ -1358,7 +1358,7 @@ function normalizeTraceError(error, capturedAt = void 0) {
1358
1358
  stack: error.stack,
1359
1359
  capturedAt
1360
1360
  };
1361
- if (isRecord$4(error)) {
1361
+ if (isRecord$5(error)) {
1362
1362
  const extraFields = getErrorExtraFields(error);
1363
1363
  const name = typeof error.name === "string" ? error.name : void 0;
1364
1364
  const stack = typeof error.stack === "string" ? error.stack : void 0;
@@ -1383,7 +1383,7 @@ function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedA
1383
1383
  return (additionalWarnings.length > 0 ? [warningOrWarnings, ...additionalWarnings] : Array.isArray(warningOrWarnings) ? warningOrWarnings : [warningOrWarnings]).map((warning) => normalizeTraceError(warning, capturedAt));
1384
1384
  }
1385
1385
  function isCaptureEvalSpanErrorOptions(value) {
1386
- if (!isRecord$4(value)) return false;
1386
+ if (!isRecord$5(value)) return false;
1387
1387
  const keys = Object.keys(value);
1388
1388
  if (keys.length === 0) return false;
1389
1389
  if (!keys.every((key) => key === "level")) return false;
@@ -1506,7 +1506,7 @@ function createTraceCache(generateSpanId) {
1506
1506
  namespace,
1507
1507
  operationType: "value",
1508
1508
  operationName: info.name,
1509
- storedAt: (/* @__PURE__ */ new Date()).toISOString(),
1509
+ storedAt: new Date(getRealDateNowMs()).toISOString(),
1510
1510
  codeFingerprint: cacheCtx.codeFingerprint,
1511
1511
  recording: await serializeCacheRecording(recording)
1512
1512
  }, {
@@ -1940,7 +1940,7 @@ async function traceSpanInternal(info, fn) {
1940
1940
  operationName: info.name,
1941
1941
  spanName: info.name,
1942
1942
  spanKind: info.kind,
1943
- storedAt: (/* @__PURE__ */ new Date()).toISOString(),
1943
+ storedAt: new Date(getRealDateNowMs()).toISOString(),
1944
1944
  codeFingerprint: ctx.codeFingerprint,
1945
1945
  recording: await serializeCacheRecording(recording)
1946
1946
  };
@@ -2541,6 +2541,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
2541
2541
  const evalStatsConfigSchema = z.array(evalStatItemSchema);
2542
2542
  /** Schema summarizing a discovered eval for list and overview screens. */
2543
2543
  const evalSummarySchema = z.object({
2544
+ /**
2545
+ * Stable eval identity derived from the workspace-relative file path and
2546
+ * authored eval id. Older clients should display `id`; callers that need an
2547
+ * exact eval must use `key`.
2548
+ */
2549
+ key: z.string().default(""),
2544
2550
  id: z.string(),
2545
2551
  title: z.string().optional(),
2546
2552
  /** Eval file path relative to the active workspace root. */
@@ -2580,6 +2586,16 @@ const evalSummarySchema = z.object({
2580
2586
  });
2581
2587
  /** Schema for one case row in an eval run result table. */
2582
2588
  const caseRowSchema = z.object({
2589
+ /**
2590
+ * Stable eval identity for this case row. Legacy rows may omit it and fall
2591
+ * back to `evalId`.
2592
+ */
2593
+ evalKey: z.string().optional(),
2594
+ /**
2595
+ * Stable case identity derived from file path, eval id, and case id. Legacy
2596
+ * rows may omit it and fall back to `caseId`.
2597
+ */
2598
+ caseKey: z.string().optional(),
2583
2599
  caseId: z.string(),
2584
2600
  evalId: z.string(),
2585
2601
  status: z.enum([
@@ -2657,6 +2673,10 @@ const scoreTraceSchema = z.object({
2657
2673
  });
2658
2674
  /** Schema for the detailed payload shown when opening a specific case. */
2659
2675
  const caseDetailSchema = z.object({
2676
+ /** Stable eval identity for this case detail. */
2677
+ evalKey: z.string().optional(),
2678
+ /** Stable case identity for this case detail. */
2679
+ caseKey: z.string().optional(),
2660
2680
  caseId: z.string(),
2661
2681
  evalId: z.string(),
2662
2682
  status: z.enum([
@@ -2694,6 +2714,36 @@ const caseDetailSchema = z.object({
2694
2714
  */
2695
2715
  cacheRefs: z.array(traceCacheRefSchema).default([])
2696
2716
  });
2717
+ /** Schema for discovery problems that should be shown before running evals. */
2718
+ const discoveryIssueSchema = z.object({
2719
+ type: z.enum(["duplicate-eval-id"]),
2720
+ severity: z.enum(["error"]),
2721
+ filePath: z.string(),
2722
+ evalId: z.string(),
2723
+ message: z.string()
2724
+ });
2725
+ //#endregion
2726
+ //#region ../shared/src/evalIdentity.ts
2727
+ /** Build the stable identity for one eval inside a workspace. */
2728
+ function buildEvalKey(params) {
2729
+ return `${encodeURIComponent(params.filePath)}#${encodeURIComponent(params.evalId)}`;
2730
+ }
2731
+ /** Build the stable identity for one eval case inside a workspace. */
2732
+ function buildCaseKey(params) {
2733
+ return [
2734
+ encodeURIComponent(params.filePath),
2735
+ encodeURIComponent(params.evalId),
2736
+ encodeURIComponent(params.caseId)
2737
+ ].join("#");
2738
+ }
2739
+ /** Return the collision-safe eval key stored on a row, falling back for legacy data. */
2740
+ function getCaseRowEvalKey(row) {
2741
+ return row.evalKey ?? row.evalId;
2742
+ }
2743
+ /** Return the collision-safe case key stored on a row, falling back for legacy data. */
2744
+ function getCaseRowCaseKey(row) {
2745
+ return row.caseKey ?? row.caseId;
2746
+ }
2697
2747
  //#endregion
2698
2748
  //#region ../shared/src/schemas/config.ts
2699
2749
  /** Strategy used to collapse repeated trials into one stored case result. */
@@ -2727,13 +2777,16 @@ const apiCallMetricFormatSchema = llmCallMetricFormatSchema;
2727
2777
  const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
2728
2778
  /** Where an API-call metric is rendered inside the API calls tab. */
2729
2779
  const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
2780
+ const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
2730
2781
  /**
2731
2782
  * Schema for a single user-defined metric attached to LLM call rows.
2732
2783
  *
2733
2784
  * Each metric reads `path` from the span's `attributes` and renders the value
2734
- * with the configured `format` and `numberFormat`. `placements` controls
2735
- * whether the metric appears as a chip on the collapsed row header, as a row
2736
- * inside the expanded body, or both. Defaults to `['body']` when omitted.
2785
+ * with the configured `format` and `numberFormat`. Use
2786
+ * `llmCalls.derivedAttributes` when a metric should read a value computed from
2787
+ * other attributes. `placements` controls whether the metric appears as a chip
2788
+ * on the collapsed row header, as a row inside the expanded body, or both.
2789
+ * Defaults to `['body']` when omitted.
2737
2790
  */
2738
2791
  const llmCallMetricSchema = z.object({
2739
2792
  /** Display label for the metric row or header chip. */
@@ -2760,9 +2813,11 @@ const llmCallMetricSchema = z.object({
2760
2813
  * Schema for a single user-defined metric attached to API call rows.
2761
2814
  *
2762
2815
  * Each metric reads `path` from the span's `attributes` and renders the value
2763
- * with the configured `format` and `numberFormat`. `placements` controls
2764
- * whether the metric appears as a chip on the collapsed row header, as a row
2765
- * inside the expanded body, or both. Defaults to `['body']` when omitted.
2816
+ * with the configured `format` and `numberFormat`. Use
2817
+ * `apiCalls.derivedAttributes` when a metric should read a value computed from
2818
+ * other attributes. `placements` controls whether the metric appears as a chip
2819
+ * on the collapsed row header, as a row inside the expanded body, or both.
2820
+ * Defaults to `['body']` when omitted.
2766
2821
  */
2767
2822
  const apiCallMetricSchema = z.object({
2768
2823
  /** Display label for the metric row or header chip. */
@@ -2839,6 +2894,13 @@ const llmCallsConfigSchema = z.object({
2839
2894
  toolCalls: z.string().optional()
2840
2895
  }).optional(),
2841
2896
  /**
2897
+ * Derived attributes persisted onto every matching LLM span before
2898
+ * `deriveFromTracing`, default outputs, trace display, and call metrics read
2899
+ * the trace. Keys are dot-paths under `span.attributes`; return `undefined`
2900
+ * to skip writing the attribute for one span.
2901
+ */
2902
+ derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
2903
+ /**
2842
2904
  * Model/provider pricing registry used to calculate LLM-call costs from
2843
2905
  * token counts. Built-in LLM cost fields are only derived from this registry.
2844
2906
  */
@@ -2867,6 +2929,13 @@ const apiCallsConfigSchema = z.object({
2867
2929
  durationMs: z.string().optional(),
2868
2930
  error: z.string().optional()
2869
2931
  }).optional(),
2932
+ /**
2933
+ * Derived attributes persisted onto every matching API span before trace
2934
+ * display and call metrics read the trace. Keys are dot-paths under
2935
+ * `span.attributes`; return `undefined` to skip writing the attribute for
2936
+ * one span.
2937
+ */
2938
+ derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
2870
2939
  /** Custom user-defined metrics surfaced on each API call. */
2871
2940
  metrics: z.array(apiCallMetricSchema).optional()
2872
2941
  });
@@ -2898,6 +2967,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2898
2967
  reasoning: "reasoning",
2899
2968
  toolCalls: "toolCalls"
2900
2969
  },
2970
+ derivedAttributes: [],
2901
2971
  metrics: [],
2902
2972
  pricing: []
2903
2973
  };
@@ -2921,8 +2991,35 @@ const DEFAULT_API_CALLS_CONFIG = {
2921
2991
  durationMs: "durationMs",
2922
2992
  error: "error"
2923
2993
  },
2994
+ derivedAttributes: [],
2924
2995
  metrics: []
2925
2996
  };
2997
+ function resolveDerivedAttributes(input) {
2998
+ return Object.entries(input ?? {}).map(([path, compute]) => ({
2999
+ path,
3000
+ compute
3001
+ }));
3002
+ }
3003
+ function resolveLlmCallMetric(metric) {
3004
+ return {
3005
+ label: metric.label,
3006
+ tooltip: metric.tooltip,
3007
+ path: metric.path,
3008
+ format: metric.format ?? "string",
3009
+ numberFormat: metric.numberFormat,
3010
+ placements: metric.placements ? [...metric.placements] : ["body"]
3011
+ };
3012
+ }
3013
+ function resolveApiCallMetric(metric) {
3014
+ return {
3015
+ label: metric.label,
3016
+ tooltip: metric.tooltip,
3017
+ path: metric.path,
3018
+ format: metric.format ?? "string",
3019
+ numberFormat: metric.numberFormat,
3020
+ placements: metric.placements ? [...metric.placements] : ["body"]
3021
+ };
3022
+ }
2926
3023
  /**
2927
3024
  * Resolve the user-authored LLM-calls config to a fully-defaulted shape used
2928
3025
  * by the UI to derive the LLM calls tab.
@@ -2942,14 +3039,8 @@ function resolveLlmCallsConfig(input) {
2942
3039
  ...DEFAULT_LLM_CALLS_CONFIG.attributes,
2943
3040
  ...input?.attributes
2944
3041
  },
2945
- metrics: (input?.metrics ?? []).map((m) => ({
2946
- label: m.label,
2947
- tooltip: m.tooltip,
2948
- path: m.path,
2949
- format: m.format ?? "string",
2950
- numberFormat: m.numberFormat,
2951
- placements: m.placements ? [...m.placements] : ["body"]
2952
- })),
3042
+ derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
3043
+ metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
2953
3044
  pricing: (input?.pricing ?? []).map((p) => ({
2954
3045
  model: p.model,
2955
3046
  provider: p.provider,
@@ -2979,14 +3070,8 @@ function resolveApiCallsConfig(input) {
2979
3070
  ...DEFAULT_API_CALLS_CONFIG.attributes,
2980
3071
  ...input?.attributes
2981
3072
  },
2982
- metrics: (input?.metrics ?? []).map((m) => ({
2983
- label: m.label,
2984
- tooltip: m.tooltip,
2985
- path: m.path,
2986
- format: m.format ?? "string",
2987
- numberFormat: m.numberFormat,
2988
- placements: m.placements ? [...m.placements] : ["body"]
2989
- }))
3073
+ derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
3074
+ metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
2990
3075
  };
2991
3076
  }
2992
3077
  /** Zod schema for validating `agent-evals.config.ts` input. */
@@ -3037,8 +3122,8 @@ const runManifestSchema = z.object({
3037
3122
  */
3038
3123
  commitSha: z.string().nullable().optional().default(null),
3039
3124
  /**
3040
- * Eval-file fingerprints captured for this run, keyed by eval id. Older
3041
- * persisted runs may not include this field.
3125
+ * Eval-file fingerprints captured for this run, keyed by exact eval key.
3126
+ * Older persisted runs may use authored eval ids or omit this field.
3042
3127
  */
3043
3128
  evalSourceFingerprints: z.record(z.string(), z.string()).optional().default({}),
3044
3129
  target: z.object({
@@ -3047,6 +3132,10 @@ const runManifestSchema = z.object({
3047
3132
  "evalIds",
3048
3133
  "caseIds"
3049
3134
  ]),
3135
+ /** Exact stable eval identities (`filePath + evalId`) selected by UI/API callers. */
3136
+ evalKeys: z.array(z.string()).optional(),
3137
+ /** Workspace-relative file paths or glob patterns used to filter selected evals. */
3138
+ files: z.array(z.string()).optional(),
3050
3139
  evalIds: z.array(z.string()).optional(),
3051
3140
  caseIds: z.array(z.string()).optional()
3052
3141
  }),
@@ -3206,7 +3295,7 @@ function getEvalTitle(evalLike) {
3206
3295
  }
3207
3296
  //#endregion
3208
3297
  //#region ../shared/src/utils/getNestedAttribute.ts
3209
- function isRecord$3(value) {
3298
+ function isRecord$4(value) {
3210
3299
  return typeof value === "object" && value !== null;
3211
3300
  }
3212
3301
  /**
@@ -3221,12 +3310,84 @@ function getNestedAttribute(value, path) {
3221
3310
  const parts = path.split(".");
3222
3311
  let current = value;
3223
3312
  for (const part of parts) {
3224
- if (!isRecord$3(current) || !(part in current)) return;
3313
+ if (!isRecord$4(current) || !(part in current)) return;
3225
3314
  current = current[part];
3226
3315
  }
3227
3316
  return current;
3228
3317
  }
3229
3318
  //#endregion
3319
+ //#region ../shared/src/utils/deriveCallAttributes.ts
3320
+ function isRecord$3(value) {
3321
+ return typeof value === "object" && value !== null;
3322
+ }
3323
+ function mergeNestedAttribute$1(value, path, attributeValue) {
3324
+ const root = value === void 0 ? {} : { ...value };
3325
+ const parts = path.split(".");
3326
+ let current = root;
3327
+ for (const [index, part] of parts.entries()) {
3328
+ if (index === parts.length - 1) {
3329
+ current[part] = attributeValue;
3330
+ continue;
3331
+ }
3332
+ const nextValue = current[part];
3333
+ const nextRecord = isRecord$3(nextValue) ? { ...nextValue } : {};
3334
+ current[part] = nextRecord;
3335
+ current = nextRecord;
3336
+ }
3337
+ return root;
3338
+ }
3339
+ function applyDerivedAttributesForKind(params) {
3340
+ let attributes = params.span.attributes;
3341
+ for (const derivedAttribute of params.derivedAttributes) {
3342
+ if (derivedAttribute.compute === void 0) continue;
3343
+ const span = {
3344
+ ...params.span,
3345
+ attributes
3346
+ };
3347
+ const value = (() => {
3348
+ try {
3349
+ return derivedAttribute.compute({
3350
+ attributes,
3351
+ span,
3352
+ get: (path) => getNestedAttribute(attributes, path)
3353
+ });
3354
+ } catch {
3355
+ return;
3356
+ }
3357
+ })();
3358
+ if (value === void 0) continue;
3359
+ attributes = mergeNestedAttribute$1(attributes, derivedAttribute.path, value);
3360
+ }
3361
+ if (attributes === params.span.attributes) return params.span;
3362
+ return {
3363
+ ...params.span,
3364
+ attributes
3365
+ };
3366
+ }
3367
+ /**
3368
+ * Persist configured derived attributes onto matching LLM/API spans.
3369
+ *
3370
+ * These derived attributes are applied before trace consumers run, so
3371
+ * `deriveFromTracing`, default usage extraction, trace display, and call
3372
+ * metrics can all read them by normal dot-path lookup.
3373
+ */
3374
+ function applyDerivedCallAttributes(params) {
3375
+ const llmKinds = new Set(params.llmCallsConfig.kinds);
3376
+ const apiKinds = new Set(params.apiCallsConfig.kinds);
3377
+ return params.spans.map((span) => {
3378
+ let nextSpan = span;
3379
+ if (llmKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
3380
+ span: nextSpan,
3381
+ derivedAttributes: params.llmCallsConfig.derivedAttributes
3382
+ });
3383
+ if (apiKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
3384
+ span: nextSpan,
3385
+ derivedAttributes: params.apiCallsConfig.derivedAttributes
3386
+ });
3387
+ return nextSpan;
3388
+ });
3389
+ }
3390
+ //#endregion
3230
3391
  //#region ../shared/src/utils/extractLlmCalls.ts
3231
3392
  function readNumber$2(attributes, path) {
3232
3393
  const raw = getNestedAttribute(attributes, path);
@@ -3701,6 +3862,10 @@ const createRunRequestSchema = z.object({
3701
3862
  "evalIds",
3702
3863
  "caseIds"
3703
3864
  ]),
3865
+ /** Exact stable eval identities (`filePath + evalId`) selected by UI/API callers. */
3866
+ evalKeys: z.array(z.string()).optional(),
3867
+ /** Workspace-relative file paths or glob patterns used to filter selected evals. */
3868
+ files: z.array(z.string()).optional(),
3704
3869
  evalIds: z.array(z.string()).optional(),
3705
3870
  caseIds: z.array(z.string()).optional()
3706
3871
  }),
@@ -4671,7 +4836,8 @@ function addDefaultOutputs(params) {
4671
4836
  //#region ../runner/src/discovery.ts
4672
4837
  const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
4673
4838
  const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
4674
- function parseEvalMetas(filePath, content) {
4839
+ /** Parse static eval metadata and discovery issues from one eval file. */
4840
+ function parseEvalDiscovery(filePath, content) {
4675
4841
  const metas = [];
4676
4842
  let searchIndex = 0;
4677
4843
  while (searchIndex < content.length) {
@@ -4694,7 +4860,20 @@ function parseEvalMetas(filePath, content) {
4694
4860
  }
4695
4861
  searchIndex = extracted.nextIndex;
4696
4862
  }
4697
- return metas;
4863
+ const countsById = /* @__PURE__ */ new Map();
4864
+ for (const meta of metas) countsById.set(meta.id, (countsById.get(meta.id) ?? 0) + 1);
4865
+ const duplicateIds = new Set([...countsById].filter(([, count]) => count > 1).map(([id]) => id));
4866
+ const issues = [...duplicateIds].map((evalId) => ({
4867
+ type: "duplicate-eval-id",
4868
+ severity: "error",
4869
+ filePath,
4870
+ evalId,
4871
+ message: `Duplicate eval id "${evalId}" in ${filePath}. Eval ids must be unique within one file.`
4872
+ }));
4873
+ return {
4874
+ metas: metas.filter((meta) => !duplicateIds.has(meta.id)),
4875
+ issues
4876
+ };
4698
4877
  }
4699
4878
  function extractDefineEvalObject(content, defineEvalIndex) {
4700
4879
  const openParenIndex = content.indexOf("(", defineEvalIndex);
@@ -4813,40 +4992,6 @@ function getRunFreshnessTimestamp(manifest) {
4813
4992
  return manifest.endedAt ?? manifest.startedAt;
4814
4993
  }
4815
4994
  //#endregion
4816
- //#region ../runner/src/evalSummaries.ts
4817
- /** Build the API/UI summary payload for one discovered eval. */
4818
- function buildEvalSummary(params) {
4819
- const { meta, config, gitState, latestRun, lastRunStatus } = params;
4820
- const { sourceFingerprint, ...summaryMeta } = meta;
4821
- const freshness = deriveEvalFreshness({
4822
- latestRun,
4823
- gitState,
4824
- currentEvalSourceFingerprint: sourceFingerprint,
4825
- staleAfterDays: config.staleAfterDays ?? 14
4826
- });
4827
- return {
4828
- ...summaryMeta,
4829
- stale: freshness.stale,
4830
- outdated: freshness.outdated,
4831
- freshnessStatus: freshness.freshnessStatus,
4832
- latestRunAt: latestRun?.startedAt ?? null,
4833
- latestRunCommitSha: latestRun?.commitSha ?? null,
4834
- currentCommitSha: gitState.commitSha,
4835
- lastRunStatus
4836
- };
4837
- }
4838
- /** Resolve which eval ids a run request should mark as the latest run. */
4839
- function getTargetEvalIds(params) {
4840
- const { request, sortedEvalIds, knownEvalIds } = params;
4841
- if (request.target.evalIds && request.target.evalIds.length > 0) return request.target.evalIds.filter((evalId) => knownEvalIds.has(evalId));
4842
- return sortedEvalIds;
4843
- }
4844
- /** Write one latest-run snapshot to each targeted eval id. */
4845
- function setLatestRunInfoMap(params) {
4846
- const { latestRunInfoMap, evalIds, info } = params;
4847
- for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
4848
- }
4849
- //#endregion
4850
4995
  //#region ../runner/src/outputArtifacts.ts
4851
4996
  const mimeTypeExtensionMap = {
4852
4997
  "application/json": ".json",
@@ -4957,9 +5102,9 @@ function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
4957
5102
  return caseRow.status === "error" ? "error" : "pass";
4958
5103
  }
4959
5104
  function runTouchesEval(params) {
4960
- if (params.caseRows.some((caseRow) => caseRow.evalId === params.evalId)) return true;
5105
+ if (params.caseRows.some((caseRow) => getCaseRowEvalKey(caseRow) === params.evalKey || caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) return true;
4961
5106
  if (params.target.mode === "all") return params.evalExists;
4962
- if (params.target.mode === "evalIds") return params.target.evalIds?.includes(params.evalId) ?? false;
5107
+ if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? params.target.evalIds?.includes(params.evalId ?? params.evalKey) ?? false;
4963
5108
  return false;
4964
5109
  }
4965
5110
  async function recomputeEvalStatusesInRuns(params) {
@@ -4968,14 +5113,15 @@ async function recomputeEvalStatusesInRuns(params) {
4968
5113
  if (!runTouchesEval({
4969
5114
  target: run.manifest.target,
4970
5115
  caseRows: run.cases,
5116
+ evalKey: params.evalKey,
4971
5117
  evalId: params.evalId,
4972
5118
  evalExists: params.evalExists
4973
5119
  })) continue;
4974
5120
  if (run.manifest.status === "running") continue;
4975
5121
  let changed = false;
4976
5122
  for (const caseRow of run.cases) {
4977
- if (caseRow.evalId !== params.evalId) continue;
4978
- const caseDetail = run.caseDetails.get(caseRow.caseId);
5123
+ if (getCaseRowEvalKey(caseRow) !== params.evalKey && !(caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) continue;
5124
+ const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
4979
5125
  const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
4980
5126
  if (caseRow.status === nextStatus) continue;
4981
5127
  caseRow.status = nextStatus;
@@ -5043,8 +5189,8 @@ async function loadPersistedRunSnapshots(localStateDir) {
5043
5189
  }
5044
5190
  return snapshots;
5045
5191
  }
5046
- async function persistCaseDetail(runDir, caseDetail) {
5047
- await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(caseDetail.caseId)}.json`), JSON.stringify(caseDetail, null, 2));
5192
+ async function persistCaseDetail(runDir, caseDetail, fileId = caseDetail.caseId) {
5193
+ await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(fileId)}.json`), JSON.stringify(caseDetail, null, 2));
5048
5194
  }
5049
5195
  function getLastRunStatuses(params) {
5050
5196
  const latestRunInfos = getLatestRunInfos(params);
@@ -5057,14 +5203,15 @@ function getLastRunStatuses(params) {
5057
5203
  function getLatestRunInfos(params) {
5058
5204
  const { runs, knownEvals } = params;
5059
5205
  const knownEvalMetas = [...knownEvals];
5060
- const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
5206
+ const evalIdByKey = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.id]));
5207
+ const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
5061
5208
  const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
5062
5209
  const latestRunInfos = /* @__PURE__ */ new Map();
5063
- for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
5064
- status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
5210
+ for (const run of orderedRuns) for (const evalKey of getRunEvalKeys(run, knownEvalMetas)) latestRunInfos.set(evalKey, {
5211
+ status: getEvalStatusForRun(run, evalKey, evalIdByKey.get(evalKey), manualScoreKeysByEval.get(evalKey) ?? []),
5065
5212
  startedAt: getRunFreshnessTimestamp(run.manifest),
5066
5213
  commitSha: run.manifest.commitSha ?? null,
5067
- evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
5214
+ evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? run.manifest.evalSourceFingerprints[evalIdByKey.get(evalKey) ?? ""] ?? null
5068
5215
  });
5069
5216
  return latestRunInfos;
5070
5217
  }
@@ -5117,18 +5264,25 @@ async function readCaseDetails(runDir) {
5117
5264
  if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
5118
5265
  const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
5119
5266
  if (!detail) continue;
5120
- caseDetails.set(detail.caseId, detail);
5267
+ caseDetails.set(detail.caseKey ?? detail.caseId, detail);
5121
5268
  }
5122
5269
  return caseDetails;
5123
5270
  }
5124
- function getRunEvalIds(run, knownEvalIds) {
5125
- const evalIds = new Set(run.cases.map((caseRow) => caseRow.evalId));
5126
- if (run.manifest.target.mode === "evalIds") for (const evalId of run.manifest.target.evalIds ?? []) evalIds.add(evalId);
5127
- else if (run.manifest.target.mode === "all" && evalIds.size === 0) for (const evalId of knownEvalIds) evalIds.add(evalId);
5128
- return [...evalIds];
5129
- }
5130
- function getEvalStatusForRun(run, evalId, manualScoreKeys) {
5131
- const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
5271
+ function getRunEvalKeys(run, knownEvals) {
5272
+ const knownEvalMetas = [...knownEvals];
5273
+ const evalKeys = new Set(run.cases.map(getCaseRowEvalKey));
5274
+ for (const caseRow of run.cases) {
5275
+ if (caseRow.evalKey !== void 0) continue;
5276
+ for (const evalMeta of knownEvalMetas) if (evalMeta.id === caseRow.evalId) evalKeys.add(evalMeta.key);
5277
+ }
5278
+ if (run.manifest.target.mode === "evalIds") {
5279
+ for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
5280
+ for (const evalId of run.manifest.target.evalIds ?? []) for (const evalMeta of knownEvalMetas) if (evalMeta.id === evalId) evalKeys.add(evalMeta.key);
5281
+ } else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
5282
+ return [...evalKeys];
5283
+ }
5284
+ function getEvalStatusForRun(run, evalKey, evalId, manualScoreKeys) {
5285
+ const evalCases = run.cases.filter((caseRow) => getCaseRowEvalKey(caseRow) === evalKey || caseRow.evalKey === void 0 && caseRow.evalId === evalId);
5132
5286
  if (evalCases.length > 0) {
5133
5287
  if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
5134
5288
  return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
@@ -5299,8 +5453,7 @@ function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
5299
5453
  }
5300
5454
  //#endregion
5301
5455
  //#region ../runner/src/runExecution.ts
5302
- function filterEvalCases(cases, evalIds, caseIds, evalId) {
5303
- if (evalIds && evalIds.length > 0 && !evalIds.includes(evalId)) return [];
5456
+ function filterEvalCases(cases, caseIds) {
5304
5457
  if (!caseIds || caseIds.length === 0) return cases;
5305
5458
  const selectedCaseIds = new Set(caseIds);
5306
5459
  return cases.filter((evalCase) => selectedCaseIds.has(evalCase.id));
@@ -5329,13 +5482,18 @@ async function callWithUnknownResult(fn, args) {
5329
5482
  return await Reflect.apply(fn, void 0, args);
5330
5483
  }
5331
5484
  async function runCase(params) {
5332
- const { evalDef, evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
5485
+ const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
5333
5486
  const scopedIdPrefix = buildScopedEvalIdPrefix({
5334
5487
  evalId,
5335
5488
  evalFilePath,
5336
5489
  caseId: evalCase.id,
5337
5490
  workspaceRoot
5338
5491
  });
5492
+ const caseKey = buildCaseKey({
5493
+ filePath: evalFileRelativePath,
5494
+ evalId,
5495
+ caseId: evalCase.id
5496
+ });
5339
5497
  const { scope, error: executeError } = await runInEvalScope(evalCase.id, async () => {
5340
5498
  const execute = async () => {
5341
5499
  await Reflect.apply(evalDef.execute, evalDef, [{
@@ -5361,7 +5519,12 @@ async function runCase(params) {
5361
5519
  startTime: evalDef.startTime,
5362
5520
  freezeTime: evalDef.freezeTime
5363
5521
  });
5364
- const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
5522
+ const spansWithDerivedAttributes = applyDerivedCallAttributes({
5523
+ spans: scope.spans,
5524
+ llmCallsConfig,
5525
+ apiCallsConfig
5526
+ });
5527
+ const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
5365
5528
  const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
5366
5529
  if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
5367
5530
  if (!nonAssertError && evalDef.deriveFromTracing) {
@@ -5383,7 +5546,7 @@ async function runCase(params) {
5383
5546
  }
5384
5547
  if (!nonAssertError) addDefaultOutputs({
5385
5548
  outputs: scope.outputs,
5386
- spans: scope.spans,
5549
+ spans: spansWithDerivedAttributes,
5387
5550
  llmCallsConfig,
5388
5551
  apiCallsConfig,
5389
5552
  globalRemove: globalRemoveDefaultConfig,
@@ -5471,7 +5634,7 @@ async function runCase(params) {
5471
5634
  }
5472
5635
  }
5473
5636
  const status = nonAssertError ? "error" : passed ? "pass" : "fail";
5474
- const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
5637
+ const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
5475
5638
  const columns = {};
5476
5639
  const columnOverrides = mergeDefaultColumns({
5477
5640
  columns: evalDef.columns,
@@ -5496,6 +5659,8 @@ async function runCase(params) {
5496
5659
  stack: nonAssertError.stack
5497
5660
  } : null;
5498
5661
  const caseDetail = {
5662
+ evalKey,
5663
+ caseKey,
5499
5664
  caseId: evalCase.id,
5500
5665
  evalId,
5501
5666
  status,
@@ -5581,6 +5746,56 @@ async function executeQueuedCase(params) {
5581
5746
  await queuedCase.onComplete(result);
5582
5747
  }
5583
5748
  //#endregion
5749
+ //#region ../runner/src/targeting.ts
5750
+ function escapeRegex(value) {
5751
+ return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
5752
+ }
5753
+ function globToRegex(pattern) {
5754
+ const normalized = pattern.replaceAll("\\", "/");
5755
+ let regex = "^";
5756
+ for (let i = 0; i < normalized.length; i++) {
5757
+ const char = normalized[i];
5758
+ const next = normalized[i + 1];
5759
+ if (char === "*" && next === "*") {
5760
+ regex += ".*";
5761
+ i++;
5762
+ } else if (char === "*") regex += "[^/]*";
5763
+ else if (char === "?") regex += "[^/]";
5764
+ else regex += escapeRegex(char ?? "");
5765
+ }
5766
+ regex += "$";
5767
+ return new RegExp(regex);
5768
+ }
5769
+ function fileMatches(pattern, filePath) {
5770
+ const normalizedPattern = pattern.replaceAll("\\", "/");
5771
+ if (normalizedPattern === filePath) return true;
5772
+ return globToRegex(normalizedPattern).test(filePath);
5773
+ }
5774
+ function matchesFiles(evalMeta, files) {
5775
+ if (files === void 0 || files.length === 0) return true;
5776
+ return files.some((file) => fileMatches(file, evalMeta.filePath));
5777
+ }
5778
+ function matchesEvalIds(evalMeta, evalIds) {
5779
+ if (evalIds === void 0 || evalIds.length === 0) return true;
5780
+ return evalIds.includes(evalMeta.id);
5781
+ }
5782
+ function matchesEvalKeys(evalMeta, evalKeys) {
5783
+ if (evalKeys === void 0 || evalKeys.length === 0) return true;
5784
+ return evalKeys.includes(evalMeta.key);
5785
+ }
5786
+ /** Return the discovered evals selected by a run target. */
5787
+ function getTargetEvals(params) {
5788
+ const { target } = params.request;
5789
+ return [...params.evals].filter((evalMeta) => matchesEvalKeys(evalMeta, target.evalKeys)).filter((evalMeta) => matchesEvalIds(evalMeta, target.evalIds)).filter((evalMeta) => matchesFiles(evalMeta, target.files)).toSorted((a, b) => a.filePath.localeCompare(b.filePath));
5790
+ }
5791
+ /** Resolve which exact eval keys a run request can affect. */
5792
+ function getTargetEvalKeys(params) {
5793
+ return getTargetEvals({
5794
+ evals: params.sortedEvals,
5795
+ request: params.request
5796
+ }).map((evalMeta) => evalMeta.key);
5797
+ }
5798
+ //#endregion
5584
5799
  //#region ../runner/src/runOrchestration.ts
5585
5800
  /**
5586
5801
  * Ranks case statuses from worst to best. Used to order trial attempts so the
@@ -5631,6 +5846,20 @@ function formatUnknownErrorDetails(error) {
5631
5846
  if (typeof error === "string") return error;
5632
5847
  return String(error);
5633
5848
  }
5849
+ function findDuplicateCaseIds(cases) {
5850
+ const counts = /* @__PURE__ */ new Map();
5851
+ for (const evalCase of cases) counts.set(evalCase.id, (counts.get(evalCase.id) ?? 0) + 1);
5852
+ return [...counts].filter(([, count]) => count > 1).map(([caseId]) => caseId).toSorted();
5853
+ }
5854
+ function findAmbiguousTargetCaseIds(preparedEvals) {
5855
+ const ownersByCaseId = /* @__PURE__ */ new Map();
5856
+ for (const preparedEval of preparedEvals) for (const preparedCase of preparedEval.preparedCases) {
5857
+ const owners = ownersByCaseId.get(preparedCase.caseId) ?? /* @__PURE__ */ new Set();
5858
+ owners.add(`${preparedEval.evalMeta.filePath}#${preparedEval.evalMeta.id}`);
5859
+ ownersByCaseId.set(preparedCase.caseId, owners);
5860
+ }
5861
+ return [...ownersByCaseId].filter(([, owners]) => owners.size > 1).map(([caseId, owners]) => `${caseId} (${[...owners].join(", ")})`);
5862
+ }
5634
5863
  function buildRunErrorMessage(errors) {
5635
5864
  return errors.map((entry) => {
5636
5865
  const [firstLine, ...detailLines] = entry.details.split("\n");
@@ -5650,14 +5879,15 @@ async function finalizePreparedCase(params) {
5650
5879
  scoreKeys: preparedEval.scoreKeys
5651
5880
  });
5652
5881
  if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
5882
+ const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
5653
5883
  runState.cases.push(winningTrial.caseRow);
5654
- runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
5884
+ runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
5655
5885
  preparedEval.mergeColumns(winningTrial.caseDetail.columns);
5656
5886
  if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
5657
5887
  else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
5658
5888
  else runState.summary.failedCases++;
5659
- await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
5660
- await persistCaseDetail(runDir, winningTrial.caseDetail);
5889
+ await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
5890
+ await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
5661
5891
  onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
5662
5892
  emitEvent(runState, {
5663
5893
  type: "case.finished",
@@ -5668,20 +5898,24 @@ async function finalizePreparedCase(params) {
5668
5898
  preparedEval.evalCaseRows.push(winningTrial.caseRow);
5669
5899
  }
5670
5900
  function getPreparedCaseOrderKey(caseRow) {
5671
- return `${caseRow.evalId}\u0000${caseRow.caseId}`;
5901
+ return `${caseRow.evalKey ?? caseRow.evalId}\u0000${caseRow.caseId}`;
5902
+ }
5903
+ function getCaseArtifactFileId(runState, caseRow) {
5904
+ const caseKey = getCaseRowCaseKey(caseRow);
5905
+ return runState.cases.some((existing) => existing.caseId === caseRow.caseId && getCaseRowCaseKey(existing) !== caseKey) ? caseKey : caseRow.caseId;
5672
5906
  }
5673
5907
  function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
5674
5908
  const orderByCase = /* @__PURE__ */ new Map();
5675
5909
  let order = 0;
5676
5910
  for (const preparedEval of preparedEvals) for (const preparedCase of preparedEval.preparedCases) {
5677
- orderByCase.set(`${preparedEval.evalMeta.id}\u0000${preparedCase.caseId}`, order);
5911
+ orderByCase.set(`${preparedEval.evalMeta.key}\u0000${preparedCase.caseId}`, order);
5678
5912
  order++;
5679
5913
  }
5680
5914
  caseRows.sort((left, right) => {
5681
5915
  return (orderByCase.get(getPreparedCaseOrderKey(left)) ?? Number.MAX_SAFE_INTEGER) - (orderByCase.get(getPreparedCaseOrderKey(right)) ?? Number.MAX_SAFE_INTEGER);
5682
5916
  });
5683
5917
  }
5684
- async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
5918
+ async function executeRun({ runState, request, runDir, config, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
5685
5919
  try {
5686
5920
  const targetEvals = getTargetEvals(request);
5687
5921
  emitEvent(runState, {
@@ -5710,10 +5944,10 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5710
5944
  codeFingerprint = "";
5711
5945
  }
5712
5946
  if (codeFingerprint.length > 0) {
5713
- runState.manifest.evalSourceFingerprints[evalMeta.id] = codeFingerprint;
5947
+ runState.manifest.evalSourceFingerprints[evalMeta.key] = codeFingerprint;
5714
5948
  evalMeta.sourceFingerprint = codeFingerprint;
5715
5949
  } else {
5716
- delete runState.manifest.evalSourceFingerprints[evalMeta.id];
5950
+ delete runState.manifest.evalSourceFingerprints[evalMeta.key];
5717
5951
  evalMeta.sourceFingerprint = null;
5718
5952
  }
5719
5953
  try {
@@ -5734,10 +5968,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5734
5968
  await runWithModuleIsolation(moduleIsolation, async () => {
5735
5969
  await runInEvalRuntimeScope("cases", async () => {
5736
5970
  await entry.use(async (evalDef) => {
5737
- const cases = filterEvalCases(resolveRunnableEvalCases({
5971
+ const runnableCases = resolveRunnableEvalCases({
5738
5972
  cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
5739
5973
  evalId: evalMeta.id
5740
- }), request.target.evalIds, request.target.caseIds, evalMeta.id);
5974
+ });
5975
+ const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
5976
+ if (duplicateCaseIds.length > 0) throw new Error(`Duplicate case id${duplicateCaseIds.length === 1 ? "" : "s"} in ${evalMeta.filePath}#${evalMeta.id}: ${duplicateCaseIds.join(", ")}`);
5977
+ const cases = filterEvalCases(runnableCases, request.target.caseIds);
5741
5978
  runState.summary.totalCases += cases.length;
5742
5979
  const defaultConfig = resolveEvalDefaultConfig({
5743
5980
  evalDef,
@@ -5783,6 +6020,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5783
6020
  const { caseDetail, caseRowUpdate } = await runCase({
5784
6021
  evalDef,
5785
6022
  evalId: evalMeta.id,
6023
+ evalKey: evalMeta.key,
5786
6024
  evalCase,
5787
6025
  globalTraceDisplay,
5788
6026
  llmCallsConfig,
@@ -5795,6 +6033,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5795
6033
  codeFingerprint,
5796
6034
  moduleIsolation,
5797
6035
  evalFilePath,
6036
+ evalFileRelativePath: evalMeta.filePath,
5798
6037
  workspaceRoot,
5799
6038
  artifactDir: join(runDir, "artifacts"),
5800
6039
  runId: runState.manifest.id
@@ -5804,6 +6043,8 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5804
6043
  caseRow: {
5805
6044
  caseId: evalCase.id,
5806
6045
  evalId: evalMeta.id,
6046
+ evalKey: evalMeta.key,
6047
+ caseKey: caseDetail.caseKey,
5807
6048
  status: caseRowUpdate.status ?? "pending",
5808
6049
  durationMs: caseRowUpdate.durationMs ?? null,
5809
6050
  columns: caseRowUpdate.columns ?? {},
@@ -5839,16 +6080,23 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5839
6080
  evalId: evalMeta.id,
5840
6081
  details: formatUnknownErrorDetails(error)
5841
6082
  });
5842
- lastRunStatusMap.set(evalMeta.id, "error");
5843
- latestRunInfoMap.set(evalMeta.id, {
6083
+ lastRunStatusMap.set(evalMeta.key, "error");
6084
+ latestRunInfoMap.set(evalMeta.key, {
5844
6085
  status: "error",
5845
6086
  startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
5846
6087
  commitSha: runState.manifest.commitSha ?? null,
5847
- evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.id] ?? null
6088
+ evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.key] ?? null
5848
6089
  });
5849
6090
  }
5850
6091
  }
5851
- await executeQueuedCases({
6092
+ const ambiguousCaseTargets = request.target.caseIds && request.target.caseIds.length > 0 ? findAmbiguousTargetCaseIds(preparedEvals) : [];
6093
+ if (ambiguousCaseTargets.length > 0) {
6094
+ queuedCases.length = 0;
6095
+ evalErrors.push({
6096
+ evalId: "target",
6097
+ details: `Ambiguous --case target. Narrow it with --file and/or --eval: ${ambiguousCaseTargets.join("; ")}`
6098
+ });
6099
+ } else await executeQueuedCases({
5852
6100
  queuedCases,
5853
6101
  concurrency: getConfiguredConcurrency(),
5854
6102
  globalTraceDisplay: config.traceDisplay
@@ -5863,13 +6111,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5863
6111
  emitEvent
5864
6112
  });
5865
6113
  preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
5866
- lastRunStatusMap.set(preparedEval.evalMeta.id, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
5867
- const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.id) ?? null;
5868
- latestRunInfoMap.set(preparedEval.evalMeta.id, {
6114
+ lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
6115
+ const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
6116
+ latestRunInfoMap.set(preparedEval.evalMeta.key, {
5869
6117
  status: latestStatus,
5870
6118
  startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
5871
6119
  commitSha: runState.manifest.commitSha ?? null,
5872
- evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.id] ?? null
6120
+ evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.key] ?? null
5873
6121
  });
5874
6122
  }
5875
6123
  sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
@@ -5882,20 +6130,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5882
6130
  const completedRunAt = endTime.toISOString();
5883
6131
  runState.manifest.endedAt = completedRunAt;
5884
6132
  runState.summary.errorMessage = evalErrors.length > 0 ? buildRunErrorMessage(evalErrors) : null;
5885
- for (const evalId of getTargetEvalIds({
6133
+ for (const evalKey of getTargetEvalKeys({
5886
6134
  request,
5887
- sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
5888
- knownEvalIds: new Set(evals.keys())
6135
+ sortedEvals: getSortedEvalMetas()
5889
6136
  })) {
5890
- const latestStatus = lastRunStatusMap.get(evalId) ?? toLastRunStatus(deriveStatusFromCaseRows({
6137
+ const latestStatus = lastRunStatusMap.get(evalKey) ?? toLastRunStatus(deriveStatusFromCaseRows({
5891
6138
  caseRows: [],
5892
6139
  lifecycleStatus: runState.manifest.status
5893
6140
  }));
5894
- latestRunInfoMap.set(evalId, {
6141
+ latestRunInfoMap.set(evalKey, {
5895
6142
  status: latestStatus,
5896
6143
  startedAt: completedRunAt,
5897
6144
  commitSha: runState.manifest.commitSha ?? null,
5898
- evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalId] ?? null
6145
+ evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalKey] ?? null
5899
6146
  });
5900
6147
  }
5901
6148
  await persistRunState(runState);
@@ -5938,4 +6185,4 @@ function toLastRunStatus(status) {
5938
6185
  return status === "pending" ? null : status;
5939
6186
  }
5940
6187
  //#endregion
5941
- export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, runInEvalScope as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, getEvalCaseInput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, mergeEvalOutput as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, isInEvalScope as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, repoFile as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, defineEval as In, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, getEvalRegistry as Ln, serializedCacheSpanSchema as Lt, getNestedAttribute as M, setEvalOutput as Mn, cacheModeSchema as Mt, getEvalTitle as N, setScopeCacheContext as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, nextEvalId as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, startEvalBackgroundJob as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getCurrentScope as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, incrementEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, advanceEvalTime as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig as b, evalAssert as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, runInExistingEvalScope as jn, cacheListItemSchema as jt, extractCacheHits as k, runInEvalRuntimeScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, appendToEvalOutput as vn, evalChartAxisSchema as vt, createFsCacheStore as w, getEvalStartTime as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, evalLog as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, configureEvalRunLogs as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
6188
+ export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getEvalStartTime as An, evalChartTypeSchema as At, runSummarySchema as B, startEvalBackgroundJob as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, advanceEvalTime as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, evalLog as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, evalAssert as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, runInEvalRuntimeScope as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, defineEval as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, runInEvalScope as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInExistingEvalScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, isInEvalScope as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, mergeEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, getCurrentScope as On, evalChartMetricSchema as Ot, getEvalTitle as P, nextEvalId as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, setEvalOutput as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, EvalAssertionError as Sn, scoreTraceSchema as St, createRunRequestSchema as T, configureEvalRunLogs as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, getEvalRegistry as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, repoFile as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, incrementEvalOutput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, getEvalCaseInput as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, appendToEvalOutput as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, runManifestSchema as z, setScopeCacheContext as zn, cacheOperationTypeSchema as zt };