@ls-stack/agent-eval 0.23.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Cw79dJDr.mjs → app-DS3j_AyX.mjs} +6 -3
- package/dist/apps/web/dist/assets/index-DNsZjOms.css +1 -0
- package/dist/apps/web/dist/assets/index-DqR1YaMG.js +118 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-D3QNOcPN.mjs → cli-ETfZ15RB.mjs} +151 -42
- package/dist/index.d.mts +111 -12
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +28 -18
- package/dist/{runOrchestration-CimthgI7.mjs → runOrchestration-B31SV_Bq.mjs} +364 -117
- package/dist/{runner-B-SYzW8w.mjs → runner-B2f2TEjp.mjs} +1 -1
- package/dist/{runner-4yNYRvmF.mjs → runner-cj1TkR-H.mjs} +2 -2
- package/dist/src-CyNb2ycA.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +14 -6
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +0 -118
- package/dist/apps/web/dist/assets/index-r0dVFK0B.css +0 -1
- package/dist/src-CcXfWT4M.mjs +0 -3
|
@@ -1330,7 +1330,7 @@ const errorCoreFields = new Set([
|
|
|
1330
1330
|
"stack",
|
|
1331
1331
|
"capturedAt"
|
|
1332
1332
|
]);
|
|
1333
|
-
function isRecord$
|
|
1333
|
+
function isRecord$5(value) {
|
|
1334
1334
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1335
1335
|
}
|
|
1336
1336
|
function formatUnknownErrorMessage(error) {
|
|
@@ -1358,7 +1358,7 @@ function normalizeTraceError(error, capturedAt = void 0) {
|
|
|
1358
1358
|
stack: error.stack,
|
|
1359
1359
|
capturedAt
|
|
1360
1360
|
};
|
|
1361
|
-
if (isRecord$
|
|
1361
|
+
if (isRecord$5(error)) {
|
|
1362
1362
|
const extraFields = getErrorExtraFields(error);
|
|
1363
1363
|
const name = typeof error.name === "string" ? error.name : void 0;
|
|
1364
1364
|
const stack = typeof error.stack === "string" ? error.stack : void 0;
|
|
@@ -1383,7 +1383,7 @@ function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedA
|
|
|
1383
1383
|
return (additionalWarnings.length > 0 ? [warningOrWarnings, ...additionalWarnings] : Array.isArray(warningOrWarnings) ? warningOrWarnings : [warningOrWarnings]).map((warning) => normalizeTraceError(warning, capturedAt));
|
|
1384
1384
|
}
|
|
1385
1385
|
function isCaptureEvalSpanErrorOptions(value) {
|
|
1386
|
-
if (!isRecord$
|
|
1386
|
+
if (!isRecord$5(value)) return false;
|
|
1387
1387
|
const keys = Object.keys(value);
|
|
1388
1388
|
if (keys.length === 0) return false;
|
|
1389
1389
|
if (!keys.every((key) => key === "level")) return false;
|
|
@@ -1506,7 +1506,7 @@ function createTraceCache(generateSpanId) {
|
|
|
1506
1506
|
namespace,
|
|
1507
1507
|
operationType: "value",
|
|
1508
1508
|
operationName: info.name,
|
|
1509
|
-
storedAt:
|
|
1509
|
+
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
1510
1510
|
codeFingerprint: cacheCtx.codeFingerprint,
|
|
1511
1511
|
recording: await serializeCacheRecording(recording)
|
|
1512
1512
|
}, {
|
|
@@ -1940,7 +1940,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
1940
1940
|
operationName: info.name,
|
|
1941
1941
|
spanName: info.name,
|
|
1942
1942
|
spanKind: info.kind,
|
|
1943
|
-
storedAt:
|
|
1943
|
+
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
1944
1944
|
codeFingerprint: ctx.codeFingerprint,
|
|
1945
1945
|
recording: await serializeCacheRecording(recording)
|
|
1946
1946
|
};
|
|
@@ -2541,6 +2541,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2541
2541
|
const evalStatsConfigSchema = z.array(evalStatItemSchema);
|
|
2542
2542
|
/** Schema summarizing a discovered eval for list and overview screens. */
|
|
2543
2543
|
const evalSummarySchema = z.object({
|
|
2544
|
+
/**
|
|
2545
|
+
* Stable eval identity derived from the workspace-relative file path and
|
|
2546
|
+
* authored eval id. Older clients should display `id`; callers that need an
|
|
2547
|
+
* exact eval must use `key`.
|
|
2548
|
+
*/
|
|
2549
|
+
key: z.string().default(""),
|
|
2544
2550
|
id: z.string(),
|
|
2545
2551
|
title: z.string().optional(),
|
|
2546
2552
|
/** Eval file path relative to the active workspace root. */
|
|
@@ -2580,6 +2586,16 @@ const evalSummarySchema = z.object({
|
|
|
2580
2586
|
});
|
|
2581
2587
|
/** Schema for one case row in an eval run result table. */
|
|
2582
2588
|
const caseRowSchema = z.object({
|
|
2589
|
+
/**
|
|
2590
|
+
* Stable eval identity for this case row. Legacy rows may omit it and fall
|
|
2591
|
+
* back to `evalId`.
|
|
2592
|
+
*/
|
|
2593
|
+
evalKey: z.string().optional(),
|
|
2594
|
+
/**
|
|
2595
|
+
* Stable case identity derived from file path, eval id, and case id. Legacy
|
|
2596
|
+
* rows may omit it and fall back to `caseId`.
|
|
2597
|
+
*/
|
|
2598
|
+
caseKey: z.string().optional(),
|
|
2583
2599
|
caseId: z.string(),
|
|
2584
2600
|
evalId: z.string(),
|
|
2585
2601
|
status: z.enum([
|
|
@@ -2657,6 +2673,10 @@ const scoreTraceSchema = z.object({
|
|
|
2657
2673
|
});
|
|
2658
2674
|
/** Schema for the detailed payload shown when opening a specific case. */
|
|
2659
2675
|
const caseDetailSchema = z.object({
|
|
2676
|
+
/** Stable eval identity for this case detail. */
|
|
2677
|
+
evalKey: z.string().optional(),
|
|
2678
|
+
/** Stable case identity for this case detail. */
|
|
2679
|
+
caseKey: z.string().optional(),
|
|
2660
2680
|
caseId: z.string(),
|
|
2661
2681
|
evalId: z.string(),
|
|
2662
2682
|
status: z.enum([
|
|
@@ -2694,6 +2714,36 @@ const caseDetailSchema = z.object({
|
|
|
2694
2714
|
*/
|
|
2695
2715
|
cacheRefs: z.array(traceCacheRefSchema).default([])
|
|
2696
2716
|
});
|
|
2717
|
+
/** Schema for discovery problems that should be shown before running evals. */
|
|
2718
|
+
const discoveryIssueSchema = z.object({
|
|
2719
|
+
type: z.enum(["duplicate-eval-id"]),
|
|
2720
|
+
severity: z.enum(["error"]),
|
|
2721
|
+
filePath: z.string(),
|
|
2722
|
+
evalId: z.string(),
|
|
2723
|
+
message: z.string()
|
|
2724
|
+
});
|
|
2725
|
+
//#endregion
|
|
2726
|
+
//#region ../shared/src/evalIdentity.ts
|
|
2727
|
+
/** Build the stable identity for one eval inside a workspace. */
|
|
2728
|
+
function buildEvalKey(params) {
|
|
2729
|
+
return `${encodeURIComponent(params.filePath)}#${encodeURIComponent(params.evalId)}`;
|
|
2730
|
+
}
|
|
2731
|
+
/** Build the stable identity for one eval case inside a workspace. */
|
|
2732
|
+
function buildCaseKey(params) {
|
|
2733
|
+
return [
|
|
2734
|
+
encodeURIComponent(params.filePath),
|
|
2735
|
+
encodeURIComponent(params.evalId),
|
|
2736
|
+
encodeURIComponent(params.caseId)
|
|
2737
|
+
].join("#");
|
|
2738
|
+
}
|
|
2739
|
+
/** Return the collision-safe eval key stored on a row, falling back for legacy data. */
|
|
2740
|
+
function getCaseRowEvalKey(row) {
|
|
2741
|
+
return row.evalKey ?? row.evalId;
|
|
2742
|
+
}
|
|
2743
|
+
/** Return the collision-safe case key stored on a row, falling back for legacy data. */
|
|
2744
|
+
function getCaseRowCaseKey(row) {
|
|
2745
|
+
return row.caseKey ?? row.caseId;
|
|
2746
|
+
}
|
|
2697
2747
|
//#endregion
|
|
2698
2748
|
//#region ../shared/src/schemas/config.ts
|
|
2699
2749
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
@@ -2727,13 +2777,16 @@ const apiCallMetricFormatSchema = llmCallMetricFormatSchema;
|
|
|
2727
2777
|
const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
|
|
2728
2778
|
/** Where an API-call metric is rendered inside the API calls tab. */
|
|
2729
2779
|
const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
|
|
2780
|
+
const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
|
|
2730
2781
|
/**
|
|
2731
2782
|
* Schema for a single user-defined metric attached to LLM call rows.
|
|
2732
2783
|
*
|
|
2733
2784
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
2734
|
-
* with the configured `format` and `numberFormat`.
|
|
2735
|
-
*
|
|
2736
|
-
*
|
|
2785
|
+
* with the configured `format` and `numberFormat`. Use
|
|
2786
|
+
* `llmCalls.derivedAttributes` when a metric should read a value computed from
|
|
2787
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
2788
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
2789
|
+
* Defaults to `['body']` when omitted.
|
|
2737
2790
|
*/
|
|
2738
2791
|
const llmCallMetricSchema = z.object({
|
|
2739
2792
|
/** Display label for the metric row or header chip. */
|
|
@@ -2760,9 +2813,11 @@ const llmCallMetricSchema = z.object({
|
|
|
2760
2813
|
* Schema for a single user-defined metric attached to API call rows.
|
|
2761
2814
|
*
|
|
2762
2815
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
2763
|
-
* with the configured `format` and `numberFormat`.
|
|
2764
|
-
*
|
|
2765
|
-
*
|
|
2816
|
+
* with the configured `format` and `numberFormat`. Use
|
|
2817
|
+
* `apiCalls.derivedAttributes` when a metric should read a value computed from
|
|
2818
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
2819
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
2820
|
+
* Defaults to `['body']` when omitted.
|
|
2766
2821
|
*/
|
|
2767
2822
|
const apiCallMetricSchema = z.object({
|
|
2768
2823
|
/** Display label for the metric row or header chip. */
|
|
@@ -2839,6 +2894,13 @@ const llmCallsConfigSchema = z.object({
|
|
|
2839
2894
|
toolCalls: z.string().optional()
|
|
2840
2895
|
}).optional(),
|
|
2841
2896
|
/**
|
|
2897
|
+
* Derived attributes persisted onto every matching LLM span before
|
|
2898
|
+
* `deriveFromTracing`, default outputs, trace display, and call metrics read
|
|
2899
|
+
* the trace. Keys are dot-paths under `span.attributes`; return `undefined`
|
|
2900
|
+
* to skip writing the attribute for one span.
|
|
2901
|
+
*/
|
|
2902
|
+
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
2903
|
+
/**
|
|
2842
2904
|
* Model/provider pricing registry used to calculate LLM-call costs from
|
|
2843
2905
|
* token counts. Built-in LLM cost fields are only derived from this registry.
|
|
2844
2906
|
*/
|
|
@@ -2867,6 +2929,13 @@ const apiCallsConfigSchema = z.object({
|
|
|
2867
2929
|
durationMs: z.string().optional(),
|
|
2868
2930
|
error: z.string().optional()
|
|
2869
2931
|
}).optional(),
|
|
2932
|
+
/**
|
|
2933
|
+
* Derived attributes persisted onto every matching API span before trace
|
|
2934
|
+
* display and call metrics read the trace. Keys are dot-paths under
|
|
2935
|
+
* `span.attributes`; return `undefined` to skip writing the attribute for
|
|
2936
|
+
* one span.
|
|
2937
|
+
*/
|
|
2938
|
+
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
2870
2939
|
/** Custom user-defined metrics surfaced on each API call. */
|
|
2871
2940
|
metrics: z.array(apiCallMetricSchema).optional()
|
|
2872
2941
|
});
|
|
@@ -2898,6 +2967,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2898
2967
|
reasoning: "reasoning",
|
|
2899
2968
|
toolCalls: "toolCalls"
|
|
2900
2969
|
},
|
|
2970
|
+
derivedAttributes: [],
|
|
2901
2971
|
metrics: [],
|
|
2902
2972
|
pricing: []
|
|
2903
2973
|
};
|
|
@@ -2921,8 +2991,35 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
2921
2991
|
durationMs: "durationMs",
|
|
2922
2992
|
error: "error"
|
|
2923
2993
|
},
|
|
2994
|
+
derivedAttributes: [],
|
|
2924
2995
|
metrics: []
|
|
2925
2996
|
};
|
|
2997
|
+
function resolveDerivedAttributes(input) {
|
|
2998
|
+
return Object.entries(input ?? {}).map(([path, compute]) => ({
|
|
2999
|
+
path,
|
|
3000
|
+
compute
|
|
3001
|
+
}));
|
|
3002
|
+
}
|
|
3003
|
+
function resolveLlmCallMetric(metric) {
|
|
3004
|
+
return {
|
|
3005
|
+
label: metric.label,
|
|
3006
|
+
tooltip: metric.tooltip,
|
|
3007
|
+
path: metric.path,
|
|
3008
|
+
format: metric.format ?? "string",
|
|
3009
|
+
numberFormat: metric.numberFormat,
|
|
3010
|
+
placements: metric.placements ? [...metric.placements] : ["body"]
|
|
3011
|
+
};
|
|
3012
|
+
}
|
|
3013
|
+
function resolveApiCallMetric(metric) {
|
|
3014
|
+
return {
|
|
3015
|
+
label: metric.label,
|
|
3016
|
+
tooltip: metric.tooltip,
|
|
3017
|
+
path: metric.path,
|
|
3018
|
+
format: metric.format ?? "string",
|
|
3019
|
+
numberFormat: metric.numberFormat,
|
|
3020
|
+
placements: metric.placements ? [...metric.placements] : ["body"]
|
|
3021
|
+
};
|
|
3022
|
+
}
|
|
2926
3023
|
/**
|
|
2927
3024
|
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
2928
3025
|
* by the UI to derive the LLM calls tab.
|
|
@@ -2942,14 +3039,8 @@ function resolveLlmCallsConfig(input) {
|
|
|
2942
3039
|
...DEFAULT_LLM_CALLS_CONFIG.attributes,
|
|
2943
3040
|
...input?.attributes
|
|
2944
3041
|
},
|
|
2945
|
-
|
|
2946
|
-
|
|
2947
|
-
tooltip: m.tooltip,
|
|
2948
|
-
path: m.path,
|
|
2949
|
-
format: m.format ?? "string",
|
|
2950
|
-
numberFormat: m.numberFormat,
|
|
2951
|
-
placements: m.placements ? [...m.placements] : ["body"]
|
|
2952
|
-
})),
|
|
3042
|
+
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3043
|
+
metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
|
|
2953
3044
|
pricing: (input?.pricing ?? []).map((p) => ({
|
|
2954
3045
|
model: p.model,
|
|
2955
3046
|
provider: p.provider,
|
|
@@ -2979,14 +3070,8 @@ function resolveApiCallsConfig(input) {
|
|
|
2979
3070
|
...DEFAULT_API_CALLS_CONFIG.attributes,
|
|
2980
3071
|
...input?.attributes
|
|
2981
3072
|
},
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
tooltip: m.tooltip,
|
|
2985
|
-
path: m.path,
|
|
2986
|
-
format: m.format ?? "string",
|
|
2987
|
-
numberFormat: m.numberFormat,
|
|
2988
|
-
placements: m.placements ? [...m.placements] : ["body"]
|
|
2989
|
-
}))
|
|
3073
|
+
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3074
|
+
metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
|
|
2990
3075
|
};
|
|
2991
3076
|
}
|
|
2992
3077
|
/** Zod schema for validating `agent-evals.config.ts` input. */
|
|
@@ -3037,8 +3122,8 @@ const runManifestSchema = z.object({
|
|
|
3037
3122
|
*/
|
|
3038
3123
|
commitSha: z.string().nullable().optional().default(null),
|
|
3039
3124
|
/**
|
|
3040
|
-
* Eval-file fingerprints captured for this run, keyed by eval
|
|
3041
|
-
* persisted runs may
|
|
3125
|
+
* Eval-file fingerprints captured for this run, keyed by exact eval key.
|
|
3126
|
+
* Older persisted runs may use authored eval ids or omit this field.
|
|
3042
3127
|
*/
|
|
3043
3128
|
evalSourceFingerprints: z.record(z.string(), z.string()).optional().default({}),
|
|
3044
3129
|
target: z.object({
|
|
@@ -3047,6 +3132,10 @@ const runManifestSchema = z.object({
|
|
|
3047
3132
|
"evalIds",
|
|
3048
3133
|
"caseIds"
|
|
3049
3134
|
]),
|
|
3135
|
+
/** Exact stable eval identities (`filePath + evalId`) selected by UI/API callers. */
|
|
3136
|
+
evalKeys: z.array(z.string()).optional(),
|
|
3137
|
+
/** Workspace-relative file paths or glob patterns used to filter selected evals. */
|
|
3138
|
+
files: z.array(z.string()).optional(),
|
|
3050
3139
|
evalIds: z.array(z.string()).optional(),
|
|
3051
3140
|
caseIds: z.array(z.string()).optional()
|
|
3052
3141
|
}),
|
|
@@ -3206,7 +3295,7 @@ function getEvalTitle(evalLike) {
|
|
|
3206
3295
|
}
|
|
3207
3296
|
//#endregion
|
|
3208
3297
|
//#region ../shared/src/utils/getNestedAttribute.ts
|
|
3209
|
-
function isRecord$
|
|
3298
|
+
function isRecord$4(value) {
|
|
3210
3299
|
return typeof value === "object" && value !== null;
|
|
3211
3300
|
}
|
|
3212
3301
|
/**
|
|
@@ -3221,12 +3310,84 @@ function getNestedAttribute(value, path) {
|
|
|
3221
3310
|
const parts = path.split(".");
|
|
3222
3311
|
let current = value;
|
|
3223
3312
|
for (const part of parts) {
|
|
3224
|
-
if (!isRecord$
|
|
3313
|
+
if (!isRecord$4(current) || !(part in current)) return;
|
|
3225
3314
|
current = current[part];
|
|
3226
3315
|
}
|
|
3227
3316
|
return current;
|
|
3228
3317
|
}
|
|
3229
3318
|
//#endregion
|
|
3319
|
+
//#region ../shared/src/utils/deriveCallAttributes.ts
|
|
3320
|
+
function isRecord$3(value) {
|
|
3321
|
+
return typeof value === "object" && value !== null;
|
|
3322
|
+
}
|
|
3323
|
+
function mergeNestedAttribute$1(value, path, attributeValue) {
|
|
3324
|
+
const root = value === void 0 ? {} : { ...value };
|
|
3325
|
+
const parts = path.split(".");
|
|
3326
|
+
let current = root;
|
|
3327
|
+
for (const [index, part] of parts.entries()) {
|
|
3328
|
+
if (index === parts.length - 1) {
|
|
3329
|
+
current[part] = attributeValue;
|
|
3330
|
+
continue;
|
|
3331
|
+
}
|
|
3332
|
+
const nextValue = current[part];
|
|
3333
|
+
const nextRecord = isRecord$3(nextValue) ? { ...nextValue } : {};
|
|
3334
|
+
current[part] = nextRecord;
|
|
3335
|
+
current = nextRecord;
|
|
3336
|
+
}
|
|
3337
|
+
return root;
|
|
3338
|
+
}
|
|
3339
|
+
function applyDerivedAttributesForKind(params) {
|
|
3340
|
+
let attributes = params.span.attributes;
|
|
3341
|
+
for (const derivedAttribute of params.derivedAttributes) {
|
|
3342
|
+
if (derivedAttribute.compute === void 0) continue;
|
|
3343
|
+
const span = {
|
|
3344
|
+
...params.span,
|
|
3345
|
+
attributes
|
|
3346
|
+
};
|
|
3347
|
+
const value = (() => {
|
|
3348
|
+
try {
|
|
3349
|
+
return derivedAttribute.compute({
|
|
3350
|
+
attributes,
|
|
3351
|
+
span,
|
|
3352
|
+
get: (path) => getNestedAttribute(attributes, path)
|
|
3353
|
+
});
|
|
3354
|
+
} catch {
|
|
3355
|
+
return;
|
|
3356
|
+
}
|
|
3357
|
+
})();
|
|
3358
|
+
if (value === void 0) continue;
|
|
3359
|
+
attributes = mergeNestedAttribute$1(attributes, derivedAttribute.path, value);
|
|
3360
|
+
}
|
|
3361
|
+
if (attributes === params.span.attributes) return params.span;
|
|
3362
|
+
return {
|
|
3363
|
+
...params.span,
|
|
3364
|
+
attributes
|
|
3365
|
+
};
|
|
3366
|
+
}
|
|
3367
|
+
/**
|
|
3368
|
+
* Persist configured derived attributes onto matching LLM/API spans.
|
|
3369
|
+
*
|
|
3370
|
+
* These derived attributes are applied before trace consumers run, so
|
|
3371
|
+
* `deriveFromTracing`, default usage extraction, trace display, and call
|
|
3372
|
+
* metrics can all read them by normal dot-path lookup.
|
|
3373
|
+
*/
|
|
3374
|
+
function applyDerivedCallAttributes(params) {
|
|
3375
|
+
const llmKinds = new Set(params.llmCallsConfig.kinds);
|
|
3376
|
+
const apiKinds = new Set(params.apiCallsConfig.kinds);
|
|
3377
|
+
return params.spans.map((span) => {
|
|
3378
|
+
let nextSpan = span;
|
|
3379
|
+
if (llmKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
|
|
3380
|
+
span: nextSpan,
|
|
3381
|
+
derivedAttributes: params.llmCallsConfig.derivedAttributes
|
|
3382
|
+
});
|
|
3383
|
+
if (apiKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
|
|
3384
|
+
span: nextSpan,
|
|
3385
|
+
derivedAttributes: params.apiCallsConfig.derivedAttributes
|
|
3386
|
+
});
|
|
3387
|
+
return nextSpan;
|
|
3388
|
+
});
|
|
3389
|
+
}
|
|
3390
|
+
//#endregion
|
|
3230
3391
|
//#region ../shared/src/utils/extractLlmCalls.ts
|
|
3231
3392
|
function readNumber$2(attributes, path) {
|
|
3232
3393
|
const raw = getNestedAttribute(attributes, path);
|
|
@@ -3701,6 +3862,10 @@ const createRunRequestSchema = z.object({
|
|
|
3701
3862
|
"evalIds",
|
|
3702
3863
|
"caseIds"
|
|
3703
3864
|
]),
|
|
3865
|
+
/** Exact stable eval identities (`filePath + evalId`) selected by UI/API callers. */
|
|
3866
|
+
evalKeys: z.array(z.string()).optional(),
|
|
3867
|
+
/** Workspace-relative file paths or glob patterns used to filter selected evals. */
|
|
3868
|
+
files: z.array(z.string()).optional(),
|
|
3704
3869
|
evalIds: z.array(z.string()).optional(),
|
|
3705
3870
|
caseIds: z.array(z.string()).optional()
|
|
3706
3871
|
}),
|
|
@@ -4671,7 +4836,8 @@ function addDefaultOutputs(params) {
|
|
|
4671
4836
|
//#region ../runner/src/discovery.ts
|
|
4672
4837
|
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
4673
4838
|
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
4674
|
-
|
|
4839
|
+
/** Parse static eval metadata and discovery issues from one eval file. */
|
|
4840
|
+
function parseEvalDiscovery(filePath, content) {
|
|
4675
4841
|
const metas = [];
|
|
4676
4842
|
let searchIndex = 0;
|
|
4677
4843
|
while (searchIndex < content.length) {
|
|
@@ -4694,7 +4860,20 @@ function parseEvalMetas(filePath, content) {
|
|
|
4694
4860
|
}
|
|
4695
4861
|
searchIndex = extracted.nextIndex;
|
|
4696
4862
|
}
|
|
4697
|
-
|
|
4863
|
+
const countsById = /* @__PURE__ */ new Map();
|
|
4864
|
+
for (const meta of metas) countsById.set(meta.id, (countsById.get(meta.id) ?? 0) + 1);
|
|
4865
|
+
const duplicateIds = new Set([...countsById].filter(([, count]) => count > 1).map(([id]) => id));
|
|
4866
|
+
const issues = [...duplicateIds].map((evalId) => ({
|
|
4867
|
+
type: "duplicate-eval-id",
|
|
4868
|
+
severity: "error",
|
|
4869
|
+
filePath,
|
|
4870
|
+
evalId,
|
|
4871
|
+
message: `Duplicate eval id "${evalId}" in ${filePath}. Eval ids must be unique within one file.`
|
|
4872
|
+
}));
|
|
4873
|
+
return {
|
|
4874
|
+
metas: metas.filter((meta) => !duplicateIds.has(meta.id)),
|
|
4875
|
+
issues
|
|
4876
|
+
};
|
|
4698
4877
|
}
|
|
4699
4878
|
function extractDefineEvalObject(content, defineEvalIndex) {
|
|
4700
4879
|
const openParenIndex = content.indexOf("(", defineEvalIndex);
|
|
@@ -4813,40 +4992,6 @@ function getRunFreshnessTimestamp(manifest) {
|
|
|
4813
4992
|
return manifest.endedAt ?? manifest.startedAt;
|
|
4814
4993
|
}
|
|
4815
4994
|
//#endregion
|
|
4816
|
-
//#region ../runner/src/evalSummaries.ts
|
|
4817
|
-
/** Build the API/UI summary payload for one discovered eval. */
|
|
4818
|
-
function buildEvalSummary(params) {
|
|
4819
|
-
const { meta, config, gitState, latestRun, lastRunStatus } = params;
|
|
4820
|
-
const { sourceFingerprint, ...summaryMeta } = meta;
|
|
4821
|
-
const freshness = deriveEvalFreshness({
|
|
4822
|
-
latestRun,
|
|
4823
|
-
gitState,
|
|
4824
|
-
currentEvalSourceFingerprint: sourceFingerprint,
|
|
4825
|
-
staleAfterDays: config.staleAfterDays ?? 14
|
|
4826
|
-
});
|
|
4827
|
-
return {
|
|
4828
|
-
...summaryMeta,
|
|
4829
|
-
stale: freshness.stale,
|
|
4830
|
-
outdated: freshness.outdated,
|
|
4831
|
-
freshnessStatus: freshness.freshnessStatus,
|
|
4832
|
-
latestRunAt: latestRun?.startedAt ?? null,
|
|
4833
|
-
latestRunCommitSha: latestRun?.commitSha ?? null,
|
|
4834
|
-
currentCommitSha: gitState.commitSha,
|
|
4835
|
-
lastRunStatus
|
|
4836
|
-
};
|
|
4837
|
-
}
|
|
4838
|
-
/** Resolve which eval ids a run request should mark as the latest run. */
|
|
4839
|
-
function getTargetEvalIds(params) {
|
|
4840
|
-
const { request, sortedEvalIds, knownEvalIds } = params;
|
|
4841
|
-
if (request.target.evalIds && request.target.evalIds.length > 0) return request.target.evalIds.filter((evalId) => knownEvalIds.has(evalId));
|
|
4842
|
-
return sortedEvalIds;
|
|
4843
|
-
}
|
|
4844
|
-
/** Write one latest-run snapshot to each targeted eval id. */
|
|
4845
|
-
function setLatestRunInfoMap(params) {
|
|
4846
|
-
const { latestRunInfoMap, evalIds, info } = params;
|
|
4847
|
-
for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
|
|
4848
|
-
}
|
|
4849
|
-
//#endregion
|
|
4850
4995
|
//#region ../runner/src/outputArtifacts.ts
|
|
4851
4996
|
const mimeTypeExtensionMap = {
|
|
4852
4997
|
"application/json": ".json",
|
|
@@ -4957,9 +5102,9 @@ function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
|
|
|
4957
5102
|
return caseRow.status === "error" ? "error" : "pass";
|
|
4958
5103
|
}
|
|
4959
5104
|
function runTouchesEval(params) {
|
|
4960
|
-
if (params.caseRows.some((caseRow) => caseRow.evalId === params.evalId)) return true;
|
|
5105
|
+
if (params.caseRows.some((caseRow) => getCaseRowEvalKey(caseRow) === params.evalKey || caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) return true;
|
|
4961
5106
|
if (params.target.mode === "all") return params.evalExists;
|
|
4962
|
-
if (params.target.mode === "evalIds") return params.target.evalIds?.includes(params.evalId) ?? false;
|
|
5107
|
+
if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? params.target.evalIds?.includes(params.evalId ?? params.evalKey) ?? false;
|
|
4963
5108
|
return false;
|
|
4964
5109
|
}
|
|
4965
5110
|
async function recomputeEvalStatusesInRuns(params) {
|
|
@@ -4968,14 +5113,15 @@ async function recomputeEvalStatusesInRuns(params) {
|
|
|
4968
5113
|
if (!runTouchesEval({
|
|
4969
5114
|
target: run.manifest.target,
|
|
4970
5115
|
caseRows: run.cases,
|
|
5116
|
+
evalKey: params.evalKey,
|
|
4971
5117
|
evalId: params.evalId,
|
|
4972
5118
|
evalExists: params.evalExists
|
|
4973
5119
|
})) continue;
|
|
4974
5120
|
if (run.manifest.status === "running") continue;
|
|
4975
5121
|
let changed = false;
|
|
4976
5122
|
for (const caseRow of run.cases) {
|
|
4977
|
-
if (caseRow.evalId
|
|
4978
|
-
const caseDetail = run.caseDetails.get(caseRow
|
|
5123
|
+
if (getCaseRowEvalKey(caseRow) !== params.evalKey && !(caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) continue;
|
|
5124
|
+
const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
|
|
4979
5125
|
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
|
|
4980
5126
|
if (caseRow.status === nextStatus) continue;
|
|
4981
5127
|
caseRow.status = nextStatus;
|
|
@@ -5043,8 +5189,8 @@ async function loadPersistedRunSnapshots(localStateDir) {
|
|
|
5043
5189
|
}
|
|
5044
5190
|
return snapshots;
|
|
5045
5191
|
}
|
|
5046
|
-
async function persistCaseDetail(runDir, caseDetail) {
|
|
5047
|
-
await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(
|
|
5192
|
+
async function persistCaseDetail(runDir, caseDetail, fileId = caseDetail.caseId) {
|
|
5193
|
+
await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(fileId)}.json`), JSON.stringify(caseDetail, null, 2));
|
|
5048
5194
|
}
|
|
5049
5195
|
function getLastRunStatuses(params) {
|
|
5050
5196
|
const latestRunInfos = getLatestRunInfos(params);
|
|
@@ -5057,14 +5203,15 @@ function getLastRunStatuses(params) {
|
|
|
5057
5203
|
function getLatestRunInfos(params) {
|
|
5058
5204
|
const { runs, knownEvals } = params;
|
|
5059
5205
|
const knownEvalMetas = [...knownEvals];
|
|
5060
|
-
const
|
|
5206
|
+
const evalIdByKey = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.id]));
|
|
5207
|
+
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
5061
5208
|
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
5062
5209
|
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
5063
|
-
for (const run of orderedRuns) for (const
|
|
5064
|
-
status: getEvalStatusForRun(run,
|
|
5210
|
+
for (const run of orderedRuns) for (const evalKey of getRunEvalKeys(run, knownEvalMetas)) latestRunInfos.set(evalKey, {
|
|
5211
|
+
status: getEvalStatusForRun(run, evalKey, evalIdByKey.get(evalKey), manualScoreKeysByEval.get(evalKey) ?? []),
|
|
5065
5212
|
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
5066
5213
|
commitSha: run.manifest.commitSha ?? null,
|
|
5067
|
-
evalSourceFingerprint: run.manifest.evalSourceFingerprints[
|
|
5214
|
+
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? run.manifest.evalSourceFingerprints[evalIdByKey.get(evalKey) ?? ""] ?? null
|
|
5068
5215
|
});
|
|
5069
5216
|
return latestRunInfos;
|
|
5070
5217
|
}
|
|
@@ -5117,18 +5264,25 @@ async function readCaseDetails(runDir) {
|
|
|
5117
5264
|
if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
|
|
5118
5265
|
const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
|
|
5119
5266
|
if (!detail) continue;
|
|
5120
|
-
caseDetails.set(detail.caseId, detail);
|
|
5267
|
+
caseDetails.set(detail.caseKey ?? detail.caseId, detail);
|
|
5121
5268
|
}
|
|
5122
5269
|
return caseDetails;
|
|
5123
5270
|
}
|
|
5124
|
-
function
|
|
5125
|
-
const
|
|
5126
|
-
|
|
5127
|
-
|
|
5128
|
-
|
|
5129
|
-
|
|
5130
|
-
|
|
5131
|
-
|
|
5271
|
+
function getRunEvalKeys(run, knownEvals) {
|
|
5272
|
+
const knownEvalMetas = [...knownEvals];
|
|
5273
|
+
const evalKeys = new Set(run.cases.map(getCaseRowEvalKey));
|
|
5274
|
+
for (const caseRow of run.cases) {
|
|
5275
|
+
if (caseRow.evalKey !== void 0) continue;
|
|
5276
|
+
for (const evalMeta of knownEvalMetas) if (evalMeta.id === caseRow.evalId) evalKeys.add(evalMeta.key);
|
|
5277
|
+
}
|
|
5278
|
+
if (run.manifest.target.mode === "evalIds") {
|
|
5279
|
+
for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
|
|
5280
|
+
for (const evalId of run.manifest.target.evalIds ?? []) for (const evalMeta of knownEvalMetas) if (evalMeta.id === evalId) evalKeys.add(evalMeta.key);
|
|
5281
|
+
} else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
|
|
5282
|
+
return [...evalKeys];
|
|
5283
|
+
}
|
|
5284
|
+
function getEvalStatusForRun(run, evalKey, evalId, manualScoreKeys) {
|
|
5285
|
+
const evalCases = run.cases.filter((caseRow) => getCaseRowEvalKey(caseRow) === evalKey || caseRow.evalKey === void 0 && caseRow.evalId === evalId);
|
|
5132
5286
|
if (evalCases.length > 0) {
|
|
5133
5287
|
if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
|
|
5134
5288
|
return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
|
|
@@ -5299,8 +5453,7 @@ function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
|
|
|
5299
5453
|
}
|
|
5300
5454
|
//#endregion
|
|
5301
5455
|
//#region ../runner/src/runExecution.ts
|
|
5302
|
-
function filterEvalCases(cases,
|
|
5303
|
-
if (evalIds && evalIds.length > 0 && !evalIds.includes(evalId)) return [];
|
|
5456
|
+
function filterEvalCases(cases, caseIds) {
|
|
5304
5457
|
if (!caseIds || caseIds.length === 0) return cases;
|
|
5305
5458
|
const selectedCaseIds = new Set(caseIds);
|
|
5306
5459
|
return cases.filter((evalCase) => selectedCaseIds.has(evalCase.id));
|
|
@@ -5329,13 +5482,18 @@ async function callWithUnknownResult(fn, args) {
|
|
|
5329
5482
|
return await Reflect.apply(fn, void 0, args);
|
|
5330
5483
|
}
|
|
5331
5484
|
async function runCase(params) {
|
|
5332
|
-
const { evalDef, evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5485
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5333
5486
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
5334
5487
|
evalId,
|
|
5335
5488
|
evalFilePath,
|
|
5336
5489
|
caseId: evalCase.id,
|
|
5337
5490
|
workspaceRoot
|
|
5338
5491
|
});
|
|
5492
|
+
const caseKey = buildCaseKey({
|
|
5493
|
+
filePath: evalFileRelativePath,
|
|
5494
|
+
evalId,
|
|
5495
|
+
caseId: evalCase.id
|
|
5496
|
+
});
|
|
5339
5497
|
const { scope, error: executeError } = await runInEvalScope(evalCase.id, async () => {
|
|
5340
5498
|
const execute = async () => {
|
|
5341
5499
|
await Reflect.apply(evalDef.execute, evalDef, [{
|
|
@@ -5361,7 +5519,12 @@ async function runCase(params) {
|
|
|
5361
5519
|
startTime: evalDef.startTime,
|
|
5362
5520
|
freezeTime: evalDef.freezeTime
|
|
5363
5521
|
});
|
|
5364
|
-
const
|
|
5522
|
+
const spansWithDerivedAttributes = applyDerivedCallAttributes({
|
|
5523
|
+
spans: scope.spans,
|
|
5524
|
+
llmCallsConfig,
|
|
5525
|
+
apiCallsConfig
|
|
5526
|
+
});
|
|
5527
|
+
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
5365
5528
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5366
5529
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5367
5530
|
if (!nonAssertError && evalDef.deriveFromTracing) {
|
|
@@ -5383,7 +5546,7 @@ async function runCase(params) {
|
|
|
5383
5546
|
}
|
|
5384
5547
|
if (!nonAssertError) addDefaultOutputs({
|
|
5385
5548
|
outputs: scope.outputs,
|
|
5386
|
-
spans:
|
|
5549
|
+
spans: spansWithDerivedAttributes,
|
|
5387
5550
|
llmCallsConfig,
|
|
5388
5551
|
apiCallsConfig,
|
|
5389
5552
|
globalRemove: globalRemoveDefaultConfig,
|
|
@@ -5471,7 +5634,7 @@ async function runCase(params) {
|
|
|
5471
5634
|
}
|
|
5472
5635
|
}
|
|
5473
5636
|
const status = nonAssertError ? "error" : passed ? "pass" : "fail";
|
|
5474
|
-
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(
|
|
5637
|
+
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
5475
5638
|
const columns = {};
|
|
5476
5639
|
const columnOverrides = mergeDefaultColumns({
|
|
5477
5640
|
columns: evalDef.columns,
|
|
@@ -5496,6 +5659,8 @@ async function runCase(params) {
|
|
|
5496
5659
|
stack: nonAssertError.stack
|
|
5497
5660
|
} : null;
|
|
5498
5661
|
const caseDetail = {
|
|
5662
|
+
evalKey,
|
|
5663
|
+
caseKey,
|
|
5499
5664
|
caseId: evalCase.id,
|
|
5500
5665
|
evalId,
|
|
5501
5666
|
status,
|
|
@@ -5581,6 +5746,56 @@ async function executeQueuedCase(params) {
|
|
|
5581
5746
|
await queuedCase.onComplete(result);
|
|
5582
5747
|
}
|
|
5583
5748
|
//#endregion
|
|
5749
|
+
//#region ../runner/src/targeting.ts
|
|
5750
|
+
/**
 * Escape every regular-expression metacharacter in `value` so that the
 * result matches the input literally when embedded in a `RegExp` source.
 *
 * @param {string} value - Literal text to embed in a pattern.
 * @returns {string} `value` with each metacharacter prefixed by a backslash.
 */
function escapeRegex(value) {
	// `*` is part of the set so the helper is correct on its own and does not
	// depend on callers filtering wildcards out before escaping.
	return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
|
|
5753
|
+
// Compile a glob pattern into an anchored regular expression.
//
// Supported syntax:
//   - a whole `**` path segment followed by `/` matches zero or more
//     directories, so `evals/` + `**` + `/x.ts` matches both `evals/x.ts`
//     and `evals/a/b/x.ts`;
//   - any other `**` matches any run of characters, including `/`;
//   - `*` matches any run of characters inside one path segment;
//   - `?` matches exactly one character other than `/`;
//   - every other character matches itself.
// Backslashes in the pattern are treated as `/` separators.
//
// @param {string} pattern - Glob pattern to compile.
// @returns {RegExp} Regular expression anchored at both ends.
function globToRegex(pattern) {
	const normalized = pattern.replaceAll("\\", "/");
	let regex = "^";
	for (let i = 0; i < normalized.length; i++) {
		const char = normalized[i];
		if (char === "*" && normalized[i + 1] === "*") {
			const startsSegment = i === 0 || normalized[i - 1] === "/";
			if (startsSegment && normalized[i + 2] === "/") {
				// Globstar segment: the directory prefix is optional, so the
				// pattern also matches when no intermediate directory exists.
				regex += "(?:.*/)?";
				i += 2;
			} else {
				regex += ".*";
				i++;
			}
		} else if (char === "*") regex += "[^/]*";
		else if (char === "?") regex += "[^/]";
		else regex += escapeRegex(char ?? "");
	}
	regex += "$";
	return new RegExp(regex);
}
|
|
5769
|
+
/**
 * Check whether `filePath` is selected by `pattern`, either as an exact path
 * or as a glob.
 *
 * Both arguments are compared with `/` separators, so a backslash-separated
 * path matches the same patterns as its forward-slash form.
 *
 * @param {string} pattern - Exact path or glob pattern.
 * @param {string} filePath - Path to test against the pattern.
 * @returns {boolean} True when the path equals or matches the pattern.
 */
function fileMatches(pattern, filePath) {
	const normalizedPattern = pattern.replaceAll("\\", "/");
	const normalizedPath = filePath.replaceAll("\\", "/");
	// Exact match first: no regex needs to be compiled for a plain path.
	if (normalizedPattern === normalizedPath) return true;
	return globToRegex(normalizedPattern).test(normalizedPath);
}
|
|
5774
|
+
/**
 * Decide whether an eval passes the file filter of a run target.
 *
 * @param {{ filePath: string }} evalMeta - Discovered eval to test.
 * @param {string[] | undefined} files - Paths or globs; an absent or empty
 *   list means "no file filter".
 * @returns {boolean} True when unfiltered or when any entry matches.
 */
function matchesFiles(evalMeta, files) {
	if (files === void 0) return true;
	if (files.length === 0) return true;
	const selectsEvalFile = (file) => fileMatches(file, evalMeta.filePath);
	return files.some(selectsEvalFile);
}
|
|
5778
|
+
/**
 * Decide whether an eval passes the eval-id filter of a run target.
 *
 * @param {{ id: string }} evalMeta - Discovered eval to test.
 * @param {string[] | undefined} evalIds - Allowed ids; an absent or empty
 *   list means "no id filter".
 * @returns {boolean} True when unfiltered or when the id is listed.
 */
function matchesEvalIds(evalMeta, evalIds) {
	const unfiltered = evalIds === void 0 || evalIds.length === 0;
	return unfiltered || evalIds.includes(evalMeta.id);
}
|
|
5782
|
+
/**
 * Decide whether an eval passes the eval-key filter of a run target.
 *
 * @param {{ key: string }} evalMeta - Discovered eval to test.
 * @param {string[] | undefined} evalKeys - Allowed keys; an absent or empty
 *   list means "no key filter".
 * @returns {boolean} True when unfiltered or when the key is listed.
 */
function matchesEvalKeys(evalMeta, evalKeys) {
	const hasKeyFilter = !(evalKeys === void 0 || evalKeys.length === 0);
	return hasKeyFilter ? evalKeys.includes(evalMeta.key) : true;
}
|
|
5786
|
+
/** Return the discovered evals selected by a run target. */
|
|
5787
|
+
function getTargetEvals(params) {
|
|
5788
|
+
const { target } = params.request;
|
|
5789
|
+
return [...params.evals].filter((evalMeta) => matchesEvalKeys(evalMeta, target.evalKeys)).filter((evalMeta) => matchesEvalIds(evalMeta, target.evalIds)).filter((evalMeta) => matchesFiles(evalMeta, target.files)).toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
5790
|
+
}
|
|
5791
|
+
/**
 * Resolve which exact eval keys a run request can affect.
 *
 * Delegates selection to `getTargetEvals` and projects each selected eval
 * onto its `key`.
 */
function getTargetEvalKeys(params) {
	const { sortedEvals, request } = params;
	const targetedEvals = getTargetEvals({
		evals: sortedEvals,
		request
	});
	return targetedEvals.map(({ key }) => key);
}
|
|
5798
|
+
//#endregion
|
|
5584
5799
|
//#region ../runner/src/runOrchestration.ts
|
|
5585
5800
|
/**
|
|
5586
5801
|
* Ranks case statuses from worst to best. Used to order trial attempts so the
|
|
@@ -5631,6 +5846,20 @@ function formatUnknownErrorDetails(error) {
|
|
|
5631
5846
|
if (typeof error === "string") return error;
|
|
5632
5847
|
return String(error);
|
|
5633
5848
|
}
|
|
5849
|
+
function findDuplicateCaseIds(cases) {
|
|
5850
|
+
const counts = /* @__PURE__ */ new Map();
|
|
5851
|
+
for (const evalCase of cases) counts.set(evalCase.id, (counts.get(evalCase.id) ?? 0) + 1);
|
|
5852
|
+
return [...counts].filter(([, count]) => count > 1).map(([caseId]) => caseId).toSorted();
|
|
5853
|
+
}
|
|
5854
|
+
/**
 * Find case ids that belong to more than one prepared eval.
 *
 * Owners are identified as `<filePath>#<evalId>`; a case id is ambiguous when
 * it has at least two distinct owners.
 *
 * @param {Iterable<{ evalMeta: { filePath: string, id: string }, preparedCases: Iterable<{ caseId: string }> }>} preparedEvals
 * @returns {string[]} One `caseId (owner, owner, ...)` entry per ambiguous id.
 */
function findAmbiguousTargetCaseIds(preparedEvals) {
	const ownersByCaseId = /* @__PURE__ */ new Map();
	for (const { evalMeta, preparedCases } of preparedEvals) {
		for (const { caseId } of preparedCases) {
			let owners = ownersByCaseId.get(caseId);
			if (!owners) {
				owners = /* @__PURE__ */ new Set();
				ownersByCaseId.set(caseId, owners);
			}
			owners.add(`${evalMeta.filePath}#${evalMeta.id}`);
		}
	}
	const ambiguous = [];
	for (const [caseId, owners] of ownersByCaseId) {
		if (owners.size <= 1) continue;
		ambiguous.push(`${caseId} (${Array.from(owners).join(", ")})`);
	}
	return ambiguous;
}
|
|
5634
5863
|
function buildRunErrorMessage(errors) {
|
|
5635
5864
|
return errors.map((entry) => {
|
|
5636
5865
|
const [firstLine, ...detailLines] = entry.details.split("\n");
|
|
@@ -5650,14 +5879,15 @@ async function finalizePreparedCase(params) {
|
|
|
5650
5879
|
scoreKeys: preparedEval.scoreKeys
|
|
5651
5880
|
});
|
|
5652
5881
|
if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
|
|
5882
|
+
const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
|
|
5653
5883
|
runState.cases.push(winningTrial.caseRow);
|
|
5654
|
-
runState.caseDetails.set(
|
|
5884
|
+
runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
|
|
5655
5885
|
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
5656
5886
|
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
5657
5887
|
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
5658
5888
|
else runState.summary.failedCases++;
|
|
5659
|
-
await writeFile(join(runDir, "traces", `${
|
|
5660
|
-
await persistCaseDetail(runDir, winningTrial.caseDetail);
|
|
5889
|
+
await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
5890
|
+
await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
|
|
5661
5891
|
onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
|
|
5662
5892
|
emitEvent(runState, {
|
|
5663
5893
|
type: "case.finished",
|
|
@@ -5668,20 +5898,24 @@ async function finalizePreparedCase(params) {
|
|
|
5668
5898
|
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
5669
5899
|
}
|
|
5670
5900
|
/**
 * Build the lookup key used to order a case row by its prepared position.
 *
 * The key joins the row's eval identity (`evalKey`, falling back to `evalId`
 * when the key is null or undefined) and its `caseId` with a NUL separator.
 *
 * @param {{ evalKey?: string | null, evalId: string, caseId: string }} caseRow
 * @returns {string} `<evalKey-or-evalId>\u0000<caseId>`.
 */
function getPreparedCaseOrderKey(caseRow) {
	const evalPart = caseRow.evalKey ?? caseRow.evalId;
	const separator = "\u0000";
	return `${evalPart}${separator}${caseRow.caseId}`;
}
|
|
5903
|
+
/**
 * Choose the file id under which a case's artifacts are written.
 *
 * The plain `caseId` is used unless the run already holds a row with the same
 * `caseId` but a different case key; in that case the case key is returned.
 *
 * @param {{ cases: Array<object> }} runState - Run state holding the rows recorded so far.
 * @param {{ caseId: string }} caseRow - Row whose artifacts are about to be written.
 * @returns {string} Either `caseRow.caseId` or the row's case key.
 */
function getCaseArtifactFileId(runState, caseRow) {
	const caseKey = getCaseRowCaseKey(caseRow);
	const collidesWithRecordedRow = runState.cases.some((existing) => {
		if (existing.caseId !== caseRow.caseId) return false;
		return getCaseRowCaseKey(existing) !== caseKey;
	});
	if (collidesWithRecordedRow) return caseKey;
	return caseRow.caseId;
}
|
|
5673
5907
|
/**
 * Sort `caseRows` in place so they follow the order in which their cases were
 * prepared.
 *
 * Positions are assigned by walking `preparedEvals` and their
 * `preparedCases` in order; rows with no prepared position sort last.
 *
 * @param {Array<object>} caseRows - Rows to reorder (mutated).
 * @param {Iterable<{ evalMeta: { key: string }, preparedCases: Iterable<{ caseId: string }> }>} preparedEvals
 * @returns {void}
 */
function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
	const positions = /* @__PURE__ */ new Map();
	let nextPosition = 0;
	for (const { evalMeta, preparedCases } of preparedEvals) {
		for (const { caseId } of preparedCases) {
			positions.set(`${evalMeta.key}\u0000${caseId}`, nextPosition);
			nextPosition += 1;
		}
	}
	const positionOf = (row) => positions.get(getPreparedCaseOrderKey(row)) ?? Number.MAX_SAFE_INTEGER;
	caseRows.sort((left, right) => positionOf(left) - positionOf(right));
}
|
|
5684
|
-
async function executeRun({ runState, request, runDir, config,
|
|
5918
|
+
async function executeRun({ runState, request, runDir, config, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
|
|
5685
5919
|
try {
|
|
5686
5920
|
const targetEvals = getTargetEvals(request);
|
|
5687
5921
|
emitEvent(runState, {
|
|
@@ -5710,10 +5944,10 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5710
5944
|
codeFingerprint = "";
|
|
5711
5945
|
}
|
|
5712
5946
|
if (codeFingerprint.length > 0) {
|
|
5713
|
-
runState.manifest.evalSourceFingerprints[evalMeta.
|
|
5947
|
+
runState.manifest.evalSourceFingerprints[evalMeta.key] = codeFingerprint;
|
|
5714
5948
|
evalMeta.sourceFingerprint = codeFingerprint;
|
|
5715
5949
|
} else {
|
|
5716
|
-
delete runState.manifest.evalSourceFingerprints[evalMeta.
|
|
5950
|
+
delete runState.manifest.evalSourceFingerprints[evalMeta.key];
|
|
5717
5951
|
evalMeta.sourceFingerprint = null;
|
|
5718
5952
|
}
|
|
5719
5953
|
try {
|
|
@@ -5734,10 +5968,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5734
5968
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5735
5969
|
await runInEvalRuntimeScope("cases", async () => {
|
|
5736
5970
|
await entry.use(async (evalDef) => {
|
|
5737
|
-
const
|
|
5971
|
+
const runnableCases = resolveRunnableEvalCases({
|
|
5738
5972
|
cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
|
|
5739
5973
|
evalId: evalMeta.id
|
|
5740
|
-
})
|
|
5974
|
+
});
|
|
5975
|
+
const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
|
|
5976
|
+
if (duplicateCaseIds.length > 0) throw new Error(`Duplicate case id${duplicateCaseIds.length === 1 ? "" : "s"} in ${evalMeta.filePath}#${evalMeta.id}: ${duplicateCaseIds.join(", ")}`);
|
|
5977
|
+
const cases = filterEvalCases(runnableCases, request.target.caseIds);
|
|
5741
5978
|
runState.summary.totalCases += cases.length;
|
|
5742
5979
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
5743
5980
|
evalDef,
|
|
@@ -5783,6 +6020,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5783
6020
|
const { caseDetail, caseRowUpdate } = await runCase({
|
|
5784
6021
|
evalDef,
|
|
5785
6022
|
evalId: evalMeta.id,
|
|
6023
|
+
evalKey: evalMeta.key,
|
|
5786
6024
|
evalCase,
|
|
5787
6025
|
globalTraceDisplay,
|
|
5788
6026
|
llmCallsConfig,
|
|
@@ -5795,6 +6033,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5795
6033
|
codeFingerprint,
|
|
5796
6034
|
moduleIsolation,
|
|
5797
6035
|
evalFilePath,
|
|
6036
|
+
evalFileRelativePath: evalMeta.filePath,
|
|
5798
6037
|
workspaceRoot,
|
|
5799
6038
|
artifactDir: join(runDir, "artifacts"),
|
|
5800
6039
|
runId: runState.manifest.id
|
|
@@ -5804,6 +6043,8 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5804
6043
|
caseRow: {
|
|
5805
6044
|
caseId: evalCase.id,
|
|
5806
6045
|
evalId: evalMeta.id,
|
|
6046
|
+
evalKey: evalMeta.key,
|
|
6047
|
+
caseKey: caseDetail.caseKey,
|
|
5807
6048
|
status: caseRowUpdate.status ?? "pending",
|
|
5808
6049
|
durationMs: caseRowUpdate.durationMs ?? null,
|
|
5809
6050
|
columns: caseRowUpdate.columns ?? {},
|
|
@@ -5839,16 +6080,23 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5839
6080
|
evalId: evalMeta.id,
|
|
5840
6081
|
details: formatUnknownErrorDetails(error)
|
|
5841
6082
|
});
|
|
5842
|
-
lastRunStatusMap.set(evalMeta.
|
|
5843
|
-
latestRunInfoMap.set(evalMeta.
|
|
6083
|
+
lastRunStatusMap.set(evalMeta.key, "error");
|
|
6084
|
+
latestRunInfoMap.set(evalMeta.key, {
|
|
5844
6085
|
status: "error",
|
|
5845
6086
|
startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
|
|
5846
6087
|
commitSha: runState.manifest.commitSha ?? null,
|
|
5847
|
-
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.
|
|
6088
|
+
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.key] ?? null
|
|
5848
6089
|
});
|
|
5849
6090
|
}
|
|
5850
6091
|
}
|
|
5851
|
-
|
|
6092
|
+
const ambiguousCaseTargets = request.target.caseIds && request.target.caseIds.length > 0 ? findAmbiguousTargetCaseIds(preparedEvals) : [];
|
|
6093
|
+
if (ambiguousCaseTargets.length > 0) {
|
|
6094
|
+
queuedCases.length = 0;
|
|
6095
|
+
evalErrors.push({
|
|
6096
|
+
evalId: "target",
|
|
6097
|
+
details: `Ambiguous --case target. Narrow it with --file and/or --eval: ${ambiguousCaseTargets.join("; ")}`
|
|
6098
|
+
});
|
|
6099
|
+
} else await executeQueuedCases({
|
|
5852
6100
|
queuedCases,
|
|
5853
6101
|
concurrency: getConfiguredConcurrency(),
|
|
5854
6102
|
globalTraceDisplay: config.traceDisplay
|
|
@@ -5863,13 +6111,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5863
6111
|
emitEvent
|
|
5864
6112
|
});
|
|
5865
6113
|
preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
|
|
5866
|
-
lastRunStatusMap.set(preparedEval.evalMeta.
|
|
5867
|
-
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.
|
|
5868
|
-
latestRunInfoMap.set(preparedEval.evalMeta.
|
|
6114
|
+
lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
|
|
6115
|
+
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
|
|
6116
|
+
latestRunInfoMap.set(preparedEval.evalMeta.key, {
|
|
5869
6117
|
status: latestStatus,
|
|
5870
6118
|
startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
|
|
5871
6119
|
commitSha: runState.manifest.commitSha ?? null,
|
|
5872
|
-
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.
|
|
6120
|
+
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.key] ?? null
|
|
5873
6121
|
});
|
|
5874
6122
|
}
|
|
5875
6123
|
sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
|
|
@@ -5882,20 +6130,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5882
6130
|
const completedRunAt = endTime.toISOString();
|
|
5883
6131
|
runState.manifest.endedAt = completedRunAt;
|
|
5884
6132
|
runState.summary.errorMessage = evalErrors.length > 0 ? buildRunErrorMessage(evalErrors) : null;
|
|
5885
|
-
for (const
|
|
6133
|
+
for (const evalKey of getTargetEvalKeys({
|
|
5886
6134
|
request,
|
|
5887
|
-
|
|
5888
|
-
knownEvalIds: new Set(evals.keys())
|
|
6135
|
+
sortedEvals: getSortedEvalMetas()
|
|
5889
6136
|
})) {
|
|
5890
|
-
const latestStatus = lastRunStatusMap.get(
|
|
6137
|
+
const latestStatus = lastRunStatusMap.get(evalKey) ?? toLastRunStatus(deriveStatusFromCaseRows({
|
|
5891
6138
|
caseRows: [],
|
|
5892
6139
|
lifecycleStatus: runState.manifest.status
|
|
5893
6140
|
}));
|
|
5894
|
-
latestRunInfoMap.set(
|
|
6141
|
+
latestRunInfoMap.set(evalKey, {
|
|
5895
6142
|
status: latestStatus,
|
|
5896
6143
|
startedAt: completedRunAt,
|
|
5897
6144
|
commitSha: runState.manifest.commitSha ?? null,
|
|
5898
|
-
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[
|
|
6145
|
+
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalKey] ?? null
|
|
5899
6146
|
});
|
|
5900
6147
|
}
|
|
5901
6148
|
await persistRunState(runState);
|
|
@@ -5938,4 +6185,4 @@ function toLastRunStatus(status) {
|
|
|
5938
6185
|
return status === "pending" ? null : status;
|
|
5939
6186
|
}
|
|
5940
6187
|
//#endregion
|
|
5941
|
-
export {
|
|
6188
|
+
export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getEvalStartTime as An, evalChartTypeSchema as At, runSummarySchema as B, startEvalBackgroundJob as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, advanceEvalTime as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, evalLog as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, evalAssert as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, runInEvalRuntimeScope as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, defineEval as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, runInEvalScope as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInExistingEvalScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, isInEvalScope as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, mergeEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, getCurrentScope as On, evalChartMetricSchema as Ot, getEvalTitle as P, nextEvalId as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, setEvalOutput as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, EvalAssertionError as Sn, scoreTraceSchema as St, createRunRequestSchema as T, configureEvalRunLogs as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, getEvalRegistry as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, repoFile as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, traceDisplayInputConfigSchema 
as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, incrementEvalOutput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, getEvalCaseInput as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, appendToEvalOutput as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, runManifestSchema as 
z, setScopeCacheContext as zn, cacheOperationTypeSchema as zt };
|