@ls-stack/agent-eval 0.58.0 → 0.58.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-L9GdY28I.mjs → app-BxD6aHbp.mjs} +52 -7
- package/dist/apps/web/dist/assets/index-BMWBZw_u.js +377 -0
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-Cf37PZKi.mjs → cli-HBwXIJsg.mjs} +31 -5
- package/dist/index.d.mts +136 -80
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-C4kAOhC1.mjs → runExecution-pHJ0_TzH.mjs} +188 -89
- package/dist/{runOrchestration-5xEiQxiS.mjs → runOrchestration-ngVXShH4.mjs} +73 -6
- package/dist/{runner-JIykMlve.mjs → runner-BnZMGBla.mjs} +1 -1
- package/dist/{runner-bjd_UB9i.mjs → runner-D_pz2NON.mjs} +2 -2
- package/dist/{src-303BocMW.mjs → src-AeXGBJ26.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +18 -3
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
4
|
-
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
|
1
|
+
import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-HBwXIJsg.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-AeXGBJ26.mjs";
|
|
4
|
+
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-
|
|
1
|
+
import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-ngVXShH4.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -851,6 +851,7 @@ const runLogLevelSchema = z.enum([
|
|
|
851
851
|
const runLogPhaseSchema = z.enum([
|
|
852
852
|
"eval",
|
|
853
853
|
"derive",
|
|
854
|
+
"tracingAssertions",
|
|
854
855
|
"outputsSchema",
|
|
855
856
|
"scorer"
|
|
856
857
|
]);
|
|
@@ -1008,6 +1009,9 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
|
|
|
1008
1009
|
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
1009
1010
|
/** Schema for keyed or object-returning trace-derived output config. */
|
|
1010
1011
|
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
1012
|
+
const evalTracingAssertionsFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracing assertions function" });
|
|
1013
|
+
/** Schema for function or keyed trace-derived assertion config. */
|
|
1014
|
+
const evalTracingAssertionsConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" }), z.record(z.string().min(1), evalTracingAssertionsFnSchema)]);
|
|
1011
1015
|
/** Schema for UI overrides on derived or scored columns. */
|
|
1012
1016
|
const evalColumnOverrideSchema = z.object({
|
|
1013
1017
|
label: z.string().optional(),
|
|
@@ -1411,6 +1415,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1411
1415
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
1412
1416
|
columns: evalColumnsSchema.optional(),
|
|
1413
1417
|
deriveFromTracing: evalDeriveConfigSchema.optional(),
|
|
1418
|
+
tracingAssertions: evalTracingAssertionsConfigSchema.optional(),
|
|
1414
1419
|
stats: evalStatsConfigSchema.optional(),
|
|
1415
1420
|
defaultStatAggregate: evalStatAggregateSchema.optional(),
|
|
1416
1421
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
@@ -1847,8 +1852,9 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1847
1852
|
* freshness state.
|
|
1848
1853
|
*/
|
|
1849
1854
|
function getEvalDisplayStatus(params) {
|
|
1850
|
-
const { stale, outdated, lastRunStatus, isRunning = false } = params;
|
|
1855
|
+
const { stale, outdated, lastRunStatus, isRunning = false, isEnqueued = false } = params;
|
|
1851
1856
|
if (isRunning || lastRunStatus === "running") return "running";
|
|
1857
|
+
if (isEnqueued) return "enqueued";
|
|
1852
1858
|
if (lastRunStatus === "pass") {
|
|
1853
1859
|
if (stale) return "stale";
|
|
1854
1860
|
if (outdated) return "outdated";
|
|
@@ -2651,6 +2657,7 @@ const scopeStorage = new AsyncLocalStorage();
|
|
|
2651
2657
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
2652
2658
|
const evalClockStorage = new AsyncLocalStorage();
|
|
2653
2659
|
const activeSpanStackStorage = new AsyncLocalStorage();
|
|
2660
|
+
const recordingStackStorage = new AsyncLocalStorage();
|
|
2654
2661
|
let activeEvalScopeCount = 0;
|
|
2655
2662
|
let activeEvalRuntimeScopeCount = 0;
|
|
2656
2663
|
let consoleCaptureEnabled = true;
|
|
@@ -2717,6 +2724,17 @@ var EvalAssertionError = class extends Error {
|
|
|
2717
2724
|
this.name = "EvalAssertionError";
|
|
2718
2725
|
}
|
|
2719
2726
|
};
|
|
2727
|
+
/** Error thrown when an SDK helper is used in an unsupported runner phase. */
|
|
2728
|
+
var EvalRuntimeUsageError = class extends Error {
|
|
2729
|
+
constructor(message) {
|
|
2730
|
+
super(message);
|
|
2731
|
+
this.name = "EvalRuntimeUsageError";
|
|
2732
|
+
}
|
|
2733
|
+
};
|
|
2734
|
+
/** Throw when assertion helpers are used in a runner phase that forbids them. */
|
|
2735
|
+
function assertEvalAssertionsAllowed(apiName) {
|
|
2736
|
+
if (getCurrentScope() && runtimeScopeStorage.getStore() === "derive") throw new EvalRuntimeUsageError(`${apiName} cannot be used inside deriveFromTracing. Use tracingAssertions for trace-derived assertions.`);
|
|
2737
|
+
}
|
|
2720
2738
|
function getEvalClockStateNowMs(state) {
|
|
2721
2739
|
const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
|
|
2722
2740
|
return state.startMs + elapsedMs + state.offsetMs;
|
|
@@ -2803,13 +2821,29 @@ async function runWithActiveSpan(span, fn) {
|
|
|
2803
2821
|
const currentStack = activeSpanStackStorage.getStore() ?? [];
|
|
2804
2822
|
return await activeSpanStackStorage.run([...currentStack, span], fn);
|
|
2805
2823
|
}
|
|
2824
|
+
/** Execute a callback with a cache recording frame scoped to this async branch. */
|
|
2825
|
+
async function runWithCacheRecordingFrame(frame, fn) {
|
|
2826
|
+
const currentStack = recordingStackStorage.getStore() ?? [];
|
|
2827
|
+
return await recordingStackStorage.run([...currentStack, frame], fn);
|
|
2828
|
+
}
|
|
2829
|
+
function getCurrentCacheRecordingFrame(scope) {
|
|
2830
|
+
if (scope.replayingDepth > 0) return void 0;
|
|
2831
|
+
return recordingStackStorage.getStore()?.at(-1);
|
|
2832
|
+
}
|
|
2833
|
+
/** Mark a span as created by the active cache recorder, when one exists. */
|
|
2834
|
+
function recordSpanForActiveCacheRecording(scope, spanId) {
|
|
2835
|
+
if (scope.replayingDepth > 0) return;
|
|
2836
|
+
for (const frame of recordingStackStorage.getStore() ?? []) frame.spanIds.add(spanId);
|
|
2837
|
+
}
|
|
2806
2838
|
/**
|
|
2807
2839
|
* Return the current eval runner phase for this async execution.
|
|
2808
2840
|
*
|
|
2809
2841
|
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
2810
2842
|
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
2811
|
-
* case `execute`, `derive` while deriving outputs from traces,
|
|
2812
|
-
*
|
|
2843
|
+
* case `execute`, `derive` while deriving outputs from traces,
|
|
2844
|
+
* `tracingAssertions` while checking trace-derived assertions,
|
|
2845
|
+
* `outputsSchema` while validating outputs, and `scorer` while computing
|
|
2846
|
+
* scores.
|
|
2813
2847
|
*/
|
|
2814
2848
|
function isInEvalScope() {
|
|
2815
2849
|
if (activeEvalRuntimeScopeCount === 0) return null;
|
|
@@ -2830,7 +2864,7 @@ function normalizeLogLevel(level) {
|
|
|
2830
2864
|
}
|
|
2831
2865
|
function getCurrentLogPhase() {
|
|
2832
2866
|
const runtimeScope = runtimeScopeStorage.getStore();
|
|
2833
|
-
if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
|
|
2867
|
+
if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "tracingAssertions" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
|
|
2834
2868
|
return null;
|
|
2835
2869
|
}
|
|
2836
2870
|
function formatLogArgs(args) {
|
|
@@ -3110,7 +3144,6 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
3110
3144
|
logs: [],
|
|
3111
3145
|
spans: [],
|
|
3112
3146
|
checkpoints: /* @__PURE__ */ new Map(),
|
|
3113
|
-
recordingStack: [],
|
|
3114
3147
|
replayingDepth: 0,
|
|
3115
3148
|
cacheContext: options.cacheContext,
|
|
3116
3149
|
caseCacheRefs: [],
|
|
@@ -3150,10 +3183,16 @@ function nextEvalId() {
|
|
|
3150
3183
|
scope.nextEvalIdCounter++;
|
|
3151
3184
|
return `${scope.idPrefix}-${scope.nextEvalIdCounter}`;
|
|
3152
3185
|
}
|
|
3153
|
-
function
|
|
3154
|
-
|
|
3155
|
-
|
|
3156
|
-
|
|
3186
|
+
function recordCacheRecordingOpIfActive(scope, op) {
|
|
3187
|
+
getCurrentCacheRecordingFrame(scope)?.ops.push(op);
|
|
3188
|
+
}
|
|
3189
|
+
function recordCacheRecordingAttributesIfActive(scope, span, attributes) {
|
|
3190
|
+
const frames = recordingStackStorage.getStore();
|
|
3191
|
+
if (scope.replayingDepth > 0 || frames === void 0) return;
|
|
3192
|
+
for (const [key, value] of Object.entries(attributes)) {
|
|
3193
|
+
if (key.startsWith("cache.")) continue;
|
|
3194
|
+
for (const frame of frames) if (span.id === frame.replayParentSpanId) frame.finalAttributes[key] = value;
|
|
3195
|
+
}
|
|
3157
3196
|
}
|
|
3158
3197
|
function normalizeEvalOutputOptions(options) {
|
|
3159
3198
|
if (options === void 0) return void 0;
|
|
@@ -3185,7 +3224,7 @@ function setEvalOutput(key, value, options = void 0) {
|
|
|
3185
3224
|
scope.outputs[key] = value;
|
|
3186
3225
|
const column = normalizeEvalOutputOptions(options);
|
|
3187
3226
|
if (column !== void 0) scope.outputColumnOverrides[key] = column;
|
|
3188
|
-
|
|
3227
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3189
3228
|
kind: "setOutput",
|
|
3190
3229
|
key,
|
|
3191
3230
|
value,
|
|
@@ -3205,7 +3244,7 @@ function appendToEvalOutput(key, value) {
|
|
|
3205
3244
|
if (existing === void 0) scope.outputs[key] = [value];
|
|
3206
3245
|
else if (Array.isArray(existing)) scope.outputs[key] = [...copyArray$1(existing), value];
|
|
3207
3246
|
else scope.outputs[key] = [existing, value];
|
|
3208
|
-
|
|
3247
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3209
3248
|
kind: "appendOutput",
|
|
3210
3249
|
key,
|
|
3211
3250
|
value
|
|
@@ -3223,7 +3262,7 @@ function mergeEvalOutput(key, patch) {
|
|
|
3223
3262
|
const existing = scope.outputs[key];
|
|
3224
3263
|
if (existing === void 0) {
|
|
3225
3264
|
scope.outputs[key] = { ...patch };
|
|
3226
|
-
|
|
3265
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3227
3266
|
kind: "mergeOutput",
|
|
3228
3267
|
key,
|
|
3229
3268
|
patch
|
|
@@ -3238,7 +3277,7 @@ function mergeEvalOutput(key, patch) {
|
|
|
3238
3277
|
...existing,
|
|
3239
3278
|
...patch
|
|
3240
3279
|
};
|
|
3241
|
-
|
|
3280
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3242
3281
|
kind: "mergeOutput",
|
|
3243
3282
|
key,
|
|
3244
3283
|
patch
|
|
@@ -3256,7 +3295,7 @@ function incrementEvalOutput(key, delta) {
|
|
|
3256
3295
|
const existing = scope.outputs[key];
|
|
3257
3296
|
if (existing === void 0) {
|
|
3258
3297
|
scope.outputs[key] = delta;
|
|
3259
|
-
|
|
3298
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3260
3299
|
kind: "incrementOutput",
|
|
3261
3300
|
key,
|
|
3262
3301
|
delta
|
|
@@ -3268,7 +3307,7 @@ function incrementEvalOutput(key, delta) {
|
|
|
3268
3307
|
return;
|
|
3269
3308
|
}
|
|
3270
3309
|
scope.outputs[key] = existing + delta;
|
|
3271
|
-
|
|
3310
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3272
3311
|
kind: "incrementOutput",
|
|
3273
3312
|
key,
|
|
3274
3313
|
delta
|
|
@@ -3280,10 +3319,12 @@ function incrementEvalOutput(key, delta) {
|
|
|
3280
3319
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
3281
3320
|
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
3282
3321
|
* TypeScript assertion signature still narrows the checked value after the
|
|
3283
|
-
* call.
|
|
3322
|
+
* call. Calls inside `deriveFromTracing` throw because derivations must only
|
|
3323
|
+
* write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
|
|
3284
3324
|
*/
|
|
3285
3325
|
function evalAssert(condition, message) {
|
|
3286
3326
|
const scope = getCurrentScope();
|
|
3327
|
+
assertEvalAssertionsAllowed("evalAssert(...)");
|
|
3287
3328
|
if (condition) {
|
|
3288
3329
|
if (scope) scope.assertions.push({
|
|
3289
3330
|
message,
|
|
@@ -3434,6 +3475,7 @@ var EvalExpectationImpl = class EvalExpectationImpl {
|
|
|
3434
3475
|
* case scope is active, matching `evalAssert(...)`.
|
|
3435
3476
|
*/
|
|
3436
3477
|
function evalExpect(value) {
|
|
3478
|
+
assertEvalAssertionsAllowed("evalExpect(...)");
|
|
3437
3479
|
return new EvalExpectationImpl(value, false);
|
|
3438
3480
|
}
|
|
3439
3481
|
//#endregion
|
|
@@ -3675,10 +3717,6 @@ async function materializeExternalJsonValues(value, store) {
|
|
|
3675
3717
|
if (!isRecordLike$3(value)) return value;
|
|
3676
3718
|
return Object.fromEntries(await Promise.all(Object.entries(value).map(async ([key, entryValue]) => [key, await materializeExternalJsonValues(entryValue, store)])));
|
|
3677
3719
|
}
|
|
3678
|
-
/** Clone one value through the same serialization path used for cache data. */
|
|
3679
|
-
async function cloneCacheValue(value, options = void 0) {
|
|
3680
|
-
return deserializeCacheValue(await serializeCacheValue(value, options));
|
|
3681
|
-
}
|
|
3682
3720
|
function normalizeCacheSerializationOptions(options) {
|
|
3683
3721
|
return {
|
|
3684
3722
|
compress: options?.compress !== false,
|
|
@@ -4109,29 +4147,6 @@ function valueKind$1(value) {
|
|
|
4109
4147
|
function copyArray(value) {
|
|
4110
4148
|
return value.map((item) => item);
|
|
4111
4149
|
}
|
|
4112
|
-
function stripCacheAttributes(attributes) {
|
|
4113
|
-
if (!attributes) return {};
|
|
4114
|
-
const result = {};
|
|
4115
|
-
for (const [key, value] of Object.entries(attributes)) if (!key.startsWith("cache.")) result[key] = value;
|
|
4116
|
-
return result;
|
|
4117
|
-
}
|
|
4118
|
-
async function snapshotNonCacheAttributes(span) {
|
|
4119
|
-
const snapshot = await cloneCacheValue(stripCacheAttributes(span?.attributes));
|
|
4120
|
-
return isRecordLike$2(snapshot) ? snapshot : {};
|
|
4121
|
-
}
|
|
4122
|
-
function diffNonCacheAttributes(before, after) {
|
|
4123
|
-
const result = {};
|
|
4124
|
-
for (const [key, value] of Object.entries(after)) if (!cacheAttributeValuesEqual(before[key], value)) result[key] = value;
|
|
4125
|
-
return result;
|
|
4126
|
-
}
|
|
4127
|
-
function cacheAttributeValuesEqual(left, right) {
|
|
4128
|
-
if (Object.is(left, right)) return true;
|
|
4129
|
-
try {
|
|
4130
|
-
return JSON.stringify(left) === JSON.stringify(right);
|
|
4131
|
-
} catch {
|
|
4132
|
-
return false;
|
|
4133
|
-
}
|
|
4134
|
-
}
|
|
4135
4150
|
function appendCacheRef(span, ref) {
|
|
4136
4151
|
if (span === void 0) return;
|
|
4137
4152
|
const existing = span.attributes?.["cache.refs"];
|
|
@@ -4150,7 +4165,7 @@ function recordCacheRef(scope, span, ref) {
|
|
|
4150
4165
|
}
|
|
4151
4166
|
scope.caseCacheRefs.push(ref);
|
|
4152
4167
|
}
|
|
4153
|
-
function serializeSubSpanTree(scope, spanId) {
|
|
4168
|
+
function serializeSubSpanTree(scope, spanId, spanIds) {
|
|
4154
4169
|
const original = scope.spans.find((s) => s.id === spanId);
|
|
4155
4170
|
if (!original) return {
|
|
4156
4171
|
kind: "custom",
|
|
@@ -4163,7 +4178,7 @@ function serializeSubSpanTree(scope, spanId) {
|
|
|
4163
4178
|
warnings: void 0,
|
|
4164
4179
|
children: []
|
|
4165
4180
|
};
|
|
4166
|
-
const children = scope.spans.filter((s) => s.parentId === spanId).map((child) => serializeSubSpanTree(scope, child.id));
|
|
4181
|
+
const children = scope.spans.filter((s) => s.parentId === spanId && spanIds.has(s.id)).map((child) => serializeSubSpanTree(scope, child.id, spanIds));
|
|
4167
4182
|
return {
|
|
4168
4183
|
kind: original.kind,
|
|
4169
4184
|
name: original.name,
|
|
@@ -4179,9 +4194,9 @@ function serializeSubSpanTree(scope, spanId) {
|
|
|
4179
4194
|
function appendSubSpanOps(scope, frame) {
|
|
4180
4195
|
for (let i = frame.baseSpanIndex; i < scope.spans.length; i++) {
|
|
4181
4196
|
const candidate = scope.spans[i];
|
|
4182
|
-
if (candidate?.parentId === frame.replayParentSpanId) frame.ops.push({
|
|
4197
|
+
if (candidate?.parentId === frame.replayParentSpanId && frame.spanIds.has(candidate.id)) frame.ops.push({
|
|
4183
4198
|
kind: "subSpan",
|
|
4184
|
-
span: serializeSubSpanTree(scope, candidate.id)
|
|
4199
|
+
span: serializeSubSpanTree(scope, candidate.id, frame.spanIds)
|
|
4185
4200
|
});
|
|
4186
4201
|
}
|
|
4187
4202
|
}
|
|
@@ -4437,25 +4452,21 @@ function createTraceCache(generateSpanId) {
|
|
|
4437
4452
|
key: keyHash,
|
|
4438
4453
|
status: "bypass"
|
|
4439
4454
|
});
|
|
4440
|
-
const beforeAttributes = await snapshotNonCacheAttributes(activeSpan);
|
|
4441
4455
|
const frame = {
|
|
4442
4456
|
baseSpanIndex: scope.spans.length,
|
|
4443
4457
|
replayParentSpanId: activeSpan?.id ?? null,
|
|
4458
|
+
spanIds: /* @__PURE__ */ new Set(),
|
|
4459
|
+
finalAttributes: {},
|
|
4444
4460
|
ops: []
|
|
4445
4461
|
};
|
|
4446
|
-
|
|
4447
|
-
|
|
4448
|
-
|
|
4449
|
-
bodyResult = await fn();
|
|
4450
|
-
} finally {
|
|
4451
|
-
scope.recordingStack.pop();
|
|
4452
|
-
}
|
|
4462
|
+
const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
|
|
4463
|
+
return await fn();
|
|
4464
|
+
});
|
|
4453
4465
|
appendSubSpanOps(scope, frame);
|
|
4454
4466
|
if (canStore) {
|
|
4455
|
-
const finalAttributes = diffNonCacheAttributes(beforeAttributes, await snapshotNonCacheAttributes(activeSpan));
|
|
4456
4467
|
const recording = {
|
|
4457
4468
|
returnValue: bodyResult,
|
|
4458
|
-
finalAttributes,
|
|
4469
|
+
finalAttributes: frame.finalAttributes,
|
|
4459
4470
|
ops: frame.ops
|
|
4460
4471
|
};
|
|
4461
4472
|
await cacheCtx.adapter.write({
|
|
@@ -4514,6 +4525,13 @@ function mergeSpanAttributes(span, attributes) {
|
|
|
4514
4525
|
...span.attributes,
|
|
4515
4526
|
...attributes
|
|
4516
4527
|
};
|
|
4528
|
+
const scope = getCurrentScope();
|
|
4529
|
+
if (scope !== void 0) recordCacheRecordingAttributesIfActive(scope, span, attributes);
|
|
4530
|
+
}
|
|
4531
|
+
function copyNonCacheAttributes(attributes) {
|
|
4532
|
+
const result = {};
|
|
4533
|
+
for (const [key, value] of Object.entries(attributes ?? {})) if (!key.startsWith("cache.")) result[key] = value;
|
|
4534
|
+
return result;
|
|
4517
4535
|
}
|
|
4518
4536
|
function isRecordLike$1(value) {
|
|
4519
4537
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -4688,6 +4706,7 @@ function startExternalSpan(info) {
|
|
|
4688
4706
|
status: "running",
|
|
4689
4707
|
attributes: info.attributes
|
|
4690
4708
|
});
|
|
4709
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
4691
4710
|
return createExternalSpanHandle(id);
|
|
4692
4711
|
}
|
|
4693
4712
|
function updateExternalSpan(info) {
|
|
@@ -4746,6 +4765,7 @@ function recordExternalSpan(info) {
|
|
|
4746
4765
|
warning: info.warning,
|
|
4747
4766
|
warnings: info.warnings
|
|
4748
4767
|
});
|
|
4768
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
4749
4769
|
return id;
|
|
4750
4770
|
}
|
|
4751
4771
|
/**
|
|
@@ -4831,6 +4851,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
4831
4851
|
attributes: info.attributes
|
|
4832
4852
|
};
|
|
4833
4853
|
scope.spans.push(spanRecord);
|
|
4854
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
4834
4855
|
const activeSpan = createSpanHandle(spanRecord);
|
|
4835
4856
|
return await runWithActiveSpan(spanRecord, async () => {
|
|
4836
4857
|
try {
|
|
@@ -4880,21 +4901,19 @@ async function traceSpanInternal(info, fn) {
|
|
|
4880
4901
|
const frame = {
|
|
4881
4902
|
baseSpanIndex: scope.spans.length,
|
|
4882
4903
|
replayParentSpanId: id,
|
|
4904
|
+
spanIds: /* @__PURE__ */ new Set(),
|
|
4905
|
+
finalAttributes: copyNonCacheAttributes(spanRecord.attributes),
|
|
4883
4906
|
ops: []
|
|
4884
4907
|
};
|
|
4885
|
-
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
bodyResult = await fn(activeSpan);
|
|
4889
|
-
} finally {
|
|
4890
|
-
scope.recordingStack.pop();
|
|
4891
|
-
}
|
|
4908
|
+
const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
|
|
4909
|
+
return await fn(activeSpan);
|
|
4910
|
+
});
|
|
4892
4911
|
appendSubSpanOps(scope, frame);
|
|
4893
4912
|
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
4894
4913
|
if (canStore) {
|
|
4895
4914
|
const recording = {
|
|
4896
4915
|
returnValue: bodyResult,
|
|
4897
|
-
finalAttributes:
|
|
4916
|
+
finalAttributes: frame.finalAttributes,
|
|
4898
4917
|
finalStatus: spanRecord.status,
|
|
4899
4918
|
finalError: spanRecord.error,
|
|
4900
4919
|
finalErrors: spanRecord.errors,
|
|
@@ -4998,37 +5017,63 @@ const evalTracer = {
|
|
|
4998
5017
|
status: "ok",
|
|
4999
5018
|
attributes: { value: data }
|
|
5000
5019
|
});
|
|
5001
|
-
|
|
5002
|
-
|
|
5003
|
-
|
|
5004
|
-
|
|
5005
|
-
|
|
5006
|
-
|
|
5007
|
-
});
|
|
5008
|
-
}
|
|
5020
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
5021
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
5022
|
+
kind: "checkpoint",
|
|
5023
|
+
name,
|
|
5024
|
+
data
|
|
5025
|
+
});
|
|
5009
5026
|
}
|
|
5010
5027
|
};
|
|
5011
5028
|
/** Build a queryable trace tree helper from a flat span list and checkpoints. */
|
|
5012
5029
|
function buildTraceTree(spans, checkpoints) {
|
|
5030
|
+
const rootSpans = spans.filter((s) => s.parentId === null);
|
|
5031
|
+
const flattenDfs = () => {
|
|
5032
|
+
const result = [];
|
|
5033
|
+
function visit(parentId) {
|
|
5034
|
+
for (const childSpan of spans) if (childSpan.parentId === parentId) {
|
|
5035
|
+
result.push(childSpan);
|
|
5036
|
+
visit(childSpan.id);
|
|
5037
|
+
}
|
|
5038
|
+
}
|
|
5039
|
+
visit(null);
|
|
5040
|
+
return result;
|
|
5041
|
+
};
|
|
5042
|
+
const filterSpanNames = (sourceSpans, kind) => {
|
|
5043
|
+
return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
|
|
5044
|
+
};
|
|
5013
5045
|
return {
|
|
5014
5046
|
spans,
|
|
5015
|
-
rootSpans
|
|
5047
|
+
rootSpans,
|
|
5016
5048
|
findSpan(name) {
|
|
5017
5049
|
return spans.find((s) => s.name === name);
|
|
5018
5050
|
},
|
|
5051
|
+
findSpans(name) {
|
|
5052
|
+
return spans.filter((s) => s.name === name);
|
|
5053
|
+
},
|
|
5054
|
+
hasSpan(name) {
|
|
5055
|
+
return spans.some((s) => s.name === name);
|
|
5056
|
+
},
|
|
5019
5057
|
findSpansByKind(kind) {
|
|
5020
5058
|
return spans.filter((s) => s.kind === kind);
|
|
5021
5059
|
},
|
|
5060
|
+
findToolCallSpans() {
|
|
5061
|
+
return spans.filter((s) => s.kind === "tool");
|
|
5062
|
+
},
|
|
5063
|
+
listToolCallSpanNames() {
|
|
5064
|
+
return filterSpanNames(spans, "tool");
|
|
5065
|
+
},
|
|
5066
|
+
hasToolCallSpan(name) {
|
|
5067
|
+
return spans.some((s) => s.kind === "tool" && s.name === name);
|
|
5068
|
+
},
|
|
5069
|
+
listSpanNames(kind) {
|
|
5070
|
+
return filterSpanNames(spans, kind);
|
|
5071
|
+
},
|
|
5072
|
+
listSpanNamesDfs(kind) {
|
|
5073
|
+
return filterSpanNames(flattenDfs(), kind);
|
|
5074
|
+
},
|
|
5022
5075
|
flattenDfs() {
|
|
5023
|
-
|
|
5024
|
-
function visit(parentId) {
|
|
5025
|
-
for (const childSpan of spans) if (childSpan.parentId === parentId) {
|
|
5026
|
-
result.push(childSpan);
|
|
5027
|
-
visit(childSpan.id);
|
|
5028
|
-
}
|
|
5029
|
-
}
|
|
5030
|
-
visit(null);
|
|
5031
|
-
return result;
|
|
5076
|
+
return flattenDfs();
|
|
5032
5077
|
},
|
|
5033
5078
|
checkpoints
|
|
5034
5079
|
};
|
|
@@ -6670,7 +6715,7 @@ async function resolveDeriveFromTracingConfig(params) {
|
|
|
6670
6715
|
return derived;
|
|
6671
6716
|
}
|
|
6672
6717
|
async function runDeriveFromTracingConfig(params) {
|
|
6673
|
-
if (params.deriveFromTracing === void 0) return;
|
|
6718
|
+
if (params.deriveFromTracing === void 0) return null;
|
|
6674
6719
|
const { deriveFromTracing } = params;
|
|
6675
6720
|
try {
|
|
6676
6721
|
const derived = await runInExistingEvalScope(params.scope, "derive", async () => await resolveDeriveFromTracingConfig({
|
|
@@ -6682,13 +6727,53 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6682
6727
|
outputs: params.scope.outputs,
|
|
6683
6728
|
derived
|
|
6684
6729
|
});
|
|
6730
|
+
return null;
|
|
6685
6731
|
} catch (e) {
|
|
6732
|
+
if (e instanceof EvalRuntimeUsageError) return e;
|
|
6686
6733
|
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
6687
6734
|
recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
6735
|
+
return null;
|
|
6688
6736
|
}
|
|
6689
6737
|
}
|
|
6738
|
+
async function runOneTracingAssertion(params) {
|
|
6739
|
+
const { label, tracingAssertion, scope, traceTree, evalCase } = params;
|
|
6740
|
+
const failureCountBefore = scope.assertionFailures.length;
|
|
6741
|
+
const ctx = {
|
|
6742
|
+
trace: traceTree,
|
|
6743
|
+
input: evalCase.input,
|
|
6744
|
+
case: evalCase
|
|
6745
|
+
};
|
|
6746
|
+
try {
|
|
6747
|
+
await runInExistingEvalScope(scope, "tracingAssertions", async () => {
|
|
6748
|
+
await callUnknownFunction(tracingAssertion, [ctx]);
|
|
6749
|
+
});
|
|
6750
|
+
} catch (e) {
|
|
6751
|
+
if (e instanceof EvalAssertionError && scope.assertionFailures.length > failureCountBefore) return;
|
|
6752
|
+
recordAssertionFailure(scope, toAssertionFailure(`${label} threw: ${e instanceof Error ? e.message : String(e)}`, e instanceof Error ? e : void 0));
|
|
6753
|
+
}
|
|
6754
|
+
}
|
|
6755
|
+
async function runTracingAssertionsConfig(params) {
|
|
6756
|
+
if (params.tracingAssertions === void 0) return;
|
|
6757
|
+
if (typeof params.tracingAssertions === "function") {
|
|
6758
|
+
await runOneTracingAssertion({
|
|
6759
|
+
label: "tracingAssertions",
|
|
6760
|
+
tracingAssertion: params.tracingAssertions,
|
|
6761
|
+
scope: params.scope,
|
|
6762
|
+
traceTree: params.traceTree,
|
|
6763
|
+
evalCase: params.evalCase
|
|
6764
|
+
});
|
|
6765
|
+
return;
|
|
6766
|
+
}
|
|
6767
|
+
for (const [key, tracingAssertion] of Object.entries(params.tracingAssertions)) await runOneTracingAssertion({
|
|
6768
|
+
label: `tracingAssertions "${key}"`,
|
|
6769
|
+
tracingAssertion,
|
|
6770
|
+
scope: params.scope,
|
|
6771
|
+
traceTree: params.traceTree,
|
|
6772
|
+
evalCase: params.evalCase
|
|
6773
|
+
});
|
|
6774
|
+
}
|
|
6690
6775
|
async function runCase(params) {
|
|
6691
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6776
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, globalTracingAssertions, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6692
6777
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
6693
6778
|
evalId,
|
|
6694
6779
|
evalFilePath,
|
|
@@ -6733,22 +6818,36 @@ async function runCase(params) {
|
|
|
6733
6818
|
apiCallsConfig
|
|
6734
6819
|
});
|
|
6735
6820
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
6736
|
-
|
|
6821
|
+
let nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
6737
6822
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
|
|
6738
6823
|
if (!nonAssertError) {
|
|
6739
|
-
await runDeriveFromTracingConfig({
|
|
6824
|
+
nonAssertError = await runDeriveFromTracingConfig({
|
|
6740
6825
|
deriveFromTracing: globalDeriveFromTracing,
|
|
6741
6826
|
scope,
|
|
6742
6827
|
traceTree,
|
|
6743
6828
|
evalCase
|
|
6744
6829
|
});
|
|
6745
|
-
await runDeriveFromTracingConfig({
|
|
6830
|
+
if (!nonAssertError) nonAssertError = await runDeriveFromTracingConfig({
|
|
6746
6831
|
deriveFromTracing: evalDef.deriveFromTracing,
|
|
6747
6832
|
scope,
|
|
6748
6833
|
traceTree,
|
|
6749
6834
|
evalCase
|
|
6750
6835
|
});
|
|
6751
6836
|
}
|
|
6837
|
+
if (!nonAssertError) {
|
|
6838
|
+
await runTracingAssertionsConfig({
|
|
6839
|
+
tracingAssertions: globalTracingAssertions,
|
|
6840
|
+
scope,
|
|
6841
|
+
traceTree,
|
|
6842
|
+
evalCase
|
|
6843
|
+
});
|
|
6844
|
+
await runTracingAssertionsConfig({
|
|
6845
|
+
tracingAssertions: evalDef.tracingAssertions,
|
|
6846
|
+
scope,
|
|
6847
|
+
traceTree,
|
|
6848
|
+
evalCase
|
|
6849
|
+
});
|
|
6850
|
+
}
|
|
6752
6851
|
if (!nonAssertError) addDefaultOutputs({
|
|
6753
6852
|
outputs: scope.outputs,
|
|
6754
6853
|
spans: spansWithDerivedAttributes,
|
|
@@ -6938,4 +7037,4 @@ function recordAssertionFailure(scope, failure) {
|
|
|
6938
7037
|
});
|
|
6939
7038
|
}
|
|
6940
7039
|
//#endregion
|
|
6941
|
-
export {
|
|
7040
|
+
export { setScopeCacheContext as $, repoFile as A, evalStatsConfigSchema as At, evalTime as B, evalTracer as C, resolveLlmCallsConfig as Ct, deserializeCacheValue as D, caseDetailSchema as Dt, deserializeCacheRecording as E, getCaseRowCaseKey as Et, EvalRuntimeUsageError as F, getEvalRegistry as Ft, matchesEvalTags as G, getEvalCaseInput as H, appendToEvalOutput as I, runWithEvalRegistry as It, runInEvalRuntimeScope as J, mergeEvalOutput as K, configureEvalRunLogs as L, readManualInputFile as M, evalChartsConfigSchema as Mt, evalExpect as N, columnDefSchema as Nt, serializeCacheRecording as O, caseRowSchema as Ot, EvalAssertionError as P, defineEval as Pt, setEvalOutput as Q, evalAssert as R, evalSpan as S, resolveApiCallsConfig as St, hashCacheKeySync as T, buildEvalKey as Tt, incrementEvalOutput as U, getCurrentScope as V, isInEvalScope as W, runInExistingEvalScope as X, runInEvalScope as Y, runWithEvalClock as Z, createBufferedCacheStore as _, matchesTagsFilter as _t, isCaseChildParentMessage as a, extractApiCalls as at, buildTraceTree as b, runManifestSchema as bt, resolveArtifactPath as c, simulateTokenAllocation as ct, loadEvalModule as d, getEvalTitle as dt, startEvalBackgroundJob as et, resolveEvalDefaultConfig as f, getEvalDisplayStatus as ft, commitPendingCacheWrites as g, dedupeEvalTags as gt, normalizeScoreDef as h, deriveStatusFromChildStatuses as ht, isCaseChildMessage as i, extractCacheHits as it, manualInputFileValueSchema as j, manualInputDescriptorSchema as jt, serializeCacheValue as k, evalStatAggregateSchema as kt, registerAgentEvalsPackageResolutionHooks as l, applyDerivedCallAttributes as lt, buildDeclaredColumnDefs as m, deriveStatusFromCaseRows as mt, resolveRunnableEvalCases as n, updateManualScoreRequestSchema as nt, stripTerminalControlCodes as o, extractLlmCalls as ot, loadConfig as p, deriveScopedSummaryFromCases as pt, nextEvalId as q, runCase as r, extractCacheEntries as rt, resolveTracePresentation as s, simulateLlmCallCost as st, filterEvalCases as t, createRunRequestSchema as tt, runWithModuleIsolation as u, getNestedAttribute as ut, createFsCacheStore as v, validateEvalTagName as vt, hashCacheKey as w, buildCaseKey as wt, captureEvalSpanError as x, runSummarySchema as xt, z$1 as y, validateTagsFilterExpression as yt, evalLog as z };
|