@ls-stack/agent-eval 0.57.0 → 0.58.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Db_x-Rit.mjs → app-DhMIbjlE.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +377 -0
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +3 -2
- package/dist/{cli-Ck0mqxd-.mjs → cli-_g2qOMK6.mjs} +7 -6
- package/dist/index.d.mts +54 -10
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +4 -3
- package/dist/{runExecution-BH7DlMXl.mjs → runExecution-d42Lm0i5.mjs} +178 -98
- package/dist/{runOrchestration-C1Ex9QI-.mjs → runOrchestration-CvmFeOmT.mjs} +1 -1
- package/dist/{runner-DbVYcapC.mjs → runner-BKogjiYd.mjs} +1 -1
- package/dist/{runner-B3hEOT_I.mjs → runner-MSr8sAWm.mjs} +2 -2
- package/dist/{src-B3iq-tuv.mjs → src-CdZsOn6y.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +5 -3
- package/dist/apps/web/dist/assets/index-Xa_7PteQ.css +0 -1
- package/dist/apps/web/dist/assets/index-o4o2EktS.js +0 -377
|
@@ -289,7 +289,8 @@ z.object({
|
|
|
289
289
|
key: z.string(),
|
|
290
290
|
namespace: z.string(),
|
|
291
291
|
storedAt: z.string(),
|
|
292
|
-
|
|
292
|
+
/** Last successful cache hit time. `null` means the entry has not been hit yet. */
|
|
293
|
+
lastAccessedAt: z.string().nullable()
|
|
293
294
|
});
|
|
294
295
|
z.object({
|
|
295
296
|
removedCacheFiles: z.number(),
|
|
@@ -814,8 +815,7 @@ const caseRowSchema = z.object({
|
|
|
814
815
|
/** Winning trial index for the persisted case result. */
|
|
815
816
|
trial: z.number()
|
|
816
817
|
});
|
|
817
|
-
|
|
818
|
-
const assertionFailureSchema = z.object({
|
|
818
|
+
const assertionBaseSchema = z.object({
|
|
819
819
|
/**
|
|
820
820
|
* Error class or category label rendered alongside the message (e.g.
|
|
821
821
|
* `EvalAssertionError`, `OutputsSchemaError`). Optional for legacy entries
|
|
@@ -827,7 +827,19 @@ const assertionFailureSchema = z.object({
|
|
|
827
827
|
/** Stack trace captured from the originating error when available. */
|
|
828
828
|
stack: z.string().optional()
|
|
829
829
|
});
|
|
830
|
+
/** Structured assertion failure metadata captured for one case run. */
|
|
831
|
+
const assertionFailureSchema = assertionBaseSchema;
|
|
832
|
+
/** Pass/fail outcome for one recorded eval assertion. */
|
|
833
|
+
const assertionStatusSchema = z.enum(["pass", "fail"]);
|
|
834
|
+
/** Structured assertion result metadata captured for one case run. */
|
|
835
|
+
const assertionResultSchema = assertionBaseSchema.extend({
|
|
836
|
+
/** Whether the recorded assertion passed or failed. */
|
|
837
|
+
status: assertionStatusSchema });
|
|
830
838
|
const legacyAssertionFailureSchema = z.string().transform((message) => ({ message }));
|
|
839
|
+
const legacyAssertionResultSchema = z.string().transform((message) => ({
|
|
840
|
+
message,
|
|
841
|
+
status: "fail"
|
|
842
|
+
}));
|
|
831
843
|
/** Severity level for one log captured during a case run. */
|
|
832
844
|
const runLogLevelSchema = z.enum([
|
|
833
845
|
"log",
|
|
@@ -922,6 +934,12 @@ const caseDetailSchema = z.object({
|
|
|
922
934
|
* These complement eval-level `columns` without changing discovery metadata.
|
|
923
935
|
*/
|
|
924
936
|
outputColumnDefs: z.array(columnDefSchema).optional(),
|
|
937
|
+
/**
|
|
938
|
+
* Pass/fail assertion records captured from eval assertion helpers. New run
|
|
939
|
+
* artifacts include this alongside `assertionFailures`; older artifacts may
|
|
940
|
+
* omit it and should fall back to `assertionFailures` for failed outcomes.
|
|
941
|
+
*/
|
|
942
|
+
assertions: z.array(z.union([assertionResultSchema, legacyAssertionResultSchema])).optional(),
|
|
925
943
|
assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
|
|
926
944
|
/** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
|
|
927
945
|
logs: z.array(runLogEntrySchema).default([]),
|
|
@@ -1405,6 +1423,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1405
1423
|
maxEntriesPerNamespace: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1406
1424
|
maxEntriesByNamespace: z.record(z.string(), z.number()).optional(),
|
|
1407
1425
|
pruneIdleDelayMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1426
|
+
lastAccessedAtUpdateIntervalMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1408
1427
|
maxEntriesPerEval: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional())
|
|
1409
1428
|
}).optional()
|
|
1410
1429
|
});
|
|
@@ -2632,6 +2651,7 @@ const scopeStorage = new AsyncLocalStorage();
|
|
|
2632
2651
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
2633
2652
|
const evalClockStorage = new AsyncLocalStorage();
|
|
2634
2653
|
const activeSpanStackStorage = new AsyncLocalStorage();
|
|
2654
|
+
const recordingStackStorage = new AsyncLocalStorage();
|
|
2635
2655
|
let activeEvalScopeCount = 0;
|
|
2636
2656
|
let activeEvalRuntimeScopeCount = 0;
|
|
2637
2657
|
let consoleCaptureEnabled = true;
|
|
@@ -2784,6 +2804,20 @@ async function runWithActiveSpan(span, fn) {
|
|
|
2784
2804
|
const currentStack = activeSpanStackStorage.getStore() ?? [];
|
|
2785
2805
|
return await activeSpanStackStorage.run([...currentStack, span], fn);
|
|
2786
2806
|
}
|
|
2807
|
+
/** Execute a callback with a cache recording frame scoped to this async branch. */
|
|
2808
|
+
async function runWithCacheRecordingFrame(frame, fn) {
|
|
2809
|
+
const currentStack = recordingStackStorage.getStore() ?? [];
|
|
2810
|
+
return await recordingStackStorage.run([...currentStack, frame], fn);
|
|
2811
|
+
}
|
|
2812
|
+
function getCurrentCacheRecordingFrame(scope) {
|
|
2813
|
+
if (scope.replayingDepth > 0) return void 0;
|
|
2814
|
+
return recordingStackStorage.getStore()?.at(-1);
|
|
2815
|
+
}
|
|
2816
|
+
/** Mark a span as created by the active cache recorder, when one exists. */
|
|
2817
|
+
function recordSpanForActiveCacheRecording(scope, spanId) {
|
|
2818
|
+
if (scope.replayingDepth > 0) return;
|
|
2819
|
+
for (const frame of recordingStackStorage.getStore() ?? []) frame.spanIds.add(spanId);
|
|
2820
|
+
}
|
|
2787
2821
|
/**
|
|
2788
2822
|
* Return the current eval runner phase for this async execution.
|
|
2789
2823
|
*
|
|
@@ -3086,11 +3120,11 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
3086
3120
|
tags: options.tags ?? [],
|
|
3087
3121
|
outputs: {},
|
|
3088
3122
|
outputColumnOverrides: {},
|
|
3123
|
+
assertions: [],
|
|
3089
3124
|
assertionFailures: [],
|
|
3090
3125
|
logs: [],
|
|
3091
3126
|
spans: [],
|
|
3092
3127
|
checkpoints: /* @__PURE__ */ new Map(),
|
|
3093
|
-
recordingStack: [],
|
|
3094
3128
|
replayingDepth: 0,
|
|
3095
3129
|
cacheContext: options.cacheContext,
|
|
3096
3130
|
caseCacheRefs: [],
|
|
@@ -3130,10 +3164,16 @@ function nextEvalId() {
|
|
|
3130
3164
|
scope.nextEvalIdCounter++;
|
|
3131
3165
|
return `${scope.idPrefix}-${scope.nextEvalIdCounter}`;
|
|
3132
3166
|
}
|
|
3133
|
-
function
|
|
3134
|
-
|
|
3135
|
-
|
|
3136
|
-
|
|
3167
|
+
function recordCacheRecordingOpIfActive(scope, op) {
|
|
3168
|
+
getCurrentCacheRecordingFrame(scope)?.ops.push(op);
|
|
3169
|
+
}
|
|
3170
|
+
function recordCacheRecordingAttributesIfActive(scope, span, attributes) {
|
|
3171
|
+
const frames = recordingStackStorage.getStore();
|
|
3172
|
+
if (scope.replayingDepth > 0 || frames === void 0) return;
|
|
3173
|
+
for (const [key, value] of Object.entries(attributes)) {
|
|
3174
|
+
if (key.startsWith("cache.")) continue;
|
|
3175
|
+
for (const frame of frames) if (span.id === frame.replayParentSpanId) frame.finalAttributes[key] = value;
|
|
3176
|
+
}
|
|
3137
3177
|
}
|
|
3138
3178
|
function normalizeEvalOutputOptions(options) {
|
|
3139
3179
|
if (options === void 0) return void 0;
|
|
@@ -3165,7 +3205,7 @@ function setEvalOutput(key, value, options = void 0) {
|
|
|
3165
3205
|
scope.outputs[key] = value;
|
|
3166
3206
|
const column = normalizeEvalOutputOptions(options);
|
|
3167
3207
|
if (column !== void 0) scope.outputColumnOverrides[key] = column;
|
|
3168
|
-
|
|
3208
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3169
3209
|
kind: "setOutput",
|
|
3170
3210
|
key,
|
|
3171
3211
|
value,
|
|
@@ -3185,7 +3225,7 @@ function appendToEvalOutput(key, value) {
|
|
|
3185
3225
|
if (existing === void 0) scope.outputs[key] = [value];
|
|
3186
3226
|
else if (Array.isArray(existing)) scope.outputs[key] = [...copyArray$1(existing), value];
|
|
3187
3227
|
else scope.outputs[key] = [existing, value];
|
|
3188
|
-
|
|
3228
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3189
3229
|
kind: "appendOutput",
|
|
3190
3230
|
key,
|
|
3191
3231
|
value
|
|
@@ -3203,7 +3243,7 @@ function mergeEvalOutput(key, patch) {
|
|
|
3203
3243
|
const existing = scope.outputs[key];
|
|
3204
3244
|
if (existing === void 0) {
|
|
3205
3245
|
scope.outputs[key] = { ...patch };
|
|
3206
|
-
|
|
3246
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3207
3247
|
kind: "mergeOutput",
|
|
3208
3248
|
key,
|
|
3209
3249
|
patch
|
|
@@ -3211,14 +3251,14 @@ function mergeEvalOutput(key, patch) {
|
|
|
3211
3251
|
return;
|
|
3212
3252
|
}
|
|
3213
3253
|
if (!isObjectRecord(existing)) {
|
|
3214
|
-
scope
|
|
3254
|
+
recordAssertionFailure$1(scope, toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${Array.isArray(existing) ? "array" : typeof existing}, expected object`));
|
|
3215
3255
|
return;
|
|
3216
3256
|
}
|
|
3217
3257
|
scope.outputs[key] = {
|
|
3218
3258
|
...existing,
|
|
3219
3259
|
...patch
|
|
3220
3260
|
};
|
|
3221
|
-
|
|
3261
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3222
3262
|
kind: "mergeOutput",
|
|
3223
3263
|
key,
|
|
3224
3264
|
patch
|
|
@@ -3236,7 +3276,7 @@ function incrementEvalOutput(key, delta) {
|
|
|
3236
3276
|
const existing = scope.outputs[key];
|
|
3237
3277
|
if (existing === void 0) {
|
|
3238
3278
|
scope.outputs[key] = delta;
|
|
3239
|
-
|
|
3279
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3240
3280
|
kind: "incrementOutput",
|
|
3241
3281
|
key,
|
|
3242
3282
|
delta
|
|
@@ -3244,11 +3284,11 @@ function incrementEvalOutput(key, delta) {
|
|
|
3244
3284
|
return;
|
|
3245
3285
|
}
|
|
3246
3286
|
if (typeof existing !== "number") {
|
|
3247
|
-
scope
|
|
3287
|
+
recordAssertionFailure$1(scope, toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof existing}, expected number`));
|
|
3248
3288
|
return;
|
|
3249
3289
|
}
|
|
3250
3290
|
scope.outputs[key] = existing + delta;
|
|
3251
|
-
|
|
3291
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
3252
3292
|
kind: "incrementOutput",
|
|
3253
3293
|
key,
|
|
3254
3294
|
delta
|
|
@@ -3263,13 +3303,26 @@ function incrementEvalOutput(key, delta) {
|
|
|
3263
3303
|
* call.
|
|
3264
3304
|
*/
|
|
3265
3305
|
function evalAssert(condition, message) {
|
|
3266
|
-
if (condition) return;
|
|
3267
3306
|
const scope = getCurrentScope();
|
|
3307
|
+
if (condition) {
|
|
3308
|
+
if (scope) scope.assertions.push({
|
|
3309
|
+
message,
|
|
3310
|
+
status: "pass"
|
|
3311
|
+
});
|
|
3312
|
+
return;
|
|
3313
|
+
}
|
|
3268
3314
|
if (!scope) return;
|
|
3269
3315
|
const error = new EvalAssertionError(message);
|
|
3270
|
-
scope
|
|
3316
|
+
recordAssertionFailure$1(scope, toAssertionFailure$1(message, error));
|
|
3271
3317
|
throw error;
|
|
3272
3318
|
}
|
|
3319
|
+
function recordAssertionFailure$1(scope, failure) {
|
|
3320
|
+
scope.assertionFailures.push(failure);
|
|
3321
|
+
scope.assertions.push({
|
|
3322
|
+
...failure,
|
|
3323
|
+
status: "fail"
|
|
3324
|
+
});
|
|
3325
|
+
}
|
|
3273
3326
|
//#endregion
|
|
3274
3327
|
//#region ../sdk/src/evalExpect.ts
|
|
3275
3328
|
const expectFormatOptions = {
|
|
@@ -3642,10 +3695,6 @@ async function materializeExternalJsonValues(value, store) {
|
|
|
3642
3695
|
if (!isRecordLike$3(value)) return value;
|
|
3643
3696
|
return Object.fromEntries(await Promise.all(Object.entries(value).map(async ([key, entryValue]) => [key, await materializeExternalJsonValues(entryValue, store)])));
|
|
3644
3697
|
}
|
|
3645
|
-
/** Clone one value through the same serialization path used for cache data. */
|
|
3646
|
-
async function cloneCacheValue(value, options = void 0) {
|
|
3647
|
-
return deserializeCacheValue(await serializeCacheValue(value, options));
|
|
3648
|
-
}
|
|
3649
3698
|
function normalizeCacheSerializationOptions(options) {
|
|
3650
3699
|
return {
|
|
3651
3700
|
compress: options?.compress !== false,
|
|
@@ -4076,29 +4125,6 @@ function valueKind$1(value) {
|
|
|
4076
4125
|
function copyArray(value) {
|
|
4077
4126
|
return value.map((item) => item);
|
|
4078
4127
|
}
|
|
4079
|
-
function stripCacheAttributes(attributes) {
|
|
4080
|
-
if (!attributes) return {};
|
|
4081
|
-
const result = {};
|
|
4082
|
-
for (const [key, value] of Object.entries(attributes)) if (!key.startsWith("cache.")) result[key] = value;
|
|
4083
|
-
return result;
|
|
4084
|
-
}
|
|
4085
|
-
async function snapshotNonCacheAttributes(span) {
|
|
4086
|
-
const snapshot = await cloneCacheValue(stripCacheAttributes(span?.attributes));
|
|
4087
|
-
return isRecordLike$2(snapshot) ? snapshot : {};
|
|
4088
|
-
}
|
|
4089
|
-
function diffNonCacheAttributes(before, after) {
|
|
4090
|
-
const result = {};
|
|
4091
|
-
for (const [key, value] of Object.entries(after)) if (!cacheAttributeValuesEqual(before[key], value)) result[key] = value;
|
|
4092
|
-
return result;
|
|
4093
|
-
}
|
|
4094
|
-
function cacheAttributeValuesEqual(left, right) {
|
|
4095
|
-
if (Object.is(left, right)) return true;
|
|
4096
|
-
try {
|
|
4097
|
-
return JSON.stringify(left) === JSON.stringify(right);
|
|
4098
|
-
} catch {
|
|
4099
|
-
return false;
|
|
4100
|
-
}
|
|
4101
|
-
}
|
|
4102
4128
|
function appendCacheRef(span, ref) {
|
|
4103
4129
|
if (span === void 0) return;
|
|
4104
4130
|
const existing = span.attributes?.["cache.refs"];
|
|
@@ -4117,7 +4143,7 @@ function recordCacheRef(scope, span, ref) {
|
|
|
4117
4143
|
}
|
|
4118
4144
|
scope.caseCacheRefs.push(ref);
|
|
4119
4145
|
}
|
|
4120
|
-
function serializeSubSpanTree(scope, spanId) {
|
|
4146
|
+
function serializeSubSpanTree(scope, spanId, spanIds) {
|
|
4121
4147
|
const original = scope.spans.find((s) => s.id === spanId);
|
|
4122
4148
|
if (!original) return {
|
|
4123
4149
|
kind: "custom",
|
|
@@ -4130,7 +4156,7 @@ function serializeSubSpanTree(scope, spanId) {
|
|
|
4130
4156
|
warnings: void 0,
|
|
4131
4157
|
children: []
|
|
4132
4158
|
};
|
|
4133
|
-
const children = scope.spans.filter((s) => s.parentId === spanId).map((child) => serializeSubSpanTree(scope, child.id));
|
|
4159
|
+
const children = scope.spans.filter((s) => s.parentId === spanId && spanIds.has(s.id)).map((child) => serializeSubSpanTree(scope, child.id, spanIds));
|
|
4134
4160
|
return {
|
|
4135
4161
|
kind: original.kind,
|
|
4136
4162
|
name: original.name,
|
|
@@ -4146,9 +4172,9 @@ function serializeSubSpanTree(scope, spanId) {
|
|
|
4146
4172
|
function appendSubSpanOps(scope, frame) {
|
|
4147
4173
|
for (let i = frame.baseSpanIndex; i < scope.spans.length; i++) {
|
|
4148
4174
|
const candidate = scope.spans[i];
|
|
4149
|
-
if (candidate?.parentId === frame.replayParentSpanId) frame.ops.push({
|
|
4175
|
+
if (candidate?.parentId === frame.replayParentSpanId && frame.spanIds.has(candidate.id)) frame.ops.push({
|
|
4150
4176
|
kind: "subSpan",
|
|
4151
|
-
span: serializeSubSpanTree(scope, candidate.id)
|
|
4177
|
+
span: serializeSubSpanTree(scope, candidate.id, frame.spanIds)
|
|
4152
4178
|
});
|
|
4153
4179
|
}
|
|
4154
4180
|
}
|
|
@@ -4185,14 +4211,28 @@ function applyRecordingOp(scope, parentSpan, op, options) {
|
|
|
4185
4211
|
...existing,
|
|
4186
4212
|
...op.patch
|
|
4187
4213
|
};
|
|
4188
|
-
else
|
|
4214
|
+
else {
|
|
4215
|
+
const message = `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected object`;
|
|
4216
|
+
scope.assertionFailures.push({ message });
|
|
4217
|
+
scope.assertions.push({
|
|
4218
|
+
message,
|
|
4219
|
+
status: "fail"
|
|
4220
|
+
});
|
|
4221
|
+
}
|
|
4189
4222
|
return;
|
|
4190
4223
|
}
|
|
4191
4224
|
if (op.kind === "incrementOutput") {
|
|
4192
4225
|
const existing = scope.outputs[op.key];
|
|
4193
4226
|
if (existing === void 0) scope.outputs[op.key] = op.delta;
|
|
4194
4227
|
else if (typeof existing === "number") scope.outputs[op.key] = existing + op.delta;
|
|
4195
|
-
else
|
|
4228
|
+
else {
|
|
4229
|
+
const message = `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected number`;
|
|
4230
|
+
scope.assertionFailures.push({ message });
|
|
4231
|
+
scope.assertions.push({
|
|
4232
|
+
message,
|
|
4233
|
+
status: "fail"
|
|
4234
|
+
});
|
|
4235
|
+
}
|
|
4196
4236
|
return;
|
|
4197
4237
|
}
|
|
4198
4238
|
if (op.kind === "checkpoint") {
|
|
@@ -4390,25 +4430,21 @@ function createTraceCache(generateSpanId) {
|
|
|
4390
4430
|
key: keyHash,
|
|
4391
4431
|
status: "bypass"
|
|
4392
4432
|
});
|
|
4393
|
-
const beforeAttributes = await snapshotNonCacheAttributes(activeSpan);
|
|
4394
4433
|
const frame = {
|
|
4395
4434
|
baseSpanIndex: scope.spans.length,
|
|
4396
4435
|
replayParentSpanId: activeSpan?.id ?? null,
|
|
4436
|
+
spanIds: /* @__PURE__ */ new Set(),
|
|
4437
|
+
finalAttributes: {},
|
|
4397
4438
|
ops: []
|
|
4398
4439
|
};
|
|
4399
|
-
|
|
4400
|
-
|
|
4401
|
-
|
|
4402
|
-
bodyResult = await fn();
|
|
4403
|
-
} finally {
|
|
4404
|
-
scope.recordingStack.pop();
|
|
4405
|
-
}
|
|
4440
|
+
const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
|
|
4441
|
+
return await fn();
|
|
4442
|
+
});
|
|
4406
4443
|
appendSubSpanOps(scope, frame);
|
|
4407
4444
|
if (canStore) {
|
|
4408
|
-
const finalAttributes = diffNonCacheAttributes(beforeAttributes, await snapshotNonCacheAttributes(activeSpan));
|
|
4409
4445
|
const recording = {
|
|
4410
4446
|
returnValue: bodyResult,
|
|
4411
|
-
finalAttributes,
|
|
4447
|
+
finalAttributes: frame.finalAttributes,
|
|
4412
4448
|
ops: frame.ops
|
|
4413
4449
|
};
|
|
4414
4450
|
await cacheCtx.adapter.write({
|
|
@@ -4467,6 +4503,13 @@ function mergeSpanAttributes(span, attributes) {
|
|
|
4467
4503
|
...span.attributes,
|
|
4468
4504
|
...attributes
|
|
4469
4505
|
};
|
|
4506
|
+
const scope = getCurrentScope();
|
|
4507
|
+
if (scope !== void 0) recordCacheRecordingAttributesIfActive(scope, span, attributes);
|
|
4508
|
+
}
|
|
4509
|
+
function copyNonCacheAttributes(attributes) {
|
|
4510
|
+
const result = {};
|
|
4511
|
+
for (const [key, value] of Object.entries(attributes ?? {})) if (!key.startsWith("cache.")) result[key] = value;
|
|
4512
|
+
return result;
|
|
4470
4513
|
}
|
|
4471
4514
|
function isRecordLike$1(value) {
|
|
4472
4515
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -4478,6 +4521,10 @@ function recordSpanAttributeAssertion(message) {
|
|
|
4478
4521
|
const scope = getCurrentScope();
|
|
4479
4522
|
if (!scope) return;
|
|
4480
4523
|
scope.assertionFailures.push({ message });
|
|
4524
|
+
scope.assertions.push({
|
|
4525
|
+
message,
|
|
4526
|
+
status: "fail"
|
|
4527
|
+
});
|
|
4481
4528
|
}
|
|
4482
4529
|
function incrementSpanAttribute(span, key, delta) {
|
|
4483
4530
|
const existing = span.attributes?.[key];
|
|
@@ -4637,6 +4684,7 @@ function startExternalSpan(info) {
|
|
|
4637
4684
|
status: "running",
|
|
4638
4685
|
attributes: info.attributes
|
|
4639
4686
|
});
|
|
4687
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
4640
4688
|
return createExternalSpanHandle(id);
|
|
4641
4689
|
}
|
|
4642
4690
|
function updateExternalSpan(info) {
|
|
@@ -4695,6 +4743,7 @@ function recordExternalSpan(info) {
|
|
|
4695
4743
|
warning: info.warning,
|
|
4696
4744
|
warnings: info.warnings
|
|
4697
4745
|
});
|
|
4746
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
4698
4747
|
return id;
|
|
4699
4748
|
}
|
|
4700
4749
|
/**
|
|
@@ -4780,6 +4829,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
4780
4829
|
attributes: info.attributes
|
|
4781
4830
|
};
|
|
4782
4831
|
scope.spans.push(spanRecord);
|
|
4832
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
4783
4833
|
const activeSpan = createSpanHandle(spanRecord);
|
|
4784
4834
|
return await runWithActiveSpan(spanRecord, async () => {
|
|
4785
4835
|
try {
|
|
@@ -4829,21 +4879,19 @@ async function traceSpanInternal(info, fn) {
|
|
|
4829
4879
|
const frame = {
|
|
4830
4880
|
baseSpanIndex: scope.spans.length,
|
|
4831
4881
|
replayParentSpanId: id,
|
|
4882
|
+
spanIds: /* @__PURE__ */ new Set(),
|
|
4883
|
+
finalAttributes: copyNonCacheAttributes(spanRecord.attributes),
|
|
4832
4884
|
ops: []
|
|
4833
4885
|
};
|
|
4834
|
-
|
|
4835
|
-
|
|
4836
|
-
|
|
4837
|
-
bodyResult = await fn(activeSpan);
|
|
4838
|
-
} finally {
|
|
4839
|
-
scope.recordingStack.pop();
|
|
4840
|
-
}
|
|
4886
|
+
const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
|
|
4887
|
+
return await fn(activeSpan);
|
|
4888
|
+
});
|
|
4841
4889
|
appendSubSpanOps(scope, frame);
|
|
4842
4890
|
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
4843
4891
|
if (canStore) {
|
|
4844
4892
|
const recording = {
|
|
4845
4893
|
returnValue: bodyResult,
|
|
4846
|
-
finalAttributes:
|
|
4894
|
+
finalAttributes: frame.finalAttributes,
|
|
4847
4895
|
finalStatus: spanRecord.status,
|
|
4848
4896
|
finalError: spanRecord.error,
|
|
4849
4897
|
finalErrors: spanRecord.errors,
|
|
@@ -4947,14 +4995,12 @@ const evalTracer = {
|
|
|
4947
4995
|
status: "ok",
|
|
4948
4996
|
attributes: { value: data }
|
|
4949
4997
|
});
|
|
4950
|
-
|
|
4951
|
-
|
|
4952
|
-
|
|
4953
|
-
|
|
4954
|
-
|
|
4955
|
-
|
|
4956
|
-
});
|
|
4957
|
-
}
|
|
4998
|
+
recordSpanForActiveCacheRecording(scope, id);
|
|
4999
|
+
recordCacheRecordingOpIfActive(scope, {
|
|
5000
|
+
kind: "checkpoint",
|
|
5001
|
+
name,
|
|
5002
|
+
data
|
|
5003
|
+
});
|
|
4958
5004
|
}
|
|
4959
5005
|
};
|
|
4960
5006
|
/** Build a queryable trace tree helper from a flat span list and checkpoints. */
|
|
@@ -4983,6 +5029,24 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
4983
5029
|
};
|
|
4984
5030
|
}
|
|
4985
5031
|
//#endregion
|
|
5032
|
+
//#region ../runner/src/cacheAccessTime.ts
|
|
5033
|
+
const defaultLastAccessedAtUpdateIntervalMs = 14400 * 1e3;
|
|
5034
|
+
function normalizeLastAccessedAtUpdateIntervalMs(value) {
|
|
5035
|
+
if (value === void 0 || !Number.isFinite(value) || value < 0) return defaultLastAccessedAtUpdateIntervalMs;
|
|
5036
|
+
return Math.floor(value);
|
|
5037
|
+
}
|
|
5038
|
+
function cacheAccessSortTime(entry) {
|
|
5039
|
+
return entry.lastAccessedAt ?? entry.storedAt;
|
|
5040
|
+
}
|
|
5041
|
+
function shouldRefreshLastAccessedAt(params) {
|
|
5042
|
+
return params.lastAccessedAt === null || params.nowMs - Date.parse(params.lastAccessedAt) > params.updateIntervalMs;
|
|
5043
|
+
}
|
|
5044
|
+
//#endregion
|
|
5045
|
+
//#region ../runner/src/cacheKeys.ts
|
|
5046
|
+
function toPendingKey(namespace, keyHash) {
|
|
5047
|
+
return `${namespace}::${keyHash}`;
|
|
5048
|
+
}
|
|
5049
|
+
//#endregion
|
|
4986
5050
|
//#region ../runner/src/cacheStore.ts
|
|
4987
5051
|
const defaultMaxEntriesPerNamespace = 100;
|
|
4988
5052
|
const cacheSerializationMarker = "__aecs";
|
|
@@ -5015,6 +5079,7 @@ function createFsCacheStore(options) {
|
|
|
5015
5079
|
primaryDir: blobDir
|
|
5016
5080
|
});
|
|
5017
5081
|
const defaultMaxEntries = normalizeMaxEntries(options.maxEntriesPerNamespace);
|
|
5082
|
+
const lastAccessedAtUpdateIntervalMs = normalizeLastAccessedAtUpdateIntervalMs(options.lastAccessedAtUpdateIntervalMs);
|
|
5018
5083
|
return {
|
|
5019
5084
|
externalJsonStore,
|
|
5020
5085
|
dir() {
|
|
@@ -5034,7 +5099,12 @@ function createFsCacheStore(options) {
|
|
|
5034
5099
|
});
|
|
5035
5100
|
if (entry === null) return null;
|
|
5036
5101
|
const materialized = await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
|
|
5037
|
-
if (materialized !== null) await updateCacheIndexLastAccessedAt(
|
|
5102
|
+
if (materialized !== null) await updateCacheIndexLastAccessedAt({
|
|
5103
|
+
cacheDir,
|
|
5104
|
+
key: keyHash,
|
|
5105
|
+
namespace,
|
|
5106
|
+
updateIntervalMs: lastAccessedAtUpdateIntervalMs
|
|
5107
|
+
});
|
|
5038
5108
|
return materialized;
|
|
5039
5109
|
},
|
|
5040
5110
|
async lookupWithDebug(namespace, keyHash) {
|
|
@@ -5063,7 +5133,7 @@ function createFsCacheStore(options) {
|
|
|
5063
5133
|
const index = await readNamespaceIndex(cacheDir, entry.namespace);
|
|
5064
5134
|
index.entries[entry.key] = {
|
|
5065
5135
|
storedAt: entry.storedAt,
|
|
5066
|
-
lastAccessedAt:
|
|
5136
|
+
lastAccessedAt: null,
|
|
5067
5137
|
blobRefs: await collectExternalJsonBlobRefs(entry, blobDirs)
|
|
5068
5138
|
};
|
|
5069
5139
|
await writeNamespaceIndex(cacheDir, index);
|
|
@@ -5082,7 +5152,7 @@ function createFsCacheStore(options) {
|
|
|
5082
5152
|
async list() {
|
|
5083
5153
|
const items = [];
|
|
5084
5154
|
for (const index of await listCacheIndexes(cacheDir)) for (const [key, entry] of Object.entries(index.entries)) items.push(toCacheListItem(index.namespace, key, entry));
|
|
5085
|
-
items.sort((a, b) => a
|
|
5155
|
+
items.sort((a, b) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
|
|
5086
5156
|
return items;
|
|
5087
5157
|
},
|
|
5088
5158
|
async clear(filter) {
|
|
@@ -5183,9 +5253,6 @@ function maxEntriesForNamespace(namespace, defaultMaxEntries, maxEntriesByNamesp
|
|
|
5183
5253
|
const namespaceMaxEntries = maxEntriesByNamespace?.[namespace];
|
|
5184
5254
|
return namespaceMaxEntries === void 0 ? defaultMaxEntries : normalizeMaxEntries(namespaceMaxEntries, defaultMaxEntries);
|
|
5185
5255
|
}
|
|
5186
|
-
function toPendingKey(namespace, keyHash) {
|
|
5187
|
-
return `${namespace}::${keyHash}`;
|
|
5188
|
-
}
|
|
5189
5256
|
function sanitizeSegment$1(segment) {
|
|
5190
5257
|
return segment.replace(/[^a-zA-Z0-9_.-]/g, "_");
|
|
5191
5258
|
}
|
|
@@ -5231,16 +5298,22 @@ async function readIndexedCacheEntry(params) {
|
|
|
5231
5298
|
return fileEntry.entry;
|
|
5232
5299
|
});
|
|
5233
5300
|
}
|
|
5234
|
-
async function updateCacheIndexLastAccessedAt(
|
|
5235
|
-
await withCacheFileLock(namespaceLockPath(cacheDir, namespace), async () => {
|
|
5236
|
-
const index = await readNamespaceIndex(cacheDir, namespace);
|
|
5237
|
-
const entry = index.entries[key];
|
|
5301
|
+
async function updateCacheIndexLastAccessedAt(params) {
|
|
5302
|
+
await withCacheFileLock(namespaceLockPath(params.cacheDir, params.namespace), async () => {
|
|
5303
|
+
const index = await readNamespaceIndex(params.cacheDir, params.namespace);
|
|
5304
|
+
const entry = index.entries[params.key];
|
|
5238
5305
|
if (entry === void 0) return;
|
|
5239
|
-
|
|
5306
|
+
const nowMs = getRealDateNowMs();
|
|
5307
|
+
if (!shouldRefreshLastAccessedAt({
|
|
5308
|
+
lastAccessedAt: entry.lastAccessedAt,
|
|
5309
|
+
nowMs,
|
|
5310
|
+
updateIntervalMs: params.updateIntervalMs
|
|
5311
|
+
})) return;
|
|
5312
|
+
index.entries[params.key] = {
|
|
5240
5313
|
...entry,
|
|
5241
|
-
lastAccessedAt: new Date(
|
|
5314
|
+
lastAccessedAt: new Date(nowMs).toISOString()
|
|
5242
5315
|
};
|
|
5243
|
-
await writeNamespaceIndex(cacheDir, index);
|
|
5316
|
+
await writeNamespaceIndex(params.cacheDir, index);
|
|
5244
5317
|
});
|
|
5245
5318
|
}
|
|
5246
5319
|
async function readCacheEntryFilePath(filePath, expected) {
|
|
@@ -5371,7 +5444,7 @@ function parseCacheIndexFile(value, expectedNamespace) {
|
|
|
5371
5444
|
}
|
|
5372
5445
|
function parseCacheIndexEntry(value) {
|
|
5373
5446
|
if (!isRecordLike(value)) return null;
|
|
5374
|
-
if (typeof value.storedAt !== "string" || typeof value.lastAccessedAt !== "string") return null;
|
|
5447
|
+
if (typeof value.storedAt !== "string" || value.lastAccessedAt !== null && typeof value.lastAccessedAt !== "string") return null;
|
|
5375
5448
|
if (!Array.isArray(value.blobRefs)) return null;
|
|
5376
5449
|
const blobRefs = [];
|
|
5377
5450
|
for (const blobRef of value.blobRefs) {
|
|
@@ -5441,7 +5514,7 @@ function entryMatchesFilter(entry, filter) {
|
|
|
5441
5514
|
async function pruneCacheEntriesForNamespace(params) {
|
|
5442
5515
|
const { cacheDir, index, maxEntries } = params;
|
|
5443
5516
|
const entries = Object.entries(index.entries);
|
|
5444
|
-
const sorted = entries.toSorted(([, a], [, b]) => a
|
|
5517
|
+
const sorted = entries.toSorted(([, a], [, b]) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
|
|
5445
5518
|
const keptKeys = /* @__PURE__ */ new Set();
|
|
5446
5519
|
for (const [key] of sorted) {
|
|
5447
5520
|
if (keptKeys.size >= maxEntries) break;
|
|
@@ -6606,7 +6679,7 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6606
6679
|
});
|
|
6607
6680
|
} catch (e) {
|
|
6608
6681
|
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
6609
|
-
params.scope
|
|
6682
|
+
recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
6610
6683
|
}
|
|
6611
6684
|
}
|
|
6612
6685
|
async function runCase(params) {
|
|
@@ -6656,7 +6729,7 @@ async function runCase(params) {
|
|
|
6656
6729
|
});
|
|
6657
6730
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
6658
6731
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
6659
|
-
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope
|
|
6732
|
+
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
|
|
6660
6733
|
if (!nonAssertError) {
|
|
6661
6734
|
await runDeriveFromTracingConfig({
|
|
6662
6735
|
deriveFromTracing: globalDeriveFromTracing,
|
|
@@ -6686,7 +6759,7 @@ async function runCase(params) {
|
|
|
6686
6759
|
...scope.outputs,
|
|
6687
6760
|
...parsedOutputs.data
|
|
6688
6761
|
};
|
|
6689
|
-
else scope
|
|
6762
|
+
else recordAssertionFailure(scope, toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
|
|
6690
6763
|
}
|
|
6691
6764
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
6692
6765
|
const scoringTraces = {};
|
|
@@ -6728,8 +6801,7 @@ async function runCase(params) {
|
|
|
6728
6801
|
};
|
|
6729
6802
|
const rawValue = scoreRun.result;
|
|
6730
6803
|
if (scoreRun.error) {
|
|
6731
|
-
|
|
6732
|
-
scope.assertionFailures.push(toAssertionFailure(message, scoreRun.error));
|
|
6804
|
+
recordAssertionFailure(scope, toAssertionFailure(`score "${key}" threw: ${scoreRun.error.message}`, scoreRun.error));
|
|
6733
6805
|
scope.outputs[key] = 0;
|
|
6734
6806
|
scoreResults.set(key, {
|
|
6735
6807
|
value: 0,
|
|
@@ -6739,7 +6811,7 @@ async function runCase(params) {
|
|
|
6739
6811
|
continue;
|
|
6740
6812
|
}
|
|
6741
6813
|
if (typeof rawValue !== "number") {
|
|
6742
|
-
scope
|
|
6814
|
+
recordAssertionFailure(scope, toAssertionFailure(`score "${key}" must return a number`));
|
|
6743
6815
|
scope.outputs[key] = 0;
|
|
6744
6816
|
scoreResults.set(key, {
|
|
6745
6817
|
value: 0,
|
|
@@ -6801,6 +6873,7 @@ async function runCase(params) {
|
|
|
6801
6873
|
traceDisplay,
|
|
6802
6874
|
columns,
|
|
6803
6875
|
...outputColumnDefs.length > 0 ? { outputColumnDefs } : {},
|
|
6876
|
+
assertions: scope.assertions,
|
|
6804
6877
|
assertionFailures: scope.assertionFailures,
|
|
6805
6878
|
logs: scope.logs,
|
|
6806
6879
|
error: errorInfo,
|
|
@@ -6852,5 +6925,12 @@ function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
|
|
|
6852
6925
|
...stack !== void 0 ? { stack } : {}
|
|
6853
6926
|
};
|
|
6854
6927
|
}
|
|
6928
|
+
function recordAssertionFailure(scope, failure) {
|
|
6929
|
+
scope.assertionFailures.push(failure);
|
|
6930
|
+
scope.assertions.push({
|
|
6931
|
+
...failure,
|
|
6932
|
+
status: "fail"
|
|
6933
|
+
});
|
|
6934
|
+
}
|
|
6855
6935
|
//#endregion
|
|
6856
6936
|
export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-
|
|
1
|
+
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-MSr8sAWm.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-_g2qOMK6.mjs";
|
|
2
|
+
import "./src-CdZsOn6y.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-d42Lm0i5.mjs";
|
|
2
|
+
import "./cli-_g2qOMK6.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|