@ls-stack/agent-eval 0.57.0 → 0.58.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Db_x-Rit.mjs → app-L9GdY28I.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +377 -0
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +3 -2
- package/dist/{cli-Ck0mqxd-.mjs → cli-Cf37PZKi.mjs} +7 -6
- package/dist/index.d.mts +108 -61
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +4 -3
- package/dist/{runExecution-BH7DlMXl.mjs → runExecution-C4kAOhC1.mjs} +115 -30
- package/dist/{runOrchestration-C1Ex9QI-.mjs → runOrchestration-5xEiQxiS.mjs} +1 -1
- package/dist/{runner-DbVYcapC.mjs → runner-JIykMlve.mjs} +1 -1
- package/dist/{runner-B3hEOT_I.mjs → runner-bjd_UB9i.mjs} +2 -2
- package/dist/{src-B3iq-tuv.mjs → src-303BocMW.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +5 -3
- package/dist/apps/web/dist/assets/index-Xa_7PteQ.css +0 -1
- package/dist/apps/web/dist/assets/index-o4o2EktS.js +0 -377
|
@@ -289,7 +289,8 @@ z.object({
|
|
|
289
289
|
key: z.string(),
|
|
290
290
|
namespace: z.string(),
|
|
291
291
|
storedAt: z.string(),
|
|
292
|
-
|
|
292
|
+
/** Last successful cache hit time. `null` means the entry has not been hit yet. */
|
|
293
|
+
lastAccessedAt: z.string().nullable()
|
|
293
294
|
});
|
|
294
295
|
z.object({
|
|
295
296
|
removedCacheFiles: z.number(),
|
|
@@ -814,8 +815,7 @@ const caseRowSchema = z.object({
|
|
|
814
815
|
/** Winning trial index for the persisted case result. */
|
|
815
816
|
trial: z.number()
|
|
816
817
|
});
|
|
817
|
-
|
|
818
|
-
const assertionFailureSchema = z.object({
|
|
818
|
+
const assertionBaseSchema = z.object({
|
|
819
819
|
/**
|
|
820
820
|
* Error class or category label rendered alongside the message (e.g.
|
|
821
821
|
* `EvalAssertionError`, `OutputsSchemaError`). Optional for legacy entries
|
|
@@ -827,7 +827,19 @@ const assertionFailureSchema = z.object({
|
|
|
827
827
|
/** Stack trace captured from the originating error when available. */
|
|
828
828
|
stack: z.string().optional()
|
|
829
829
|
});
|
|
830
|
+
/** Structured assertion failure metadata captured for one case run. */
|
|
831
|
+
const assertionFailureSchema = assertionBaseSchema;
|
|
832
|
+
/** Pass/fail outcome for one recorded eval assertion. */
|
|
833
|
+
const assertionStatusSchema = z.enum(["pass", "fail"]);
|
|
834
|
+
/** Structured assertion result metadata captured for one case run. */
|
|
835
|
+
const assertionResultSchema = assertionBaseSchema.extend({
|
|
836
|
+
/** Whether the recorded assertion passed or failed. */
|
|
837
|
+
status: assertionStatusSchema });
|
|
830
838
|
const legacyAssertionFailureSchema = z.string().transform((message) => ({ message }));
|
|
839
|
+
const legacyAssertionResultSchema = z.string().transform((message) => ({
|
|
840
|
+
message,
|
|
841
|
+
status: "fail"
|
|
842
|
+
}));
|
|
831
843
|
/** Severity level for one log captured during a case run. */
|
|
832
844
|
const runLogLevelSchema = z.enum([
|
|
833
845
|
"log",
|
|
@@ -922,6 +934,12 @@ const caseDetailSchema = z.object({
|
|
|
922
934
|
* These complement eval-level `columns` without changing discovery metadata.
|
|
923
935
|
*/
|
|
924
936
|
outputColumnDefs: z.array(columnDefSchema).optional(),
|
|
937
|
+
/**
|
|
938
|
+
* Pass/fail assertion records captured from eval assertion helpers. New run
|
|
939
|
+
* artifacts include this alongside `assertionFailures`; older artifacts may
|
|
940
|
+
* omit it and should fall back to `assertionFailures` for failed outcomes.
|
|
941
|
+
*/
|
|
942
|
+
assertions: z.array(z.union([assertionResultSchema, legacyAssertionResultSchema])).optional(),
|
|
925
943
|
assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
|
|
926
944
|
/** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
|
|
927
945
|
logs: z.array(runLogEntrySchema).default([]),
|
|
@@ -1405,6 +1423,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1405
1423
|
maxEntriesPerNamespace: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1406
1424
|
maxEntriesByNamespace: z.record(z.string(), z.number()).optional(),
|
|
1407
1425
|
pruneIdleDelayMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1426
|
+
lastAccessedAtUpdateIntervalMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1408
1427
|
maxEntriesPerEval: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional())
|
|
1409
1428
|
}).optional()
|
|
1410
1429
|
});
|
|
@@ -3086,6 +3105,7 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
3086
3105
|
tags: options.tags ?? [],
|
|
3087
3106
|
outputs: {},
|
|
3088
3107
|
outputColumnOverrides: {},
|
|
3108
|
+
assertions: [],
|
|
3089
3109
|
assertionFailures: [],
|
|
3090
3110
|
logs: [],
|
|
3091
3111
|
spans: [],
|
|
@@ -3211,7 +3231,7 @@ function mergeEvalOutput(key, patch) {
|
|
|
3211
3231
|
return;
|
|
3212
3232
|
}
|
|
3213
3233
|
if (!isObjectRecord(existing)) {
|
|
3214
|
-
scope
|
|
3234
|
+
recordAssertionFailure$1(scope, toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${Array.isArray(existing) ? "array" : typeof existing}, expected object`));
|
|
3215
3235
|
return;
|
|
3216
3236
|
}
|
|
3217
3237
|
scope.outputs[key] = {
|
|
@@ -3244,7 +3264,7 @@ function incrementEvalOutput(key, delta) {
|
|
|
3244
3264
|
return;
|
|
3245
3265
|
}
|
|
3246
3266
|
if (typeof existing !== "number") {
|
|
3247
|
-
scope
|
|
3267
|
+
recordAssertionFailure$1(scope, toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof existing}, expected number`));
|
|
3248
3268
|
return;
|
|
3249
3269
|
}
|
|
3250
3270
|
scope.outputs[key] = existing + delta;
|
|
@@ -3263,13 +3283,26 @@ function incrementEvalOutput(key, delta) {
|
|
|
3263
3283
|
* call.
|
|
3264
3284
|
*/
|
|
3265
3285
|
function evalAssert(condition, message) {
|
|
3266
|
-
if (condition) return;
|
|
3267
3286
|
const scope = getCurrentScope();
|
|
3287
|
+
if (condition) {
|
|
3288
|
+
if (scope) scope.assertions.push({
|
|
3289
|
+
message,
|
|
3290
|
+
status: "pass"
|
|
3291
|
+
});
|
|
3292
|
+
return;
|
|
3293
|
+
}
|
|
3268
3294
|
if (!scope) return;
|
|
3269
3295
|
const error = new EvalAssertionError(message);
|
|
3270
|
-
scope
|
|
3296
|
+
recordAssertionFailure$1(scope, toAssertionFailure$1(message, error));
|
|
3271
3297
|
throw error;
|
|
3272
3298
|
}
|
|
3299
|
+
function recordAssertionFailure$1(scope, failure) {
|
|
3300
|
+
scope.assertionFailures.push(failure);
|
|
3301
|
+
scope.assertions.push({
|
|
3302
|
+
...failure,
|
|
3303
|
+
status: "fail"
|
|
3304
|
+
});
|
|
3305
|
+
}
|
|
3273
3306
|
//#endregion
|
|
3274
3307
|
//#region ../sdk/src/evalExpect.ts
|
|
3275
3308
|
const expectFormatOptions = {
|
|
@@ -4185,14 +4218,28 @@ function applyRecordingOp(scope, parentSpan, op, options) {
|
|
|
4185
4218
|
...existing,
|
|
4186
4219
|
...op.patch
|
|
4187
4220
|
};
|
|
4188
|
-
else
|
|
4221
|
+
else {
|
|
4222
|
+
const message = `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected object`;
|
|
4223
|
+
scope.assertionFailures.push({ message });
|
|
4224
|
+
scope.assertions.push({
|
|
4225
|
+
message,
|
|
4226
|
+
status: "fail"
|
|
4227
|
+
});
|
|
4228
|
+
}
|
|
4189
4229
|
return;
|
|
4190
4230
|
}
|
|
4191
4231
|
if (op.kind === "incrementOutput") {
|
|
4192
4232
|
const existing = scope.outputs[op.key];
|
|
4193
4233
|
if (existing === void 0) scope.outputs[op.key] = op.delta;
|
|
4194
4234
|
else if (typeof existing === "number") scope.outputs[op.key] = existing + op.delta;
|
|
4195
|
-
else
|
|
4235
|
+
else {
|
|
4236
|
+
const message = `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected number`;
|
|
4237
|
+
scope.assertionFailures.push({ message });
|
|
4238
|
+
scope.assertions.push({
|
|
4239
|
+
message,
|
|
4240
|
+
status: "fail"
|
|
4241
|
+
});
|
|
4242
|
+
}
|
|
4196
4243
|
return;
|
|
4197
4244
|
}
|
|
4198
4245
|
if (op.kind === "checkpoint") {
|
|
@@ -4478,6 +4525,10 @@ function recordSpanAttributeAssertion(message) {
|
|
|
4478
4525
|
const scope = getCurrentScope();
|
|
4479
4526
|
if (!scope) return;
|
|
4480
4527
|
scope.assertionFailures.push({ message });
|
|
4528
|
+
scope.assertions.push({
|
|
4529
|
+
message,
|
|
4530
|
+
status: "fail"
|
|
4531
|
+
});
|
|
4481
4532
|
}
|
|
4482
4533
|
function incrementSpanAttribute(span, key, delta) {
|
|
4483
4534
|
const existing = span.attributes?.[key];
|
|
@@ -4983,6 +5034,24 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
4983
5034
|
};
|
|
4984
5035
|
}
|
|
4985
5036
|
//#endregion
|
|
5037
|
+
//#region ../runner/src/cacheAccessTime.ts
|
|
5038
|
+
const defaultLastAccessedAtUpdateIntervalMs = 14400 * 1e3;
|
|
5039
|
+
function normalizeLastAccessedAtUpdateIntervalMs(value) {
|
|
5040
|
+
if (value === void 0 || !Number.isFinite(value) || value < 0) return defaultLastAccessedAtUpdateIntervalMs;
|
|
5041
|
+
return Math.floor(value);
|
|
5042
|
+
}
|
|
5043
|
+
function cacheAccessSortTime(entry) {
|
|
5044
|
+
return entry.lastAccessedAt ?? entry.storedAt;
|
|
5045
|
+
}
|
|
5046
|
+
function shouldRefreshLastAccessedAt(params) {
|
|
5047
|
+
return params.lastAccessedAt === null || params.nowMs - Date.parse(params.lastAccessedAt) > params.updateIntervalMs;
|
|
5048
|
+
}
|
|
5049
|
+
//#endregion
|
|
5050
|
+
//#region ../runner/src/cacheKeys.ts
|
|
5051
|
+
function toPendingKey(namespace, keyHash) {
|
|
5052
|
+
return `${namespace}::${keyHash}`;
|
|
5053
|
+
}
|
|
5054
|
+
//#endregion
|
|
4986
5055
|
//#region ../runner/src/cacheStore.ts
|
|
4987
5056
|
const defaultMaxEntriesPerNamespace = 100;
|
|
4988
5057
|
const cacheSerializationMarker = "__aecs";
|
|
@@ -5015,6 +5084,7 @@ function createFsCacheStore(options) {
|
|
|
5015
5084
|
primaryDir: blobDir
|
|
5016
5085
|
});
|
|
5017
5086
|
const defaultMaxEntries = normalizeMaxEntries(options.maxEntriesPerNamespace);
|
|
5087
|
+
const lastAccessedAtUpdateIntervalMs = normalizeLastAccessedAtUpdateIntervalMs(options.lastAccessedAtUpdateIntervalMs);
|
|
5018
5088
|
return {
|
|
5019
5089
|
externalJsonStore,
|
|
5020
5090
|
dir() {
|
|
@@ -5034,7 +5104,12 @@ function createFsCacheStore(options) {
|
|
|
5034
5104
|
});
|
|
5035
5105
|
if (entry === null) return null;
|
|
5036
5106
|
const materialized = await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
|
|
5037
|
-
if (materialized !== null) await updateCacheIndexLastAccessedAt(
|
|
5107
|
+
if (materialized !== null) await updateCacheIndexLastAccessedAt({
|
|
5108
|
+
cacheDir,
|
|
5109
|
+
key: keyHash,
|
|
5110
|
+
namespace,
|
|
5111
|
+
updateIntervalMs: lastAccessedAtUpdateIntervalMs
|
|
5112
|
+
});
|
|
5038
5113
|
return materialized;
|
|
5039
5114
|
},
|
|
5040
5115
|
async lookupWithDebug(namespace, keyHash) {
|
|
@@ -5063,7 +5138,7 @@ function createFsCacheStore(options) {
|
|
|
5063
5138
|
const index = await readNamespaceIndex(cacheDir, entry.namespace);
|
|
5064
5139
|
index.entries[entry.key] = {
|
|
5065
5140
|
storedAt: entry.storedAt,
|
|
5066
|
-
lastAccessedAt:
|
|
5141
|
+
lastAccessedAt: null,
|
|
5067
5142
|
blobRefs: await collectExternalJsonBlobRefs(entry, blobDirs)
|
|
5068
5143
|
};
|
|
5069
5144
|
await writeNamespaceIndex(cacheDir, index);
|
|
@@ -5082,7 +5157,7 @@ function createFsCacheStore(options) {
|
|
|
5082
5157
|
async list() {
|
|
5083
5158
|
const items = [];
|
|
5084
5159
|
for (const index of await listCacheIndexes(cacheDir)) for (const [key, entry] of Object.entries(index.entries)) items.push(toCacheListItem(index.namespace, key, entry));
|
|
5085
|
-
items.sort((a, b) => a
|
|
5160
|
+
items.sort((a, b) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
|
|
5086
5161
|
return items;
|
|
5087
5162
|
},
|
|
5088
5163
|
async clear(filter) {
|
|
@@ -5183,9 +5258,6 @@ function maxEntriesForNamespace(namespace, defaultMaxEntries, maxEntriesByNamesp
|
|
|
5183
5258
|
const namespaceMaxEntries = maxEntriesByNamespace?.[namespace];
|
|
5184
5259
|
return namespaceMaxEntries === void 0 ? defaultMaxEntries : normalizeMaxEntries(namespaceMaxEntries, defaultMaxEntries);
|
|
5185
5260
|
}
|
|
5186
|
-
function toPendingKey(namespace, keyHash) {
|
|
5187
|
-
return `${namespace}::${keyHash}`;
|
|
5188
|
-
}
|
|
5189
5261
|
function sanitizeSegment$1(segment) {
|
|
5190
5262
|
return segment.replace(/[^a-zA-Z0-9_.-]/g, "_");
|
|
5191
5263
|
}
|
|
@@ -5231,16 +5303,22 @@ async function readIndexedCacheEntry(params) {
|
|
|
5231
5303
|
return fileEntry.entry;
|
|
5232
5304
|
});
|
|
5233
5305
|
}
|
|
5234
|
-
async function updateCacheIndexLastAccessedAt(
|
|
5235
|
-
await withCacheFileLock(namespaceLockPath(cacheDir, namespace), async () => {
|
|
5236
|
-
const index = await readNamespaceIndex(cacheDir, namespace);
|
|
5237
|
-
const entry = index.entries[key];
|
|
5306
|
+
async function updateCacheIndexLastAccessedAt(params) {
|
|
5307
|
+
await withCacheFileLock(namespaceLockPath(params.cacheDir, params.namespace), async () => {
|
|
5308
|
+
const index = await readNamespaceIndex(params.cacheDir, params.namespace);
|
|
5309
|
+
const entry = index.entries[params.key];
|
|
5238
5310
|
if (entry === void 0) return;
|
|
5239
|
-
|
|
5311
|
+
const nowMs = getRealDateNowMs();
|
|
5312
|
+
if (!shouldRefreshLastAccessedAt({
|
|
5313
|
+
lastAccessedAt: entry.lastAccessedAt,
|
|
5314
|
+
nowMs,
|
|
5315
|
+
updateIntervalMs: params.updateIntervalMs
|
|
5316
|
+
})) return;
|
|
5317
|
+
index.entries[params.key] = {
|
|
5240
5318
|
...entry,
|
|
5241
|
-
lastAccessedAt: new Date(
|
|
5319
|
+
lastAccessedAt: new Date(nowMs).toISOString()
|
|
5242
5320
|
};
|
|
5243
|
-
await writeNamespaceIndex(cacheDir, index);
|
|
5321
|
+
await writeNamespaceIndex(params.cacheDir, index);
|
|
5244
5322
|
});
|
|
5245
5323
|
}
|
|
5246
5324
|
async function readCacheEntryFilePath(filePath, expected) {
|
|
@@ -5371,7 +5449,7 @@ function parseCacheIndexFile(value, expectedNamespace) {
|
|
|
5371
5449
|
}
|
|
5372
5450
|
function parseCacheIndexEntry(value) {
|
|
5373
5451
|
if (!isRecordLike(value)) return null;
|
|
5374
|
-
if (typeof value.storedAt !== "string" || typeof value.lastAccessedAt !== "string") return null;
|
|
5452
|
+
if (typeof value.storedAt !== "string" || value.lastAccessedAt !== null && typeof value.lastAccessedAt !== "string") return null;
|
|
5375
5453
|
if (!Array.isArray(value.blobRefs)) return null;
|
|
5376
5454
|
const blobRefs = [];
|
|
5377
5455
|
for (const blobRef of value.blobRefs) {
|
|
@@ -5441,7 +5519,7 @@ function entryMatchesFilter(entry, filter) {
|
|
|
5441
5519
|
async function pruneCacheEntriesForNamespace(params) {
|
|
5442
5520
|
const { cacheDir, index, maxEntries } = params;
|
|
5443
5521
|
const entries = Object.entries(index.entries);
|
|
5444
|
-
const sorted = entries.toSorted(([, a], [, b]) => a
|
|
5522
|
+
const sorted = entries.toSorted(([, a], [, b]) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
|
|
5445
5523
|
const keptKeys = /* @__PURE__ */ new Set();
|
|
5446
5524
|
for (const [key] of sorted) {
|
|
5447
5525
|
if (keptKeys.size >= maxEntries) break;
|
|
@@ -6606,7 +6684,7 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6606
6684
|
});
|
|
6607
6685
|
} catch (e) {
|
|
6608
6686
|
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
6609
|
-
params.scope
|
|
6687
|
+
recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
6610
6688
|
}
|
|
6611
6689
|
}
|
|
6612
6690
|
async function runCase(params) {
|
|
@@ -6656,7 +6734,7 @@ async function runCase(params) {
|
|
|
6656
6734
|
});
|
|
6657
6735
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
6658
6736
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
6659
|
-
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope
|
|
6737
|
+
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
|
|
6660
6738
|
if (!nonAssertError) {
|
|
6661
6739
|
await runDeriveFromTracingConfig({
|
|
6662
6740
|
deriveFromTracing: globalDeriveFromTracing,
|
|
@@ -6686,7 +6764,7 @@ async function runCase(params) {
|
|
|
6686
6764
|
...scope.outputs,
|
|
6687
6765
|
...parsedOutputs.data
|
|
6688
6766
|
};
|
|
6689
|
-
else scope
|
|
6767
|
+
else recordAssertionFailure(scope, toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
|
|
6690
6768
|
}
|
|
6691
6769
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
6692
6770
|
const scoringTraces = {};
|
|
@@ -6728,8 +6806,7 @@ async function runCase(params) {
|
|
|
6728
6806
|
};
|
|
6729
6807
|
const rawValue = scoreRun.result;
|
|
6730
6808
|
if (scoreRun.error) {
|
|
6731
|
-
|
|
6732
|
-
scope.assertionFailures.push(toAssertionFailure(message, scoreRun.error));
|
|
6809
|
+
recordAssertionFailure(scope, toAssertionFailure(`score "${key}" threw: ${scoreRun.error.message}`, scoreRun.error));
|
|
6733
6810
|
scope.outputs[key] = 0;
|
|
6734
6811
|
scoreResults.set(key, {
|
|
6735
6812
|
value: 0,
|
|
@@ -6739,7 +6816,7 @@ async function runCase(params) {
|
|
|
6739
6816
|
continue;
|
|
6740
6817
|
}
|
|
6741
6818
|
if (typeof rawValue !== "number") {
|
|
6742
|
-
scope
|
|
6819
|
+
recordAssertionFailure(scope, toAssertionFailure(`score "${key}" must return a number`));
|
|
6743
6820
|
scope.outputs[key] = 0;
|
|
6744
6821
|
scoreResults.set(key, {
|
|
6745
6822
|
value: 0,
|
|
@@ -6801,6 +6878,7 @@ async function runCase(params) {
|
|
|
6801
6878
|
traceDisplay,
|
|
6802
6879
|
columns,
|
|
6803
6880
|
...outputColumnDefs.length > 0 ? { outputColumnDefs } : {},
|
|
6881
|
+
assertions: scope.assertions,
|
|
6804
6882
|
assertionFailures: scope.assertionFailures,
|
|
6805
6883
|
logs: scope.logs,
|
|
6806
6884
|
error: errorInfo,
|
|
@@ -6852,5 +6930,12 @@ function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
|
|
|
6852
6930
|
...stack !== void 0 ? { stack } : {}
|
|
6853
6931
|
};
|
|
6854
6932
|
}
|
|
6933
|
+
function recordAssertionFailure(scope, failure) {
|
|
6934
|
+
scope.assertionFailures.push(failure);
|
|
6935
|
+
scope.assertions.push({
|
|
6936
|
+
...failure,
|
|
6937
|
+
status: "fail"
|
|
6938
|
+
});
|
|
6939
|
+
}
|
|
6855
6940
|
//#endregion
|
|
6856
6941
|
export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, 
runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-
|
|
1
|
+
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-C4kAOhC1.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-bjd_UB9i.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-Cf37PZKi.mjs";
|
|
2
|
+
import "./src-303BocMW.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-C4kAOhC1.mjs";
|
|
2
|
+
import "./cli-Cf37PZKi.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
package/skills/agent-eval/SKILL.md
CHANGED
|
@@ -626,9 +626,11 @@ When adding or changing evals:
|
|
|
626
626
|
1. Put the tracing + ambient SDK calls in the product code that runs in both
|
|
627
627
|
production and evals. Keep eval files thin.
|
|
628
628
|
2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
|
|
629
|
-
3. `evalAssert` for hard invariants and truthy type narrowing
|
|
630
|
-
|
|
631
|
-
|
|
629
|
+
3. `evalAssert` for hard invariants and truthy type narrowing. It records
|
|
630
|
+
pass/fail entries in case-detail `assertions`; failed entries are also kept
|
|
631
|
+
in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
|
|
632
|
+
comparisons, `scores` for graded signals, and `passThreshold` only on
|
|
633
|
+
scores that should gate pass/fail.
|
|
632
634
|
4. Surface reviewable values through execute-context `setOutput` or ambient
|
|
633
635
|
`setEvalOutput` in shared workflow code, and shape them with `columns`
|
|
634
636
|
formats from the `ColumnFormat` type.
|