@ls-stack/agent-eval 0.16.1 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/p
4
4
  import { extname, isAbsolute, join, relative, resolve } from "node:path";
5
5
  import { z, z as z$1 } from "zod/v4";
6
6
  import { AsyncLocalStorage } from "node:async_hooks";
7
+ import { formatWithOptions } from "node:util";
7
8
  import { Buffer as Buffer$1 } from "node:buffer";
8
9
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
9
10
  import { existsSync } from "node:fs";
@@ -49,6 +50,25 @@ const scopeStorage = new AsyncLocalStorage();
49
50
  const runtimeScopeStorage = new AsyncLocalStorage();
50
51
  let activeEvalScopeCount = 0;
51
52
  let activeEvalRuntimeScopeCount = 0;
53
/** Whether calls to the patched console methods are recorded as eval case logs. */
let consoleCaptureEnabled = true;
/** Longest formatted preview message kept before it is cut and suffixed with `...`. */
const maxLogMessageLength = 2e4;
/** Longest individual string kept inside captured log arguments. */
const maxLogStringLength = 1e4;
/** Number of leading array items kept when converting a logged array. */
const maxLogArrayLength = 100;
/** Number of leading own enumerable entries kept when converting a logged object. */
const maxLogObjectEntries = 100;
/** Nesting depth at which logged values are replaced by a placeholder. */
const maxLogValueDepth = 5;
/** Console methods that are wrapped for capture. */
const consoleCaptureMethods = ["log", "info", "warn", "error"];
const runtimeConsole = globalThis.console;
/** Bound references to the unpatched console methods, keyed by method name. */
const originalConsoleMethods = Object.fromEntries(
  consoleCaptureMethods.map((method) => [method, runtimeConsole[method].bind(runtimeConsole)])
);
52
72
  /** Error thrown when an eval assertion fails during case execution. */
53
73
  var EvalAssertionError = class extends Error {
54
74
  constructor(message) {
@@ -73,6 +93,155 @@ function isInEvalScope() {
73
93
  if (activeEvalRuntimeScopeCount === 0) return null;
74
94
  return runtimeScopeStorage.getStore() ?? null;
75
95
  }
96
/**
 * Translate the `"warning"` alias into the `"warn"` level.
 * Any other level is returned unchanged.
 */
function normalizeLogLevel(level) {
  if (level === "warning") return "warn";
  return level;
}
99
/**
 * Return the current runtime scope when it is one of the phases that may own
 * logs (`eval`, `derive`, `outputsSchema`, `scorer`); otherwise `null`.
 */
function getCurrentLogPhase() {
  const loggablePhases = ["eval", "derive", "outputsSchema", "scorer"];
  const activePhase = runtimeScopeStorage.getStore();
  return loggablePhases.includes(activePhase) ? activePhase : null;
}
104
/**
 * Build the human-readable preview for a log call using Node-style console
 * formatting, capped at `maxLogMessageLength` UTF-16 code units.
 *
 * @param args Values passed to the log call.
 * @returns `{ message, truncated }`; `truncated` is true when the formatted
 *   text was cut and suffixed with `...`.
 */
function formatLogArgs(args) {
  const formatted = formatWithOptions({
    depth: 2,
    // Share the limits used for the structured args so the two cannot drift apart.
    maxArrayLength: maxLogArrayLength,
    maxStringLength: maxLogStringLength,
    breakLength: 80,
    compact: 3
  }, ...args);
  if (formatted.length <= maxLogMessageLength) {
    return {
      message: formatted,
      truncated: false
    };
  }
  // Do not cut between the two halves of a surrogate pair: if the last kept
  // code unit is a high surrogate, drop it as well.
  let end = maxLogMessageLength;
  const lastKeptUnit = formatted.charCodeAt(end - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) end -= 1;
  return {
    message: `${formatted.slice(0, end)}...`,
    truncated: true
  };
}
121
/**
 * Cap a string at `maxLogStringLength` UTF-16 code units.
 *
 * @param value String to cap.
 * @param ctx Conversion context; `ctx.truncated` is set to true when the string is cut.
 * @returns The original string, or its capped prefix followed by `...`.
 */
function truncateLogString(value, ctx) {
  if (value.length <= maxLogStringLength) return value;
  ctx.truncated = true;
  // Avoid leaving a lone high surrogate at the cut point, which would persist
  // as a broken character.
  let end = maxLogStringLength;
  const lastKeptUnit = value.charCodeAt(end - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) end -= 1;
  return `${value.slice(0, end)}...`;
}
126
/**
 * Convert a primitive (or function) log argument into a JSON-safe value.
 *
 * @param value Any logged value.
 * @param ctx Conversion context, forwarded to string truncation.
 * @returns `{ handled: true, value }` when `value` was converted here, or
 *   `{ handled: false, value: null }` when the caller must handle it (objects).
 */
function primitiveToLogValue(value, ctx) {
  if (typeof value === "string") {
    return { handled: true, value: truncateLogString(value, ctx) };
  }
  if (typeof value === "number") {
    // NaN and ±Infinity have no JSON representation (they serialize as null),
    // so persist them as text like the other non-JSON primitives below.
    return { handled: true, value: Number.isFinite(value) ? value : String(value) };
  }
  if (value === null || typeof value === "boolean") {
    return { handled: true, value };
  }
  if (value === void 0) {
    return { handled: true, value: "[undefined]" };
  }
  if (typeof value === "bigint") {
    return { handled: true, value: `${value.toString()}n` };
  }
  if (typeof value === "symbol") {
    return { handled: true, value: String(value) };
  }
  if (typeof value === "function") {
    // `name` can be redefined (e.g. a class with a static `name` member), so
    // only use it when it really is a non-empty string.
    const hasName = typeof value.name === "string" && value.name.length > 0;
    return { handled: true, value: `[Function${hasName ? `: ${value.name}` : ""}]` };
  }
  return { handled: false, value: null };
}
156
/**
 * Convert an object log argument into a bounded, JSON-safe structure.
 *
 * Dates become ISO strings, Errors become `{ name, message, stack }`, arrays
 * and plain objects are converted recursively with item/entry caps, and
 * values at or beyond `maxLogValueDepth` are replaced by a placeholder.
 *
 * @param value Non-null object to convert.
 * @param ctx Conversion context: `seen` tracks the objects on the current
 *   path (cycle detection) and `truncated` is set when anything is capped.
 * @param depth Current nesting depth of `value`.
 * @returns A JSON-safe representation of `value`.
 */
function objectToLogValue(value, ctx, depth) {
  if (value instanceof Date) {
    // toISOString() throws a RangeError for invalid dates; logging such a
    // value must not throw.
    return Number.isNaN(value.getTime()) ? "Invalid Date" : value.toISOString();
  }
  if (value instanceof Error) {
    // Error text (stacks in particular) can be arbitrarily long, so it goes
    // through the same cap as any other logged string.
    const capText = (text) => typeof text === "string" ? truncateLogString(text, ctx) : text;
    return {
      name: capText(value.name),
      message: capText(value.message),
      stack: capText(value.stack)
    };
  }
  if (ctx.seen.has(value)) return "[Circular]";
  if (depth >= maxLogValueDepth) {
    ctx.truncated = true;
    return Array.isArray(value) ? "[Array]" : "[Object]";
  }
  ctx.seen.add(value);
  try {
    if (Array.isArray(value)) {
      const limited = value.slice(0, maxLogArrayLength).map((item) => toLogJsonValue(item, ctx, depth + 1));
      if (value.length > maxLogArrayLength) {
        ctx.truncated = true;
        limited.push(`[... ${String(value.length - maxLogArrayLength)} more]`);
      }
      return limited;
    }
    const entries = Object.entries(value);
    const result = {};
    for (const [key, entryValue] of entries.slice(0, maxLogObjectEntries)) {
      result[key] = toLogJsonValue(entryValue, ctx, depth + 1);
    }
    if (entries.length > maxLogObjectEntries) {
      ctx.truncated = true;
      result.__truncated = `${String(entries.length - maxLogObjectEntries)} more properties`;
    }
    return result;
  } finally {
    // Only objects on the current path count as circular; the same object may
    // legitimately appear again in a sibling position.
    ctx.seen.delete(value);
  }
}
190
/**
 * Convert any logged value into a JSON-safe value: primitives and functions
 * via `primitiveToLogValue`, objects via `objectToLogValue`, and anything
 * left over via `String(...)`.
 */
function toLogJsonValue(value, ctx, depth) {
  const { handled, value: converted } = primitiveToLogValue(value, ctx);
  if (handled) return converted;
  const isObject = value !== null && typeof value === "object";
  return isObject ? objectToLogValue(value, ctx, depth) : String(value);
}
196
/**
 * Convert every log argument into a JSON-safe value using one shared
 * conversion context.
 *
 * @returns `{ args, truncated }`, where `truncated` reports whether any value
 *   was capped during conversion.
 */
function toLogJsonArgs(args) {
  const ctx = {
    seen: /* @__PURE__ */ new WeakSet(),
    truncated: false
  };
  const converted = [];
  for (const value of args) converted.push(toLogJsonValue(value, ctx, 0));
  return {
    args: converted,
    truncated: ctx.truncated
  };
}
206
/**
 * Append one log entry to the current eval scope.
 *
 * Does nothing unless there is both a current scope and a current log phase.
 * The entry stores a formatted preview (`message`) plus JSON-safe `args`;
 * `truncated` is true when either of them was capped.
 */
function recordEvalLog(level, args) {
  const scope = getCurrentScope();
  const phase = getCurrentLogPhase();
  if (!scope || !phase) return;
  const { message, truncated: previewTruncated } = formatLogArgs(args);
  const { args: jsonArgs, truncated: argsTruncated } = toLogJsonArgs(args);
  const entry = {
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    level: normalizeLogLevel(level),
    phase,
    message,
    args: jsonArgs,
    truncated: previewTruncated || argsTruncated
  };
  scope.logs.push(entry);
}
221
// Wrap each captured console method: record the call as an eval log when
// capture is enabled, then always forward it to the original method.
for (const method of consoleCaptureMethods) runtimeConsole[method] = (...args) => {
  if (consoleCaptureEnabled) {
    try {
      recordEvalLog(method, args);
    } catch {
      // Capture is best-effort. A value that throws while being inspected
      // (throwing getter, hostile proxy, ...) must not make console.* throw in
      // user code or suppress the real console output below.
    }
  }
  originalConsoleMethods[method](...args);
};
225
/**
 * Turn automatic console capture on or off.
 *
 * Runner-internal helper. Disabling capture does not silence the console:
 * output still prints, it just is not persisted to `caseDetail.logs`.
 * Manual `evalLog(...)` calls are recorded either way.
 */
function configureEvalRunLogs(options) {
  const { captureConsole } = options;
  consoleCaptureEnabled = captureConsole;
}
235
/**
 * Manually record a log entry on the active eval case.
 *
 * The values are rendered with Node-style console formatting and capped
 * before they are stored, so one log cannot grow run artifacts without bound.
 * Calls made outside an active case-owned eval phase are ignored.
 */
function evalLog(level, ...args) {
  const loggedValues = args;
  recordEvalLog(level, loggedValues);
}
76
245
  function registerBackgroundJobInScope(scope, promise) {
77
246
  const trackedPromise = promise.then(() => {
78
247
  scope.pendingBackgroundJobs.delete(trackedPromise);
@@ -164,6 +333,7 @@ async function runInEvalScope(caseId, fn, options = {}) {
164
333
  input: options.input,
165
334
  outputs: {},
166
335
  assertionFailures: [],
336
+ logs: [],
167
337
  spans: [],
168
338
  checkpoints: /* @__PURE__ */ new Map(),
169
339
  spanStack: [],
@@ -332,107 +502,6 @@ function evalAssert(condition, message) {
332
502
  throw error;
333
503
  }
334
504
  //#endregion
335
- //#region ../sdk/src/cacheKey.ts
336
- var SerializedCacheKeyValue = class {
337
- value;
338
- constructor(value) {
339
- this.value = value;
340
- }
341
- };
342
- /**
343
- * Hash the components of a cache key into a deterministic hex digest.
344
- *
345
- * Native `Blob` and `File` values use stable metadata by default. Pass
346
- * `serializeFileBytes: true` to read them asynchronously and include their byte
347
- * hash in the key.
348
- */
349
- async function hashCacheKey(input, options = {}) {
350
- return hashCacheKeySyncMaterialized(options.serializeFileBytes === true ? await materializeAsyncCacheKeyValue(input) : input);
351
- }
352
- /**
353
- * Synchronously hash cache key components. This supports JSON-like data and
354
- * in-memory binary values such as `Buffer`, `ArrayBuffer`, and typed arrays,
355
- * plus stable metadata for native `Blob` and `File` values.
356
- */
357
- function hashCacheKeySync(input) {
358
- return hashCacheKeySyncMaterialized(input);
359
- }
360
- function hashCacheKeySyncMaterialized(input) {
361
- return createHash("sha256").update(getCompositeKey(input, { stringify: stringifyCacheKeyValue })).digest("hex");
362
- }
363
- function stringifyCacheKeyValue(value) {
364
- if (value instanceof SerializedCacheKeyValue) return value.value;
365
- if (Buffer$1.isBuffer(value)) return `$buffer:${hashBytes(value)}`;
366
- if (isArrayBuffer(value)) return `$arrayBuffer:${hashBytes(new Uint8Array(value))}`;
367
- if (isSharedArrayBuffer(value)) return `$sharedArrayBuffer:${hashBytes(new Uint8Array(value))}`;
368
- if (isArrayBufferView(value)) {
369
- const bytes = new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
370
- return `$${value.constructor.name}:${hashBytes(bytes)}`;
371
- }
372
- if (isFile$1(value)) return `$file:${getCompositeKey({
373
- lastModified: value.lastModified,
374
- name: value.name,
375
- size: value.size,
376
- type: value.type
377
- })}`;
378
- if (isBlob$1(value)) return `$blob:${getCompositeKey({
379
- size: value.size,
380
- type: value.type
381
- })}`;
382
- }
383
- async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
384
- const serialized = await stringifyAsyncCacheKeyValue(value);
385
- if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
386
- if (stringifyCacheKeyValue(value) !== void 0) return value;
387
- if (!value || typeof value !== "object") return value;
388
- if (Array.isArray(value)) {
389
- const items = [];
390
- for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
391
- return items;
392
- }
393
- if (refs.has(value)) throw new Error("Circular reference detected");
394
- refs.add(value);
395
- const entries = [];
396
- for (const [key, entryValue] of Object.entries(value)) entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
397
- refs.delete(value);
398
- return Object.fromEntries(entries);
399
- }
400
- async function stringifyAsyncCacheKeyValue(value) {
401
- if (isFile$1(value)) return `$file:${getCompositeKey({
402
- bytes: await hashBlobBytes(value),
403
- lastModified: value.lastModified,
404
- name: value.name,
405
- size: value.size,
406
- type: value.type
407
- })}`;
408
- if (isBlob$1(value)) return `$blob:${getCompositeKey({
409
- bytes: await hashBlobBytes(value),
410
- size: value.size,
411
- type: value.type
412
- })}`;
413
- }
414
- async function hashBlobBytes(value) {
415
- return hashBytes(new Uint8Array(await value.arrayBuffer()));
416
- }
417
- function hashBytes(value) {
418
- return createHash("sha256").update(value).digest("hex");
419
- }
420
- function isArrayBuffer(value) {
421
- return value instanceof ArrayBuffer;
422
- }
423
- function isSharedArrayBuffer(value) {
424
- return value instanceof SharedArrayBuffer;
425
- }
426
- function isArrayBufferView(value) {
427
- return ArrayBuffer.isView(value);
428
- }
429
- function isBlob$1(value) {
430
- return value instanceof Blob;
431
- }
432
- function isFile$1(value) {
433
- return value instanceof File;
434
- }
435
- //#endregion
436
505
  //#region ../../node_modules/.pnpm/seroval@1.5.2/node_modules/seroval/dist/esm/production/index.mjs
437
506
  var L$1 = ((i) => (i[i.AggregateError = 1] = "AggregateError", i[i.ArrowFunction = 2] = "ArrowFunction", i[i.ErrorPrototypeStack = 4] = "ErrorPrototypeStack", i[i.ObjectAssign = 8] = "ObjectAssign", i[i.BigIntTypedArray = 16] = "BigIntTypedArray", i[i.RegExp = 32] = "RegExp", i))(L$1 || {});
438
507
  var v$1 = Symbol.asyncIterator, mr = Symbol.hasInstance, R = Symbol.isConcatSpreadable, C = Symbol.iterator, pr = Symbol.match, dr = Symbol.matchAll, gr = Symbol.replace, yr = Symbol.search, Nr = Symbol.species, br = Symbol.split, vr = Symbol.toPrimitive, P$1 = Symbol.toStringTag, Cr = Symbol.unscopables, ve = {
@@ -2223,6 +2292,107 @@ function deserializeCacheRecording(recording) {
2223
2292
  };
2224
2293
  }
2225
2294
  //#endregion
2295
+ //#region ../sdk/src/cacheKey.ts
2296
/** Wrapper marking a value whose cache-key string form has already been computed. */
var SerializedCacheKeyValue = class {
  value;
  constructor(value) {
    this.value = value;
  }
};
/**
 * Hash the components of a cache key into a deterministic hex digest.
 *
 * Native `Blob` and `File` values use stable metadata by default. Pass
 * `serializeFileBytes: true` to read them asynchronously and include their byte
 * hash in the key.
 */
async function hashCacheKey(input, options = {}) {
  return hashCacheKeySyncMaterialized(options.serializeFileBytes === true ? await materializeAsyncCacheKeyValue(input) : input);
}
/**
 * Synchronously hash cache key components. This supports JSON-like data and
 * in-memory binary values such as `Buffer`, `ArrayBuffer`, and typed arrays,
 * plus stable metadata for native `Blob` and `File` values.
 */
function hashCacheKeySync(input) {
  return hashCacheKeySyncMaterialized(input);
}
/** SHA-256 hex digest of the composite key built from `input`. */
function hashCacheKeySyncMaterialized(input) {
  return createHash("sha256").update(getCompositeKey(input, { stringify: stringifyCacheKeyValue })).digest("hex");
}
/**
 * Custom stringifier for values that need special treatment in a cache key.
 *
 * @returns A tagged string for pre-serialized values, binary data (hashed by
 *   content) and `File`/`Blob` (metadata only), or `undefined` for every other
 *   value.
 */
function stringifyCacheKeyValue(value) {
  if (value instanceof SerializedCacheKeyValue) return value.value;
  if (Buffer$1.isBuffer(value)) return `$buffer:${hashBytes(value)}`;
  if (isArrayBuffer(value)) return `$arrayBuffer:${hashBytes(new Uint8Array(value))}`;
  if (isSharedArrayBuffer(value)) return `$sharedArrayBuffer:${hashBytes(new Uint8Array(value))}`;
  if (isArrayBufferView(value)) {
    // Hash only the bytes the view covers, not the whole underlying buffer.
    const bytes = new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
    return `$${value.constructor.name}:${hashBytes(bytes)}`;
  }
  // File is tested before Blob so files keep their name and timestamp.
  if (isFile$1(value)) return `$file:${getCompositeKey({
    lastModified: value.lastModified,
    name: value.name,
    size: value.size,
    type: value.type
  })}`;
  if (isBlob$1(value)) return `$blob:${getCompositeKey({
    size: value.size,
    type: value.type
  })}`;
}
/**
 * Recursively replace every `Blob`/`File` in `value` with a pre-serialized
 * marker that includes its byte hash, leaving all other data as-is.
 *
 * @param value Cache key input (arbitrary JSON-like data and binary values).
 * @param refs Containers on the current traversal path, used for cycle detection.
 * @returns A structure that can be hashed synchronously.
 * @throws {Error} "Circular reference detected" when an array or object
 *   contains itself.
 */
async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
  const serialized = await stringifyAsyncCacheKeyValue(value);
  if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
  if (stringifyCacheKeyValue(value) !== void 0) return value;
  if (!value || typeof value !== "object") return value;
  // Arrays are tracked as well as plain objects: a self-containing array would
  // otherwise recurse forever instead of raising the circular error.
  if (refs.has(value)) throw new Error("Circular reference detected");
  refs.add(value);
  try {
    if (Array.isArray(value)) {
      const items = [];
      for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
      return items;
    }
    const entries = [];
    for (const [key, entryValue] of Object.entries(value)) {
      entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
    }
    return Object.fromEntries(entries);
  } finally {
    // Only containers on the current path are "circular"; a container reused
    // in sibling positions is fine.
    refs.delete(value);
  }
}
/**
 * Byte-aware string form for `File`/`Blob` values.
 *
 * @returns The tagged string including the content hash, or `undefined` for
 *   any other value.
 */
async function stringifyAsyncCacheKeyValue(value) {
  if (isFile$1(value)) return `$file:${getCompositeKey({
    bytes: await hashBlobBytes(value),
    lastModified: value.lastModified,
    name: value.name,
    size: value.size,
    type: value.type
  })}`;
  if (isBlob$1(value)) return `$blob:${getCompositeKey({
    bytes: await hashBlobBytes(value),
    size: value.size,
    type: value.type
  })}`;
}
/** SHA-256 hex digest of a Blob's full contents. */
async function hashBlobBytes(value) {
  return hashBytes(new Uint8Array(await value.arrayBuffer()));
}
/** SHA-256 hex digest of a byte sequence. */
function hashBytes(value) {
  return createHash("sha256").update(value).digest("hex");
}
function isArrayBuffer(value) {
  return value instanceof ArrayBuffer;
}
function isSharedArrayBuffer(value) {
  return value instanceof SharedArrayBuffer;
}
function isArrayBufferView(value) {
  return ArrayBuffer.isView(value);
}
function isBlob$1(value) {
  return value instanceof Blob;
}
function isFile$1(value) {
  // Guard the global so runtimes without a global `File` report "not a file"
  // instead of throwing a ReferenceError.
  return typeof File !== "undefined" && value instanceof File;
}
2395
+ //#endregion
2226
2396
  //#region ../sdk/src/cacheRecording.ts
2227
2397
  function mergeSpanAttributes$1(span, attributes) {
2228
2398
  span.attributes = {
@@ -2571,6 +2741,11 @@ function createTraceCache(generateSpanId) {
2571
2741
  storedAt: (/* @__PURE__ */ new Date()).toISOString(),
2572
2742
  codeFingerprint: cacheCtx.codeFingerprint,
2573
2743
  recording: await serializeCacheRecording(recording)
2744
+ }, {
2745
+ rawKey: info.key,
2746
+ operationType: "value",
2747
+ operationName: info.name,
2748
+ codeFingerprint: cacheCtx.codeFingerprint
2574
2749
  });
2575
2750
  }
2576
2751
  return bodyResult;
@@ -2996,7 +3171,12 @@ async function traceSpanInternal(info, fn) {
2996
3171
  codeFingerprint: ctx.codeFingerprint,
2997
3172
  recording: await serializeCacheRecording(recording)
2998
3173
  };
2999
- await ctx.adapter.write(entry);
3174
+ await ctx.adapter.write(entry, {
3175
+ rawKey: cacheOpts.key,
3176
+ operationType: "span",
3177
+ operationName: info.name,
3178
+ codeFingerprint: ctx.codeFingerprint
3179
+ });
3000
3180
  }
3001
3181
  return bodyResult;
3002
3182
  }
@@ -3415,12 +3595,31 @@ const cacheEntrySchema = z.object({
3415
3595
  codeFingerprint: z.string(),
3416
3596
  recording: cacheRecordingSchema
3417
3597
  });
3598
/** Debug-only raw key metadata stored outside the reusable cache entry. */
const cacheDebugKeyEntrySchema = z.object({
  // Only debug entries with version `1` validate.
  version: z.literal(1),
  // Cache key and namespace of the cache entry this metadata belongs to.
  key: z.string(),
  namespace: z.string(),
  // Kind and name of the cached operation.
  operationType: cacheOperationTypeSchema,
  operationName: z.string(),
  storedAt: z.string(),
  codeFingerprint: z.string(),
  // The original, un-hashed key value; deliberately unconstrained.
  rawKey: z.unknown()
});
/** Cache lookup response with optional debug-only raw key data. */
const cacheEntryWithDebugKeySchema = cacheEntrySchema.extend({ debugKey: cacheDebugKeyEntrySchema.optional() });
/** Persisted per-owner cache file containing multiple cache entries. */
const cacheFileSchema = z.object({
  version: z.literal(1),
  owner: z.string(),
  // Cache entries stored under string keys.
  entries: z.record(z.string(), cacheEntrySchema)
});
/** Persisted per-owner debug file containing raw cache key metadata. */
const cacheDebugKeyFileSchema = z.object({
  version: z.literal(1),
  owner: z.string(),
  // Debug entries stored under string keys, mirroring the shape of `cacheFileSchema`.
  entries: z.record(z.string(), cacheDebugKeyEntrySchema)
});
3424
3623
  //#endregion
3425
3624
  //#region ../shared/src/schemas/chart.ts
3426
3625
  /** Chart type rendered for a single eval history chart. */
@@ -3623,6 +3822,40 @@ const assertionFailureSchema = z.object({
3623
3822
  stack: z.string().optional()
3624
3823
  });
3625
3824
  const legacyAssertionFailureSchema = z.string().transform((message) => ({ message }));
3825
/** Severity level for one log captured during a case run. */
const runLogLevelSchema = z.enum([
  "log",
  "info",
  "warn",
  "error"
]);
/** Eval runner phase that emitted a captured case log. */
const runLogPhaseSchema = z.enum([
  "eval",
  "derive",
  "outputsSchema",
  "scorer"
]);
/** Schema for one persisted log entry captured during a case run. */
const runLogEntrySchema = z.object({
  /** ISO timestamp for when the log was captured. */
  timestamp: z.string(),
  /** Normalized log level (the `"warning"` alias is stored as `"warn"`). */
  level: runLogLevelSchema,
  /** Case-owned runner phase that emitted the log. */
  phase: runLogPhaseSchema,
  /** Human-readable preview formatted from the original log arguments. */
  message: z.string(),
  /** JSON-safe captured log arguments rendered in the UI. Defaults to `[]` when absent. */
  args: z.array(z.unknown()).default([]),
  /**
   * Whether `message` or any value in `args` was capped before persistence.
   * Defaults to `false` when absent.
   */
  truncated: z.boolean().default(false),
  /**
   * Optional source label for logs emitted from a nested case-owned activity,
   * such as a score key.
   */
  source: z.string().optional()
});
3626
3859
  /** Trace payload captured while computing one score for a case. */
3627
3860
  const scoreTraceSchema = z.object({
3628
3861
  trace: z.array(traceSpanSchema),
@@ -3651,6 +3884,8 @@ const caseDetailSchema = z.object({
3651
3884
  scoringTraces: z.record(z.string(), scoreTraceSchema).optional(),
3652
3885
  columns: z.record(z.string(), cellValueSchema),
3653
3886
  assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
3887
+ /** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
3888
+ logs: z.array(runLogEntrySchema).default([]),
3654
3889
  error: z.object({
3655
3890
  name: z.string().optional(),
3656
3891
  message: z.string(),
@@ -3802,6 +4037,14 @@ const apiCallsConfigSchema = z.object({
3802
4037
  /** Custom user-defined metrics surfaced on each API call. */
3803
4038
  metrics: z.array(apiCallMetricSchema).optional()
3804
4039
  });
4040
/** Schema for workspace-level run log capture options. */
const runLogsConfigSchema = z.object({
  /**
   * Capture `console.log`, `console.info`, `console.warn`, and
   * `console.error` calls made inside active eval case scopes. Defaults to
   * `true`; manual `evalLog(...)` calls are always captured.
   *
   * The field is optional in the schema itself; the `true` default comes from
   * the configuration defaults, not from this schema.
   */
  captureConsole: z.boolean().optional() });
3805
4048
  /** Default LLM-calls config the UI uses before the workspace fetch resolves. */
3806
4049
  const DEFAULT_LLM_CALLS_CONFIG = {
3807
4050
  kinds: ["llm"],
@@ -3917,6 +4160,7 @@ const agentEvalsConfigSchema = z.object({
3917
4160
  traceDisplay: traceDisplayInputConfigSchema.optional(),
3918
4161
  llmCalls: llmCallsConfigSchema.optional(),
3919
4162
  apiCalls: apiCallsConfigSchema.optional(),
4163
+ runLogs: runLogsConfigSchema.optional(),
3920
4164
  cache: z.object({
3921
4165
  enabled: z.boolean().optional(),
3922
4166
  dir: z.string().optional(),
@@ -4490,15 +4734,33 @@ const defaultMaxEntriesPerNamespace = 100;
4490
4734
  */
4491
4735
  function createFsCacheStore(options) {
4492
4736
  const cacheDir = resolve(options.workspaceRoot, options.dir ?? ".agent-evals/cache");
4737
+ const debugDir = resolve(options.workspaceRoot, options.debugDir ?? ".agent-evals/cache-debug");
4493
4738
  const defaultMaxEntries = normalizeMaxEntries(options.maxEntriesPerNamespace);
4494
4739
  return {
4495
4740
  dir() {
4496
4741
  return cacheDir;
4497
4742
  },
4743
+ debugDir() {
4744
+ return debugDir;
4745
+ },
4498
4746
  async lookup(namespace, keyHash) {
4499
4747
  return (await readCacheFile(cacheDir, ownerFromNamespace(namespace)))?.entries[keyHash] ?? null;
4500
4748
  },
4501
- async write(entry) {
4749
+ async lookupWithDebug(namespace, keyHash) {
4750
+ const owner = ownerFromNamespace(namespace);
4751
+ const entry = (await readCacheFile(cacheDir, owner))?.entries[keyHash] ?? null;
4752
+ if (entry === null) return null;
4753
+ const debugKey = (await readDebugKeyFile(debugDir, owner))?.entries[keyHash];
4754
+ const deserializedEntry = {
4755
+ ...entry,
4756
+ recording: deserializeCacheRecording(entry.recording)
4757
+ };
4758
+ return debugKey === void 0 ? deserializedEntry : {
4759
+ ...deserializedEntry,
4760
+ debugKey
4761
+ };
4762
+ },
4763
+ async write(entry, debugKey) {
4502
4764
  const owner = ownerFromNamespace(entry.namespace);
4503
4765
  const filePath = ownerPath(cacheDir, owner);
4504
4766
  await mkdir(cacheDir, { recursive: true });
@@ -4512,6 +4774,17 @@ function createFsCacheStore(options) {
4512
4774
  }, entry.namespace, maxEntriesForNamespace(entry.namespace, defaultMaxEntries, options.maxEntriesByNamespace), entry.key)
4513
4775
  });
4514
4776
  });
4777
+ if (debugKey !== void 0) {
4778
+ if ((await resultify(() => writeDebugKeyEntry({
4779
+ debugDir,
4780
+ entry,
4781
+ debugKey,
4782
+ maxEntries: maxEntriesForNamespace(entry.namespace, defaultMaxEntries, options.maxEntriesByNamespace)
4783
+ }))).error) await resultify(() => clearDebugEntries(debugDir, {
4784
+ namespace: entry.namespace,
4785
+ key: entry.key
4786
+ }));
4787
+ }
4515
4788
  },
4516
4789
  async list() {
4517
4790
  if (!existsSync(cacheDir)) return [];
@@ -4544,17 +4817,21 @@ function createFsCacheStore(options) {
4544
4817
  return items;
4545
4818
  },
4546
4819
  async clear(filter) {
4547
- if (!existsSync(cacheDir)) return;
4548
4820
  if (!filter || filter.namespace === void 0 && filter.key === void 0) {
4549
4821
  await rm(cacheDir, {
4550
4822
  recursive: true,
4551
4823
  force: true
4552
4824
  });
4825
+ await rm(debugDir, {
4826
+ recursive: true,
4827
+ force: true
4828
+ });
4553
4829
  return;
4554
4830
  }
4555
4831
  if (filter.namespace !== void 0) {
4556
4832
  const owner = ownerFromNamespace(filter.namespace);
4557
- await withCacheFileLock(ownerPath(cacheDir, owner), async () => {
4833
+ const filePath = ownerPath(cacheDir, owner);
4834
+ if (existsSync(cacheDir)) await withCacheFileLock(filePath, async () => {
4558
4835
  const cacheFile = await readCacheFile(cacheDir, owner);
4559
4836
  if (cacheFile === null) return;
4560
4837
  await writeOrRemoveCacheFile(cacheDir, {
@@ -4566,23 +4843,27 @@ function createFsCacheStore(options) {
4566
4843
  }))
4567
4844
  });
4568
4845
  });
4846
+ await clearDebugEntries(debugDir, filter);
4569
4847
  return;
4570
4848
  }
4571
- const files = await readdir(cacheDir);
4572
- for (const fileName of files) {
4573
- if (!fileName.endsWith(".json")) continue;
4574
- const filePath = join(cacheDir, fileName);
4575
- await withCacheFileLock(filePath, async () => {
4576
- const cacheFile = await readCacheFilePath(filePath);
4577
- if (cacheFile === null) return;
4578
- const entries = Object.fromEntries(Object.entries(cacheFile.entries).filter(([key]) => key !== filter.key));
4579
- await writeOrRemoveCacheFile(cacheDir, {
4580
- version: 1,
4581
- owner: cacheFile.owner,
4582
- entries
4849
+ if (existsSync(cacheDir)) {
4850
+ const files = await readdir(cacheDir);
4851
+ for (const fileName of files) {
4852
+ if (!fileName.endsWith(".json")) continue;
4853
+ const filePath = join(cacheDir, fileName);
4854
+ await withCacheFileLock(filePath, async () => {
4855
+ const cacheFile = await readCacheFilePath(filePath);
4856
+ if (cacheFile === null) return;
4857
+ const entries = Object.fromEntries(Object.entries(cacheFile.entries).filter(([key]) => key !== filter.key));
4858
+ await writeOrRemoveCacheFile(cacheDir, {
4859
+ version: 1,
4860
+ owner: cacheFile.owner,
4861
+ entries
4862
+ });
4583
4863
  });
4584
- });
4864
+ }
4585
4865
  }
4866
+ await clearDebugEntries(debugDir, filter);
4586
4867
  }
4587
4868
  };
4588
4869
  }
@@ -4598,18 +4879,21 @@ function createBufferedCacheStore(backingStore) {
4598
4879
  return {
4599
4880
  async lookup(namespace, keyHash) {
4600
4881
  const buffered = pendingEntries.get(toPendingKey(namespace, keyHash));
4601
- if (buffered !== void 0) return buffered;
4882
+ if (buffered !== void 0) return buffered.entry;
4602
4883
  return backingStore.lookup(namespace, keyHash);
4603
4884
  },
4604
- write(entry) {
4605
- pendingEntries.set(toPendingKey(entry.namespace, entry.key), entry);
4885
+ write(entry, debugKey) {
4886
+ pendingEntries.set(toPendingKey(entry.namespace, entry.key), {
4887
+ entry,
4888
+ debugKey
4889
+ });
4606
4890
  return Promise.resolve();
4607
4891
  },
4608
4892
  async commit() {
4609
- for (const entry of pendingEntries.values()) await backingStore.write(entry);
4893
+ for (const pending of pendingEntries.values()) await backingStore.write(pending.entry, pending.debugKey);
4610
4894
  },
4611
4895
  getPendingEntries() {
4612
- return [...pendingEntries.values()];
4896
+ return [...pendingEntries.values()].map((pending) => pending.entry);
4613
4897
  }
4614
4898
  };
4615
4899
  }
@@ -4661,6 +4945,94 @@ async function writeCacheFile(cacheDir, cacheFile) {
4661
4945
  await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
4662
4946
  await rename(tmpPath, filePath);
4663
4947
  }
4948
/** Read and validate the debug-key file that belongs to `owner` inside `debugDir`. */
async function readDebugKeyFile(debugDir, owner) {
  const debugFilePath = ownerPath(debugDir, owner);
  return readDebugKeyFilePath(debugFilePath);
}
4951
/**
 * Load a debug-key file from disk.
 *
 * @returns The validated file contents, or `null` when the file is missing,
 *   unreadable, not valid JSON, or does not match `cacheDebugKeyFileSchema`.
 */
async function readDebugKeyFilePath(filePath) {
  if (!existsSync(filePath)) return null;
  const readResult = await resultify(() => readFile(filePath, "utf-8"));
  if (readResult.error) return null;
  const decoded = safeJsonParse(readResult.value);
  if (decoded === null) return null;
  const validation = cacheDebugKeyFileSchema.safeParse(decoded);
  return validation.success ? validation.data : null;
}
4961
/**
 * Store the debug metadata for one cache entry in its owner's debug file.
 *
 * Runs under the per-file lock: merges the new entry into the existing ones,
 * prunes the entry's namespace down to `maxEntries` (the entry just written
 * is protected from pruning), and rewrites the file.
 */
async function writeDebugKeyEntry(params) {
  const owner = ownerFromNamespace(params.entry.namespace);
  const lockTarget = ownerPath(params.debugDir, owner);
  await mkdir(params.debugDir, { recursive: true });
  await withCacheFileLock(lockTarget, async () => {
    const existingFile = await readDebugKeyFile(params.debugDir, owner);
    const merged = {
      ...(existingFile?.entries ?? {}),
      [params.entry.key]: {
        version: 1,
        key: params.entry.key,
        namespace: params.entry.namespace,
        operationType: params.debugKey.operationType,
        operationName: params.debugKey.operationName,
        storedAt: params.entry.storedAt,
        codeFingerprint: params.debugKey.codeFingerprint,
        rawKey: params.debugKey.rawKey
      }
    };
    const pruned = pruneDebugKeyEntries(merged, params.entry.namespace, params.maxEntries, params.entry.key);
    await writeDebugKeyFile(params.debugDir, {
      version: 1,
      owner,
      entries: pruned
    });
  });
}
4988
/**
 * Remove debug entries matching `filter` from the debug directory.
 *
 * - With `filter.namespace`: only that namespace's owner file is rewritten.
 *   Entries are dropped by `filter.key` when a key is given, otherwise by
 *   namespace.
 * - Without a namespace: every `.json` file in the directory is rewritten
 *   with the entry stored under `filter.key` removed.
 *
 * Files left without entries are deleted. Does nothing when the directory
 * does not exist.
 */
async function clearDebugEntries(debugDir, filter) {
  if (!existsSync(debugDir)) return;
  if (filter.namespace === void 0) {
    for (const fileName of await readdir(debugDir)) {
      if (!fileName.endsWith(".json")) continue;
      const filePath = join(debugDir, fileName);
      await withCacheFileLock(filePath, async () => {
        const debugFile = await readDebugKeyFilePath(filePath);
        if (debugFile === null) return;
        const remaining = Object.entries(debugFile.entries).filter(([key]) => key !== filter.key);
        await writeOrRemoveDebugKeyFile(debugDir, {
          version: 1,
          owner: debugFile.owner,
          entries: Object.fromEntries(remaining)
        });
      });
    }
    return;
  }
  const owner = ownerFromNamespace(filter.namespace);
  const shouldKeep = ([key, entry]) => {
    if (filter.key !== void 0) return key !== filter.key;
    return entry.namespace !== filter.namespace;
  };
  await withCacheFileLock(ownerPath(debugDir, owner), async () => {
    const debugFile = await readDebugKeyFile(debugDir, owner);
    if (debugFile === null) return;
    const remaining = Object.entries(debugFile.entries).filter(shouldKeep);
    await writeOrRemoveDebugKeyFile(debugDir, {
      version: 1,
      owner,
      entries: Object.fromEntries(remaining)
    });
  });
}
5022
/**
 * Persist a debug-key file, or delete the owner's file when it no longer has
 * any entries.
 */
async function writeOrRemoveDebugKeyFile(debugDir, debugFile) {
  const hasEntries = Object.keys(debugFile.entries).length !== 0;
  if (hasEntries) {
    await writeDebugKeyFile(debugDir, debugFile);
  } else {
    await rm(ownerPath(debugDir, debugFile.owner), { force: true });
  }
}
5029
/**
 * Write a debug-key file by writing a process-specific temp file and renaming
 * it over the owner's file.
 *
 * @param debugDir Directory holding the debug files (created when missing).
 * @param debugFile File contents; `debugFile.owner` selects the target path.
 * @throws Rethrows any serialization or filesystem error after removing the
 *   temp file.
 */
async function writeDebugKeyFile(debugDir, debugFile) {
  await mkdir(debugDir, { recursive: true });
  const filePath = ownerPath(debugDir, debugFile.owner);
  const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
  try {
    await writeFile(tmpPath, JSON.stringify(debugFile, null, 2));
    await rename(tmpPath, filePath);
  } catch (error) {
    // Do not leave a stray temp file behind when the write or rename fails.
    // Cleanup itself is best-effort; the original error is what matters.
    await rm(tmpPath, { force: true }).catch(() => {});
    throw error;
  }
}
4664
5036
  function pruneEntries(entries, namespace, maxEntries, protectedKey) {
4665
5037
  const sorted = Object.values(entries).filter((entry) => entry.namespace === namespace).toSorted((a, b) => a.storedAt < b.storedAt ? 1 : -1);
4666
5038
  const kept = /* @__PURE__ */ new Map();
@@ -4672,6 +5044,17 @@ function pruneEntries(entries, namespace, maxEntries, protectedKey) {
4672
5044
  }
4673
5045
  return Object.fromEntries(Object.values(entries).filter((entry) => entry.namespace !== namespace || kept.has(entry.key)).toSorted((a, b) => a.key < b.key ? -1 : 1).map((entry) => [entry.key, entry]));
4674
5046
  }
5047
+ function pruneDebugKeyEntries(entries, namespace, maxEntries, protectedKey) {
5048
+ const sorted = Object.values(entries).filter((entry) => entry.namespace === namespace).toSorted((a, b) => a.storedAt < b.storedAt ? 1 : -1);
5049
+ const kept = /* @__PURE__ */ new Map();
5050
+ const protectedEntry = entries[protectedKey];
5051
+ if (protectedEntry?.namespace === namespace) kept.set(protectedEntry.key, protectedEntry);
5052
+ for (const entry of sorted) {
5053
+ if (kept.size >= maxEntries) break;
5054
+ kept.set(entry.key, entry);
5055
+ }
5056
+ return Object.fromEntries(Object.values(entries).filter((entry) => entry.namespace !== namespace || kept.has(entry.key)).toSorted((a, b) => a.key < b.key ? -1 : 1).map((entry) => [entry.key, entry]));
5057
+ }
4675
5058
  async function withCacheFileLock(filePath, fn) {
4676
5059
  const lockPath = `${filePath}.lock`;
4677
5060
  await acquireLock(lockPath);
@@ -4893,7 +5276,8 @@ const defaultConfig = {
4893
5276
  label: "Output",
4894
5277
  format: "json",
4895
5278
  placements: ["section"]
4896
- }] }
5279
+ }] },
5280
+ runLogs: { captureConsole: true }
4897
5281
  };
4898
5282
  async function loadConfig() {
4899
5283
  const configPath = resolve(process.cwd(), "agent-evals.config.ts");
@@ -5657,6 +6041,10 @@ async function runCase(params) {
5657
6041
  } : void 0
5658
6042
  });
5659
6043
  const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
6044
+ scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
6045
+ ...entry,
6046
+ source: key
6047
+ })));
5660
6048
  if (trace.length > 0) scoringTraces[key] = {
5661
6049
  trace,
5662
6050
  traceDisplay
@@ -5727,6 +6115,7 @@ async function runCase(params) {
5727
6115
  traceDisplay,
5728
6116
  columns,
5729
6117
  assertionFailures: scope.assertionFailures,
6118
+ logs: scope.logs,
5730
6119
  error: errorInfo,
5731
6120
  trial,
5732
6121
  cacheRefs: scope.caseCacheRefs
@@ -6142,4 +6531,4 @@ function toLastRunStatus(status) {
6142
6531
  return status === "pending" ? null : status;
6143
6532
  }
6144
6533
  //#endregion
6145
- export { caseRowSchema as $, appendToEvalOutput as $t, getEvalTitle as A, traceDisplayConfigSchema as At, apiCallMetricFormatSchema as B, fileRefSchema as Bt, createRunRequestSchema as C, serializedCacheSpanSchema as Ct, extractApiCalls as D, traceAttributeDisplayInputSchema as Dt, extractCacheHits as E, traceAttributeDisplayFormatSchema as Et, runManifestSchema as F, traceSpanWarningSchema as Ft, llmCallMetricPlacementSchema as G, z$1 as Gt, apiCallMetricSchema as H, numberDisplayOptionsSchema as Ht, runSummarySchema as I, cellValueSchema as It, resolveApiCallsConfig as J, evalSpan as Jt, llmCallMetricSchema as K, buildTraceTree as Kt, DEFAULT_API_CALLS_CONFIG as L, columnDefSchema as Lt, deriveScopedSummaryFromCases as M, traceSpanErrorSchema as Mt, deriveStatusFromCaseRows as N, traceSpanKindSchema as Nt, extractLlmCalls as O, traceAttributeDisplayPlacementSchema as Ot, deriveStatusFromChildStatuses as P, traceSpanSchema as Pt, caseDetailSchema as Q, EvalAssertionError as Qt, DEFAULT_LLM_CALLS_CONFIG as R, columnFormatSchema as Rt, createFsCacheStore as S, cacheStatusSchema as St, sseEnvelopeSchema as T, traceCacheRefSchema as Tt, apiCallsConfigSchema as U, repoFileRefSchema as Ut, apiCallMetricPlacementSchema as V, jsonCellSchema as Vt, llmCallMetricFormatSchema as W, runArtifactRefSchema as Wt, trialSelectionModeSchema as X, hashCacheKey as Xt, resolveLlmCallsConfig as Y, evalTracer as Yt, assertionFailureSchema as Z, hashCacheKeySync as Zt, loadEvalModule as _, cacheListItemSchema as _t, loadPersistedRunSnapshot as a, mergeEvalOutput as an, scoreTraceSchema as at, buildDeclaredColumnDefs as b, cacheRecordingOpSchema as bt, persistCaseDetail as c, runInEvalScope as cn, evalChartBuiltinMetricSchema as ct, recomputePersistedCaseStatus as d, setScopeCacheContext as dn, evalChartMetricSchema as dt, evalAssert as en, evalFreshnessStatusSchema as et, runTouchesEval as f, startEvalBackgroundJob as fn, evalChartTooltipExtraSchema as ft, setLatestRunInfoMap as g, 
cacheFileSchema as gt, getTargetEvalIds as h, getEvalRegistry as hn, cacheEntrySchema as ht, getLatestRunInfos as i, isInEvalScope as in, evalSummarySchema as it, getEvalDisplayStatus as j, traceDisplayInputConfigSchema as jt, getNestedAttribute as k, traceAttributeDisplaySchema as kt, persistRunState as l, runInExistingEvalScope as ln, evalChartColorSchema as lt, buildEvalSummary as m, defineEval as mn, evalChartsConfigSchema as mt, generateRunId as n, getEvalCaseInput as nn, evalStatItemSchema as nt, loadPersistedRunSnapshots as o, nextEvalId as on, evalChartAggregateSchema as ot, resolveArtifactPath as p, repoFile as pn, evalChartTypeSchema as pt, llmCallsConfigSchema as q, captureEvalSpanError as qt, getLastRunStatuses as r, incrementEvalOutput as rn, evalStatsConfigSchema as rt, nextShortIdFromSnapshots as s, runInEvalRuntimeScope as sn, evalChartAxisSchema as st, executeRun as t, getCurrentScope as tn, evalStatAggregateSchema as tt, recomputeEvalStatusesInRuns as u, setEvalOutput as un, evalChartConfigSchema as ut, parseEvalMetas as v, cacheModeSchema as vt, updateManualScoreRequestSchema as w, spanCacheOptionsSchema as wt, normalizeScoreDef as x, cacheRecordingSchema as xt, loadConfig as y, cacheOperationTypeSchema as yt, agentEvalsConfigSchema as z, columnKindSchema as zt };
6534
+ export { caseDetailSchema as $, buildTraceTree as $t, getEvalTitle as A, serializedCacheSpanSchema as At, apiCallMetricFormatSchema as B, traceSpanKindSchema as Bt, createRunRequestSchema as C, setEvalOutput as Cn, cacheFileSchema as Ct, extractApiCalls as D, defineEval as Dn, cacheRecordingOpSchema as Dt, extractCacheHits as E, repoFile as En, cacheOperationTypeSchema as Et, runManifestSchema as F, traceAttributeDisplayPlacementSchema as Ft, llmCallMetricPlacementSchema as G, columnFormatSchema as Gt, apiCallMetricSchema as H, traceSpanWarningSchema as Ht, runSummarySchema as I, traceAttributeDisplaySchema as It, resolveApiCallsConfig as J, jsonCellSchema as Jt, llmCallMetricSchema as K, columnKindSchema as Kt, DEFAULT_API_CALLS_CONFIG as L, traceDisplayConfigSchema as Lt, deriveScopedSummaryFromCases as M, traceCacheRefSchema as Mt, deriveStatusFromCaseRows as N, traceAttributeDisplayFormatSchema as Nt, extractLlmCalls as O, getEvalRegistry as On, cacheRecordingSchema as Ot, deriveStatusFromChildStatuses as P, traceAttributeDisplayInputSchema as Pt, assertionFailureSchema as Q, z$1 as Qt, DEFAULT_LLM_CALLS_CONFIG as R, traceDisplayInputConfigSchema as Rt, createFsCacheStore as S, runInExistingEvalScope as Sn, cacheEntryWithDebugKeySchema as St, sseEnvelopeSchema as T, startEvalBackgroundJob as Tn, cacheModeSchema as Tt, apiCallsConfigSchema as U, cellValueSchema as Ut, apiCallMetricPlacementSchema as V, traceSpanSchema as Vt, llmCallMetricFormatSchema as W, columnDefSchema as Wt, runLogsConfigSchema as X, repoFileRefSchema as Xt, resolveLlmCallsConfig as Y, numberDisplayOptionsSchema as Yt, trialSelectionModeSchema as Z, runArtifactRefSchema as Zt, loadEvalModule as _, isInEvalScope as _n, evalChartTypeSchema as _t, loadPersistedRunSnapshot as a, deserializeCacheRecording as an, evalSummarySchema as at, buildDeclaredColumnDefs as b, runInEvalRuntimeScope as bn, cacheDebugKeyFileSchema as bt, persistCaseDetail as c, serializeCacheValue as cn, runLogPhaseSchema 
as ct, recomputePersistedCaseStatus as d, configureEvalRunLogs as dn, evalChartAxisSchema as dt, captureEvalSpanError as en, caseRowSchema as et, runTouchesEval as f, evalAssert as fn, evalChartBuiltinMetricSchema as ft, setLatestRunInfoMap as g, incrementEvalOutput as gn, evalChartTooltipExtraSchema as gt, getTargetEvalIds as h, getEvalCaseInput as hn, evalChartMetricSchema as ht, getLatestRunInfos as i, hashCacheKeySync as in, evalStatsConfigSchema as it, getEvalDisplayStatus as j, spanCacheOptionsSchema as jt, getNestedAttribute as k, cacheStatusSchema as kt, persistRunState as l, EvalAssertionError as ln, scoreTraceSchema as lt, buildEvalSummary as m, getCurrentScope as mn, evalChartConfigSchema as mt, generateRunId as n, evalTracer as nn, evalStatAggregateSchema as nt, loadPersistedRunSnapshots as o, deserializeCacheValue as on, runLogEntrySchema as ot, resolveArtifactPath as p, evalLog as pn, evalChartColorSchema as pt, llmCallsConfigSchema as q, fileRefSchema as qt, getLastRunStatuses as r, hashCacheKey as rn, evalStatItemSchema as rt, nextShortIdFromSnapshots as s, serializeCacheRecording as sn, runLogLevelSchema as st, executeRun as t, evalSpan as tn, evalFreshnessStatusSchema as tt, recomputeEvalStatusesInRuns as u, appendToEvalOutput as un, evalChartAggregateSchema as ut, parseEvalMetas as v, mergeEvalOutput as vn, evalChartsConfigSchema as vt, updateManualScoreRequestSchema as w, setScopeCacheContext as wn, cacheListItemSchema as wt, normalizeScoreDef as x, runInEvalScope as xn, cacheEntrySchema as xt, loadConfig as y, nextEvalId as yn, cacheDebugKeyEntrySchema as yt, agentEvalsConfigSchema as z, traceSpanErrorSchema as zt };