@ls-stack/agent-eval 0.58.0 → 0.58.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-C4kAOhC1.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Cf37PZKi.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-303BocMW.mjs";
4
- export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
1
+ import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-pHJ0_TzH.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-HBwXIJsg.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-AeXGBJ26.mjs";
4
+ export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-C4kAOhC1.mjs";
2
- import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-5xEiQxiS.mjs";
1
+ import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-pHJ0_TzH.mjs";
2
+ import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-ngVXShH4.mjs";
3
3
  import { z } from "zod/v4";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { relative } from "node:path";
@@ -851,6 +851,7 @@ const runLogLevelSchema = z.enum([
851
851
  const runLogPhaseSchema = z.enum([
852
852
  "eval",
853
853
  "derive",
854
+ "tracingAssertions",
854
855
  "outputsSchema",
855
856
  "scorer"
856
857
  ]);
@@ -1008,6 +1009,9 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
1008
1009
  const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
1009
1010
  /** Schema for keyed or object-returning trace-derived output config. */
1010
1011
  const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
1012
+ const evalTracingAssertionsFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracing assertions function" });
1013
+ /** Schema for function or keyed trace-derived assertion config. */
1014
+ const evalTracingAssertionsConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" }), z.record(z.string().min(1), evalTracingAssertionsFnSchema)]);
1011
1015
  /** Schema for UI overrides on derived or scored columns. */
1012
1016
  const evalColumnOverrideSchema = z.object({
1013
1017
  label: z.string().optional(),
@@ -1411,6 +1415,7 @@ const agentEvalsConfigSchema = z.object({
1411
1415
  traceDisplay: traceDisplayInputConfigSchema.optional(),
1412
1416
  columns: evalColumnsSchema.optional(),
1413
1417
  deriveFromTracing: evalDeriveConfigSchema.optional(),
1418
+ tracingAssertions: evalTracingAssertionsConfigSchema.optional(),
1414
1419
  stats: evalStatsConfigSchema.optional(),
1415
1420
  defaultStatAggregate: evalStatAggregateSchema.optional(),
1416
1421
  llmCalls: llmCallsConfigSchema.optional(),
@@ -1847,8 +1852,9 @@ function deriveScopedSummaryFromCases(params) {
1847
1852
  * freshness state.
1848
1853
  */
1849
1854
  function getEvalDisplayStatus(params) {
1850
- const { stale, outdated, lastRunStatus, isRunning = false } = params;
1855
+ const { stale, outdated, lastRunStatus, isRunning = false, isEnqueued = false } = params;
1851
1856
  if (isRunning || lastRunStatus === "running") return "running";
1857
+ if (isEnqueued) return "enqueued";
1852
1858
  if (lastRunStatus === "pass") {
1853
1859
  if (stale) return "stale";
1854
1860
  if (outdated) return "outdated";
@@ -2651,6 +2657,7 @@ const scopeStorage = new AsyncLocalStorage();
2651
2657
  const runtimeScopeStorage = new AsyncLocalStorage();
2652
2658
  const evalClockStorage = new AsyncLocalStorage();
2653
2659
  const activeSpanStackStorage = new AsyncLocalStorage();
2660
+ const recordingStackStorage = new AsyncLocalStorage();
2654
2661
  let activeEvalScopeCount = 0;
2655
2662
  let activeEvalRuntimeScopeCount = 0;
2656
2663
  let consoleCaptureEnabled = true;
@@ -2717,6 +2724,17 @@ var EvalAssertionError = class extends Error {
2717
2724
  this.name = "EvalAssertionError";
2718
2725
  }
2719
2726
  };
2727
+ /** Error thrown when an SDK helper is used in an unsupported runner phase. */
2728
+ var EvalRuntimeUsageError = class extends Error {
2729
+ constructor(message) {
2730
+ super(message);
2731
+ this.name = "EvalRuntimeUsageError";
2732
+ }
2733
+ };
2734
+ /** Throw when assertion helpers are used in a runner phase that forbids them. */
2735
+ function assertEvalAssertionsAllowed(apiName) {
2736
+ if (getCurrentScope() && runtimeScopeStorage.getStore() === "derive") throw new EvalRuntimeUsageError(`${apiName} cannot be used inside deriveFromTracing. Use tracingAssertions for trace-derived assertions.`);
2737
+ }
2720
2738
  function getEvalClockStateNowMs(state) {
2721
2739
  const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
2722
2740
  return state.startMs + elapsedMs + state.offsetMs;
@@ -2803,13 +2821,29 @@ async function runWithActiveSpan(span, fn) {
2803
2821
  const currentStack = activeSpanStackStorage.getStore() ?? [];
2804
2822
  return await activeSpanStackStorage.run([...currentStack, span], fn);
2805
2823
  }
2824
+ /** Execute a callback with a cache recording frame scoped to this async branch. */
2825
+ async function runWithCacheRecordingFrame(frame, fn) {
2826
+ const currentStack = recordingStackStorage.getStore() ?? [];
2827
+ return await recordingStackStorage.run([...currentStack, frame], fn);
2828
+ }
2829
+ function getCurrentCacheRecordingFrame(scope) {
2830
+ if (scope.replayingDepth > 0) return void 0;
2831
+ return recordingStackStorage.getStore()?.at(-1);
2832
+ }
2833
+ /** Mark a span as created by the active cache recorder, when one exists. */
2834
+ function recordSpanForActiveCacheRecording(scope, spanId) {
2835
+ if (scope.replayingDepth > 0) return;
2836
+ for (const frame of recordingStackStorage.getStore() ?? []) frame.spanIds.add(spanId);
2837
+ }
2806
2838
  /**
2807
2839
  * Return the current eval runner phase for this async execution.
2808
2840
  *
2809
2841
  * Returns `null` outside eval-owned work, `env` while the runner is loading
2810
2842
  * eval modules for a run, `cases` while generating cases, `eval` while running
2811
- * case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
2812
- * while validating outputs, and `scorer` while computing scores.
2843
+ * case `execute`, `derive` while deriving outputs from traces,
2844
+ * `tracingAssertions` while checking trace-derived assertions,
2845
+ * `outputsSchema` while validating outputs, and `scorer` while computing
2846
+ * scores.
2813
2847
  */
2814
2848
  function isInEvalScope() {
2815
2849
  if (activeEvalRuntimeScopeCount === 0) return null;
@@ -2830,7 +2864,7 @@ function normalizeLogLevel(level) {
2830
2864
  }
2831
2865
  function getCurrentLogPhase() {
2832
2866
  const runtimeScope = runtimeScopeStorage.getStore();
2833
- if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
2867
+ if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "tracingAssertions" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
2834
2868
  return null;
2835
2869
  }
2836
2870
  function formatLogArgs(args) {
@@ -3110,7 +3144,6 @@ async function runInEvalScope(caseId, fn, options = {}) {
3110
3144
  logs: [],
3111
3145
  spans: [],
3112
3146
  checkpoints: /* @__PURE__ */ new Map(),
3113
- recordingStack: [],
3114
3147
  replayingDepth: 0,
3115
3148
  cacheContext: options.cacheContext,
3116
3149
  caseCacheRefs: [],
@@ -3150,10 +3183,16 @@ function nextEvalId() {
3150
3183
  scope.nextEvalIdCounter++;
3151
3184
  return `${scope.idPrefix}-${scope.nextEvalIdCounter}`;
3152
3185
  }
3153
- function recordOpIfActive(scope, op) {
3154
- if (scope.replayingDepth > 0) return;
3155
- const top = scope.recordingStack.at(-1);
3156
- if (top) top.ops.push(op);
3186
+ function recordCacheRecordingOpIfActive(scope, op) {
3187
+ getCurrentCacheRecordingFrame(scope)?.ops.push(op);
3188
+ }
3189
+ function recordCacheRecordingAttributesIfActive(scope, span, attributes) {
3190
+ const frames = recordingStackStorage.getStore();
3191
+ if (scope.replayingDepth > 0 || frames === void 0) return;
3192
+ for (const [key, value] of Object.entries(attributes)) {
3193
+ if (key.startsWith("cache.")) continue;
3194
+ for (const frame of frames) if (span.id === frame.replayParentSpanId) frame.finalAttributes[key] = value;
3195
+ }
3157
3196
  }
3158
3197
  function normalizeEvalOutputOptions(options) {
3159
3198
  if (options === void 0) return void 0;
@@ -3185,7 +3224,7 @@ function setEvalOutput(key, value, options = void 0) {
3185
3224
  scope.outputs[key] = value;
3186
3225
  const column = normalizeEvalOutputOptions(options);
3187
3226
  if (column !== void 0) scope.outputColumnOverrides[key] = column;
3188
- recordOpIfActive(scope, {
3227
+ recordCacheRecordingOpIfActive(scope, {
3189
3228
  kind: "setOutput",
3190
3229
  key,
3191
3230
  value,
@@ -3205,7 +3244,7 @@ function appendToEvalOutput(key, value) {
3205
3244
  if (existing === void 0) scope.outputs[key] = [value];
3206
3245
  else if (Array.isArray(existing)) scope.outputs[key] = [...copyArray$1(existing), value];
3207
3246
  else scope.outputs[key] = [existing, value];
3208
- recordOpIfActive(scope, {
3247
+ recordCacheRecordingOpIfActive(scope, {
3209
3248
  kind: "appendOutput",
3210
3249
  key,
3211
3250
  value
@@ -3223,7 +3262,7 @@ function mergeEvalOutput(key, patch) {
3223
3262
  const existing = scope.outputs[key];
3224
3263
  if (existing === void 0) {
3225
3264
  scope.outputs[key] = { ...patch };
3226
- recordOpIfActive(scope, {
3265
+ recordCacheRecordingOpIfActive(scope, {
3227
3266
  kind: "mergeOutput",
3228
3267
  key,
3229
3268
  patch
@@ -3238,7 +3277,7 @@ function mergeEvalOutput(key, patch) {
3238
3277
  ...existing,
3239
3278
  ...patch
3240
3279
  };
3241
- recordOpIfActive(scope, {
3280
+ recordCacheRecordingOpIfActive(scope, {
3242
3281
  kind: "mergeOutput",
3243
3282
  key,
3244
3283
  patch
@@ -3256,7 +3295,7 @@ function incrementEvalOutput(key, delta) {
3256
3295
  const existing = scope.outputs[key];
3257
3296
  if (existing === void 0) {
3258
3297
  scope.outputs[key] = delta;
3259
- recordOpIfActive(scope, {
3298
+ recordCacheRecordingOpIfActive(scope, {
3260
3299
  kind: "incrementOutput",
3261
3300
  key,
3262
3301
  delta
@@ -3268,7 +3307,7 @@ function incrementEvalOutput(key, delta) {
3268
3307
  return;
3269
3308
  }
3270
3309
  scope.outputs[key] = existing + delta;
3271
- recordOpIfActive(scope, {
3310
+ recordCacheRecordingOpIfActive(scope, {
3272
3311
  kind: "incrementOutput",
3273
3312
  key,
3274
3313
  delta
@@ -3280,10 +3319,12 @@ function incrementEvalOutput(key, delta) {
3280
3319
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
3281
3320
  * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
3282
3321
  * TypeScript assertion signature still narrows the checked value after the
3283
- * call.
3322
+ * call. Calls inside `deriveFromTracing` throw because derivations must only
3323
+ * write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
3284
3324
  */
3285
3325
  function evalAssert(condition, message) {
3286
3326
  const scope = getCurrentScope();
3327
+ assertEvalAssertionsAllowed("evalAssert(...)");
3287
3328
  if (condition) {
3288
3329
  if (scope) scope.assertions.push({
3289
3330
  message,
@@ -3434,6 +3475,7 @@ var EvalExpectationImpl = class EvalExpectationImpl {
3434
3475
  * case scope is active, matching `evalAssert(...)`.
3435
3476
  */
3436
3477
  function evalExpect(value) {
3478
+ assertEvalAssertionsAllowed("evalExpect(...)");
3437
3479
  return new EvalExpectationImpl(value, false);
3438
3480
  }
3439
3481
  //#endregion
@@ -3675,10 +3717,6 @@ async function materializeExternalJsonValues(value, store) {
3675
3717
  if (!isRecordLike$3(value)) return value;
3676
3718
  return Object.fromEntries(await Promise.all(Object.entries(value).map(async ([key, entryValue]) => [key, await materializeExternalJsonValues(entryValue, store)])));
3677
3719
  }
3678
- /** Clone one value through the same serialization path used for cache data. */
3679
- async function cloneCacheValue(value, options = void 0) {
3680
- return deserializeCacheValue(await serializeCacheValue(value, options));
3681
- }
3682
3720
  function normalizeCacheSerializationOptions(options) {
3683
3721
  return {
3684
3722
  compress: options?.compress !== false,
@@ -4109,29 +4147,6 @@ function valueKind$1(value) {
4109
4147
  function copyArray(value) {
4110
4148
  return value.map((item) => item);
4111
4149
  }
4112
- function stripCacheAttributes(attributes) {
4113
- if (!attributes) return {};
4114
- const result = {};
4115
- for (const [key, value] of Object.entries(attributes)) if (!key.startsWith("cache.")) result[key] = value;
4116
- return result;
4117
- }
4118
- async function snapshotNonCacheAttributes(span) {
4119
- const snapshot = await cloneCacheValue(stripCacheAttributes(span?.attributes));
4120
- return isRecordLike$2(snapshot) ? snapshot : {};
4121
- }
4122
- function diffNonCacheAttributes(before, after) {
4123
- const result = {};
4124
- for (const [key, value] of Object.entries(after)) if (!cacheAttributeValuesEqual(before[key], value)) result[key] = value;
4125
- return result;
4126
- }
4127
- function cacheAttributeValuesEqual(left, right) {
4128
- if (Object.is(left, right)) return true;
4129
- try {
4130
- return JSON.stringify(left) === JSON.stringify(right);
4131
- } catch {
4132
- return false;
4133
- }
4134
- }
4135
4150
  function appendCacheRef(span, ref) {
4136
4151
  if (span === void 0) return;
4137
4152
  const existing = span.attributes?.["cache.refs"];
@@ -4150,7 +4165,7 @@ function recordCacheRef(scope, span, ref) {
4150
4165
  }
4151
4166
  scope.caseCacheRefs.push(ref);
4152
4167
  }
4153
- function serializeSubSpanTree(scope, spanId) {
4168
+ function serializeSubSpanTree(scope, spanId, spanIds) {
4154
4169
  const original = scope.spans.find((s) => s.id === spanId);
4155
4170
  if (!original) return {
4156
4171
  kind: "custom",
@@ -4163,7 +4178,7 @@ function serializeSubSpanTree(scope, spanId) {
4163
4178
  warnings: void 0,
4164
4179
  children: []
4165
4180
  };
4166
- const children = scope.spans.filter((s) => s.parentId === spanId).map((child) => serializeSubSpanTree(scope, child.id));
4181
+ const children = scope.spans.filter((s) => s.parentId === spanId && spanIds.has(s.id)).map((child) => serializeSubSpanTree(scope, child.id, spanIds));
4167
4182
  return {
4168
4183
  kind: original.kind,
4169
4184
  name: original.name,
@@ -4179,9 +4194,9 @@ function serializeSubSpanTree(scope, spanId) {
4179
4194
  function appendSubSpanOps(scope, frame) {
4180
4195
  for (let i = frame.baseSpanIndex; i < scope.spans.length; i++) {
4181
4196
  const candidate = scope.spans[i];
4182
- if (candidate?.parentId === frame.replayParentSpanId) frame.ops.push({
4197
+ if (candidate?.parentId === frame.replayParentSpanId && frame.spanIds.has(candidate.id)) frame.ops.push({
4183
4198
  kind: "subSpan",
4184
- span: serializeSubSpanTree(scope, candidate.id)
4199
+ span: serializeSubSpanTree(scope, candidate.id, frame.spanIds)
4185
4200
  });
4186
4201
  }
4187
4202
  }
@@ -4437,25 +4452,21 @@ function createTraceCache(generateSpanId) {
4437
4452
  key: keyHash,
4438
4453
  status: "bypass"
4439
4454
  });
4440
- const beforeAttributes = await snapshotNonCacheAttributes(activeSpan);
4441
4455
  const frame = {
4442
4456
  baseSpanIndex: scope.spans.length,
4443
4457
  replayParentSpanId: activeSpan?.id ?? null,
4458
+ spanIds: /* @__PURE__ */ new Set(),
4459
+ finalAttributes: {},
4444
4460
  ops: []
4445
4461
  };
4446
- scope.recordingStack.push(frame);
4447
- let bodyResult;
4448
- try {
4449
- bodyResult = await fn();
4450
- } finally {
4451
- scope.recordingStack.pop();
4452
- }
4462
+ const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
4463
+ return await fn();
4464
+ });
4453
4465
  appendSubSpanOps(scope, frame);
4454
4466
  if (canStore) {
4455
- const finalAttributes = diffNonCacheAttributes(beforeAttributes, await snapshotNonCacheAttributes(activeSpan));
4456
4467
  const recording = {
4457
4468
  returnValue: bodyResult,
4458
- finalAttributes,
4469
+ finalAttributes: frame.finalAttributes,
4459
4470
  ops: frame.ops
4460
4471
  };
4461
4472
  await cacheCtx.adapter.write({
@@ -4514,6 +4525,13 @@ function mergeSpanAttributes(span, attributes) {
4514
4525
  ...span.attributes,
4515
4526
  ...attributes
4516
4527
  };
4528
+ const scope = getCurrentScope();
4529
+ if (scope !== void 0) recordCacheRecordingAttributesIfActive(scope, span, attributes);
4530
+ }
4531
+ function copyNonCacheAttributes(attributes) {
4532
+ const result = {};
4533
+ for (const [key, value] of Object.entries(attributes ?? {})) if (!key.startsWith("cache.")) result[key] = value;
4534
+ return result;
4517
4535
  }
4518
4536
  function isRecordLike$1(value) {
4519
4537
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -4688,6 +4706,7 @@ function startExternalSpan(info) {
4688
4706
  status: "running",
4689
4707
  attributes: info.attributes
4690
4708
  });
4709
+ recordSpanForActiveCacheRecording(scope, id);
4691
4710
  return createExternalSpanHandle(id);
4692
4711
  }
4693
4712
  function updateExternalSpan(info) {
@@ -4746,6 +4765,7 @@ function recordExternalSpan(info) {
4746
4765
  warning: info.warning,
4747
4766
  warnings: info.warnings
4748
4767
  });
4768
+ recordSpanForActiveCacheRecording(scope, id);
4749
4769
  return id;
4750
4770
  }
4751
4771
  /**
@@ -4831,6 +4851,7 @@ async function traceSpanInternal(info, fn) {
4831
4851
  attributes: info.attributes
4832
4852
  };
4833
4853
  scope.spans.push(spanRecord);
4854
+ recordSpanForActiveCacheRecording(scope, id);
4834
4855
  const activeSpan = createSpanHandle(spanRecord);
4835
4856
  return await runWithActiveSpan(spanRecord, async () => {
4836
4857
  try {
@@ -4880,21 +4901,19 @@ async function traceSpanInternal(info, fn) {
4880
4901
  const frame = {
4881
4902
  baseSpanIndex: scope.spans.length,
4882
4903
  replayParentSpanId: id,
4904
+ spanIds: /* @__PURE__ */ new Set(),
4905
+ finalAttributes: copyNonCacheAttributes(spanRecord.attributes),
4883
4906
  ops: []
4884
4907
  };
4885
- scope.recordingStack.push(frame);
4886
- let bodyResult;
4887
- try {
4888
- bodyResult = await fn(activeSpan);
4889
- } finally {
4890
- scope.recordingStack.pop();
4891
- }
4908
+ const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
4909
+ return await fn(activeSpan);
4910
+ });
4892
4911
  appendSubSpanOps(scope, frame);
4893
4912
  finishSpanWithoutThrownError(spanRecord, realStartedAt);
4894
4913
  if (canStore) {
4895
4914
  const recording = {
4896
4915
  returnValue: bodyResult,
4897
- finalAttributes: stripCacheAttributes(spanRecord.attributes),
4916
+ finalAttributes: frame.finalAttributes,
4898
4917
  finalStatus: spanRecord.status,
4899
4918
  finalError: spanRecord.error,
4900
4919
  finalErrors: spanRecord.errors,
@@ -4998,37 +5017,63 @@ const evalTracer = {
4998
5017
  status: "ok",
4999
5018
  attributes: { value: data }
5000
5019
  });
5001
- if (scope.replayingDepth === 0) {
5002
- const top = scope.recordingStack.at(-1);
5003
- if (top) top.ops.push({
5004
- kind: "checkpoint",
5005
- name,
5006
- data
5007
- });
5008
- }
5020
+ recordSpanForActiveCacheRecording(scope, id);
5021
+ recordCacheRecordingOpIfActive(scope, {
5022
+ kind: "checkpoint",
5023
+ name,
5024
+ data
5025
+ });
5009
5026
  }
5010
5027
  };
5011
5028
  /** Build a queryable trace tree helper from a flat span list and checkpoints. */
5012
5029
  function buildTraceTree(spans, checkpoints) {
5030
+ const rootSpans = spans.filter((s) => s.parentId === null);
5031
+ const flattenDfs = () => {
5032
+ const result = [];
5033
+ function visit(parentId) {
5034
+ for (const childSpan of spans) if (childSpan.parentId === parentId) {
5035
+ result.push(childSpan);
5036
+ visit(childSpan.id);
5037
+ }
5038
+ }
5039
+ visit(null);
5040
+ return result;
5041
+ };
5042
+ const filterSpanNames = (sourceSpans, kind) => {
5043
+ return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
5044
+ };
5013
5045
  return {
5014
5046
  spans,
5015
- rootSpans: spans.filter((s) => s.parentId === null),
5047
+ rootSpans,
5016
5048
  findSpan(name) {
5017
5049
  return spans.find((s) => s.name === name);
5018
5050
  },
5051
+ findSpans(name) {
5052
+ return spans.filter((s) => s.name === name);
5053
+ },
5054
+ hasSpan(name) {
5055
+ return spans.some((s) => s.name === name);
5056
+ },
5019
5057
  findSpansByKind(kind) {
5020
5058
  return spans.filter((s) => s.kind === kind);
5021
5059
  },
5060
+ findToolCallSpans() {
5061
+ return spans.filter((s) => s.kind === "tool");
5062
+ },
5063
+ listToolCallSpanNames() {
5064
+ return filterSpanNames(spans, "tool");
5065
+ },
5066
+ hasToolCallSpan(name) {
5067
+ return spans.some((s) => s.kind === "tool" && s.name === name);
5068
+ },
5069
+ listSpanNames(kind) {
5070
+ return filterSpanNames(spans, kind);
5071
+ },
5072
+ listSpanNamesDfs(kind) {
5073
+ return filterSpanNames(flattenDfs(), kind);
5074
+ },
5022
5075
  flattenDfs() {
5023
- const result = [];
5024
- function visit(parentId) {
5025
- for (const childSpan of spans) if (childSpan.parentId === parentId) {
5026
- result.push(childSpan);
5027
- visit(childSpan.id);
5028
- }
5029
- }
5030
- visit(null);
5031
- return result;
5076
+ return flattenDfs();
5032
5077
  },
5033
5078
  checkpoints
5034
5079
  };
@@ -6670,7 +6715,7 @@ async function resolveDeriveFromTracingConfig(params) {
6670
6715
  return derived;
6671
6716
  }
6672
6717
  async function runDeriveFromTracingConfig(params) {
6673
- if (params.deriveFromTracing === void 0) return;
6718
+ if (params.deriveFromTracing === void 0) return null;
6674
6719
  const { deriveFromTracing } = params;
6675
6720
  try {
6676
6721
  const derived = await runInExistingEvalScope(params.scope, "derive", async () => await resolveDeriveFromTracingConfig({
@@ -6682,13 +6727,53 @@ async function runDeriveFromTracingConfig(params) {
6682
6727
  outputs: params.scope.outputs,
6683
6728
  derived
6684
6729
  });
6730
+ return null;
6685
6731
  } catch (e) {
6732
+ if (e instanceof EvalRuntimeUsageError) return e;
6686
6733
  const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
6687
6734
  recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
6735
+ return null;
6688
6736
  }
6689
6737
  }
6738
+ async function runOneTracingAssertion(params) {
6739
+ const { label, tracingAssertion, scope, traceTree, evalCase } = params;
6740
+ const failureCountBefore = scope.assertionFailures.length;
6741
+ const ctx = {
6742
+ trace: traceTree,
6743
+ input: evalCase.input,
6744
+ case: evalCase
6745
+ };
6746
+ try {
6747
+ await runInExistingEvalScope(scope, "tracingAssertions", async () => {
6748
+ await callUnknownFunction(tracingAssertion, [ctx]);
6749
+ });
6750
+ } catch (e) {
6751
+ if (e instanceof EvalAssertionError && scope.assertionFailures.length > failureCountBefore) return;
6752
+ recordAssertionFailure(scope, toAssertionFailure(`${label} threw: ${e instanceof Error ? e.message : String(e)}`, e instanceof Error ? e : void 0));
6753
+ }
6754
+ }
6755
+ async function runTracingAssertionsConfig(params) {
6756
+ if (params.tracingAssertions === void 0) return;
6757
+ if (typeof params.tracingAssertions === "function") {
6758
+ await runOneTracingAssertion({
6759
+ label: "tracingAssertions",
6760
+ tracingAssertion: params.tracingAssertions,
6761
+ scope: params.scope,
6762
+ traceTree: params.traceTree,
6763
+ evalCase: params.evalCase
6764
+ });
6765
+ return;
6766
+ }
6767
+ for (const [key, tracingAssertion] of Object.entries(params.tracingAssertions)) await runOneTracingAssertion({
6768
+ label: `tracingAssertions "${key}"`,
6769
+ tracingAssertion,
6770
+ scope: params.scope,
6771
+ traceTree: params.traceTree,
6772
+ evalCase: params.evalCase
6773
+ });
6774
+ }
6690
6775
  async function runCase(params) {
6691
- const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6776
+ const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, globalTracingAssertions, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6692
6777
  const scopedIdPrefix = buildScopedEvalIdPrefix({
6693
6778
  evalId,
6694
6779
  evalFilePath,
@@ -6733,22 +6818,36 @@ async function runCase(params) {
6733
6818
  apiCallsConfig
6734
6819
  });
6735
6820
  const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
6736
- const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
6821
+ let nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
6737
6822
  if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
6738
6823
  if (!nonAssertError) {
6739
- await runDeriveFromTracingConfig({
6824
+ nonAssertError = await runDeriveFromTracingConfig({
6740
6825
  deriveFromTracing: globalDeriveFromTracing,
6741
6826
  scope,
6742
6827
  traceTree,
6743
6828
  evalCase
6744
6829
  });
6745
- await runDeriveFromTracingConfig({
6830
+ if (!nonAssertError) nonAssertError = await runDeriveFromTracingConfig({
6746
6831
  deriveFromTracing: evalDef.deriveFromTracing,
6747
6832
  scope,
6748
6833
  traceTree,
6749
6834
  evalCase
6750
6835
  });
6751
6836
  }
6837
+ if (!nonAssertError) {
6838
+ await runTracingAssertionsConfig({
6839
+ tracingAssertions: globalTracingAssertions,
6840
+ scope,
6841
+ traceTree,
6842
+ evalCase
6843
+ });
6844
+ await runTracingAssertionsConfig({
6845
+ tracingAssertions: evalDef.tracingAssertions,
6846
+ scope,
6847
+ traceTree,
6848
+ evalCase
6849
+ });
6850
+ }
6752
6851
  if (!nonAssertError) addDefaultOutputs({
6753
6852
  outputs: scope.outputs,
6754
6853
  spans: spansWithDerivedAttributes,
@@ -6938,4 +7037,4 @@ function recordAssertionFailure(scope, failure) {
6938
7037
  });
6939
7038
  }
6940
7039
  //#endregion
6941
- export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };
7040
+ export { setScopeCacheContext as $, repoFile as A, evalStatsConfigSchema as At, evalTime as B, evalTracer as C, resolveLlmCallsConfig as Ct, deserializeCacheValue as D, caseDetailSchema as Dt, deserializeCacheRecording as E, getCaseRowCaseKey as Et, EvalRuntimeUsageError as F, getEvalRegistry as Ft, matchesEvalTags as G, getEvalCaseInput as H, appendToEvalOutput as I, runWithEvalRegistry as It, runInEvalRuntimeScope as J, mergeEvalOutput as K, configureEvalRunLogs as L, readManualInputFile as M, evalChartsConfigSchema as Mt, evalExpect as N, columnDefSchema as Nt, serializeCacheRecording as O, caseRowSchema as Ot, EvalAssertionError as P, defineEval as Pt, setEvalOutput as Q, evalAssert as R, evalSpan as S, resolveApiCallsConfig as St, hashCacheKeySync as T, buildEvalKey as Tt, incrementEvalOutput as U, getCurrentScope as V, isInEvalScope as W, runInExistingEvalScope as X, runInEvalScope as Y, runWithEvalClock as Z, createBufferedCacheStore as _, matchesTagsFilter as _t, isCaseChildParentMessage as a, extractApiCalls as at, buildTraceTree as b, runManifestSchema as bt, resolveArtifactPath as c, simulateTokenAllocation as ct, loadEvalModule as d, getEvalTitle as dt, startEvalBackgroundJob as et, resolveEvalDefaultConfig as f, getEvalDisplayStatus as ft, commitPendingCacheWrites as g, dedupeEvalTags as gt, normalizeScoreDef as h, deriveStatusFromChildStatuses as ht, isCaseChildMessage as i, extractCacheHits as it, manualInputFileValueSchema as j, manualInputDescriptorSchema as jt, serializeCacheValue as k, evalStatAggregateSchema as kt, registerAgentEvalsPackageResolutionHooks as l, applyDerivedCallAttributes as lt, buildDeclaredColumnDefs as m, deriveStatusFromCaseRows as mt, resolveRunnableEvalCases as n, updateManualScoreRequestSchema as nt, stripTerminalControlCodes as o, extractLlmCalls as ot, loadConfig as p, deriveScopedSummaryFromCases as pt, nextEvalId as q, runCase as r, extractCacheEntries as rt, resolveTracePresentation as s, simulateLlmCallCost as st, filterEvalCases as t, createRunRequestSchema as tt, runWithModuleIsolation as u, getNestedAttribute as ut, createFsCacheStore as v, validateEvalTagName as vt, hashCacheKey as w, buildCaseKey as wt, captureEvalSpanError as x, runSummarySchema as xt, z$1 as y, validateTagsFilterExpression as yt, evalLog as z };