@ls-stack/agent-eval 0.53.0 → 0.55.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-Cn6fGL2s.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-3FrKBc9l.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-BNmtaqeC.mjs";
1
+ import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, Rt as getEvalRegistry, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-BWyE5lRX.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-rvPrUj6S.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-DEENkbkn.mjs";
4
4
  export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-Cn6fGL2s.mjs";
1
+ import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as evalChartsConfigSchema, It as columnDefSchema, Mt as evalStatAggregateSchema, Nt as evalStatsConfigSchema, Pt as manualInputDescriptorSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BWyE5lRX.mjs";
2
2
  import { z } from "zod/v4";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -17,6 +17,7 @@ const evalMetaSchema = z.object({
17
17
  caseCount: z.number().nullable(),
18
18
  caseIds: z.array(z.string()).optional(),
19
19
  stats: evalStatsConfigSchema.optional(),
20
+ defaultStatAggregate: evalStatAggregateSchema.optional(),
20
21
  charts: evalChartsConfigSchema.optional(),
21
22
  manualInputDescriptor: manualInputDescriptorSchema.optional(),
22
23
  requiresManualInput: z.boolean().optional()
@@ -1,6 +1,6 @@
1
1
  import { createRequire, registerHooks } from "node:module";
2
- import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
3
2
  import { AsyncLocalStorage } from "node:async_hooks";
3
+ import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
4
4
  import { z, z as z$1 } from "zod/v4";
5
5
  import dayjs from "dayjs";
6
6
  import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
@@ -14,16 +14,31 @@ import { Result, resultify } from "t-result";
14
14
  import { fileURLToPath, pathToFileURL } from "node:url";
15
15
  //#region ../sdk/src/defineEval.ts
16
16
  const evalRegistry = /* @__PURE__ */ new Map();
17
+ const evalRegistryStorage = new AsyncLocalStorage();
17
18
  /** Return the in-memory registry of evals defined in the current process. */
18
19
  function getEvalRegistry() {
19
- return evalRegistry;
20
+ return evalRegistryStorage.getStore() ?? evalRegistry;
21
+ }
22
+ /**
23
+ * Execute a callback with an empty async-local eval registry.
24
+ *
25
+ * Runner internals use this when importing eval modules concurrently so
26
+ * `defineEval(...)` calls from one import cannot overwrite another import's
27
+ * registered definitions. The callback receives the scoped registry populated
28
+ * during its async execution.
29
+ */
30
+ async function runWithEvalRegistry(fn) {
31
+ const scopedRegistry = /* @__PURE__ */ new Map();
32
+ return await evalRegistryStorage.run(scopedRegistry, async () => {
33
+ return await fn(scopedRegistry);
34
+ });
20
35
  }
21
36
  /**
22
37
  * Register an eval definition with the SDK so the runner can discover it
23
38
  * after importing the eval module.
24
39
  */
25
40
  function defineEval(definition) {
26
- evalRegistry.set(definition.id, {
41
+ getEvalRegistry().set(definition.id, {
27
42
  id: definition.id,
28
43
  title: definition.title,
29
44
  use: (fn) => fn(definition)
@@ -608,13 +623,18 @@ const evalFreshnessStatusSchema = z.enum([
608
623
  "stale",
609
624
  "outdated"
610
625
  ]);
611
- /** Reducer used to collapse a column's per-case values into a single stat. */
626
+ /**
627
+ * Reducer used to collapse per-case values into a single duration or column
628
+ * stat.
629
+ * `best` selects the highest finite value and `worst` selects the lowest.
630
+ */
612
631
  const evalStatAggregateSchema = z.enum([
613
632
  "avg",
614
633
  "min",
615
634
  "max",
616
635
  "sum",
617
- "last"
636
+ "best",
637
+ "worst"
618
638
  ]);
619
639
  const hideIfNoValueShape = {
620
640
  /**
@@ -624,10 +644,12 @@ const hideIfNoValueShape = {
624
644
  */
625
645
  hideIfNoValue: z.boolean().optional() };
626
646
  /**
627
- * One entry in the EvalCard stats row. Built-in kinds use latest run totals;
628
- * `cacheHits` counts Agent Eval operation-level cache hits from spans and
629
- * `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
630
- * `column` aggregates a score or numeric output column across the latest run.
647
+ * One entry in the EvalCard stats row. Built-in kinds read from the latest run;
648
+ * `duration` aggregates per-case durations, `cacheHits` counts Agent Eval
649
+ * operation-level cache hits from spans and `evalTracer.cache(...)` refs, not
650
+ * LLM provider prompt-cache read tokens. Cache hits use an independent
651
+ * aggregate mode and default to `sum`. `column` aggregates a score or numeric
652
+ * output column across the latest run.
631
653
  */
632
654
  const evalStatItemSchema = z.discriminatedUnion("kind", [
633
655
  z.object({
@@ -641,10 +663,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
641
663
  }),
642
664
  z.object({
643
665
  kind: z.literal("duration"),
666
+ aggregate: evalStatAggregateSchema.optional(),
644
667
  ...hideIfNoValueShape
645
668
  }),
646
669
  z.object({
647
670
  kind: z.literal("cacheHits"),
671
+ aggregate: evalStatAggregateSchema.optional(),
648
672
  ...hideIfNoValueShape
649
673
  }),
650
674
  z.object({
@@ -704,6 +728,11 @@ z.object({
704
728
  */
705
729
  stats: evalStatsConfigSchema.optional(),
706
730
  /**
731
+ * Initial aggregate mode used for duration and column stats on this eval
732
+ * card. Overrides workspace-level `defaultStatAggregate` when present.
733
+ */
734
+ defaultStatAggregate: evalStatAggregateSchema.optional(),
735
+ /**
707
736
  * Ordered per-eval history chart configuration for the EvalCard. Opt-in:
708
737
  * when omitted or empty, the UI renders no history chart at all.
709
738
  */
@@ -1338,6 +1367,7 @@ const agentEvalsConfigSchema = z.object({
1338
1367
  columns: evalColumnsSchema.optional(),
1339
1368
  deriveFromTracing: evalDeriveConfigSchema.optional(),
1340
1369
  stats: evalStatsConfigSchema.optional(),
1370
+ defaultStatAggregate: evalStatAggregateSchema.optional(),
1341
1371
  llmCalls: llmCallsConfigSchema.optional(),
1342
1372
  removeDefaultConfig: removeDefaultConfigSchema.optional(),
1343
1373
  apiCalls: apiCallsConfigSchema.optional(),
@@ -4924,12 +4954,13 @@ function createFsCacheStore(options) {
4924
4954
  },
4925
4955
  async lookup(namespace, keyHash) {
4926
4956
  const entry = await readCacheEntry(cacheDir, namespace, keyHash);
4927
- return entry === null ? null : await materializeExternalJsonCacheEntry(entry, externalJsonStore);
4957
+ return entry === null ? null : await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
4928
4958
  },
4929
4959
  async lookupWithDebug(namespace, keyHash) {
4930
4960
  const rawEntry = await readCacheEntry(cacheDir, namespace, keyHash);
4931
4961
  if (rawEntry === null) return null;
4932
- const entry = await materializeExternalJsonCacheEntry(rawEntry, externalJsonStore);
4962
+ const entry = await materializeExternalJsonCacheEntryOrNull(rawEntry, externalJsonStore);
4963
+ if (entry === null) return null;
4933
4964
  const debugKey = await readDebugEntry(debugDir, namespace, keyHash);
4934
4965
  const deserializedEntry = {
4935
4966
  ...entry,
@@ -5026,7 +5057,7 @@ function createBufferedCacheStore(backingStore) {
5026
5057
  externalJsonStore: backingStore.externalJsonStore,
5027
5058
  async lookup(namespace, keyHash) {
5028
5059
  const buffered = pendingEntries.get(toPendingKey(namespace, keyHash));
5029
- if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntry(buffered.entry, backingStore.externalJsonStore);
5060
+ if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntryOrNull(buffered.entry, backingStore.externalJsonStore);
5030
5061
  return backingStore.lookup(namespace, keyHash);
5031
5062
  },
5032
5063
  write(entry, debugKey) {
@@ -5273,6 +5304,10 @@ async function materializeExternalJsonCacheEntry(entry, store) {
5273
5304
  recording: cacheRecordingSchema.parse(await materializeExternalJsonValues(entry.recording, store))
5274
5305
  };
5275
5306
  }
5307
+ async function materializeExternalJsonCacheEntryOrNull(entry, store) {
5308
+ const result = await resultify(() => materializeExternalJsonCacheEntry(entry, store));
5309
+ return result.error ? null : result.value;
5310
+ }
5276
5311
  async function pruneExternalJsonBlobs(cacheDir, blobDir) {
5277
5312
  if (!existsSync(blobDir)) return;
5278
5313
  const referenced = await collectReferencedExternalJsonBlobPaths(cacheDir);
@@ -5864,6 +5899,7 @@ function resolveEvalDefaultConfig(params) {
5864
5899
  globalRemove: params.globalRemove,
5865
5900
  evalRemove
5866
5901
  }),
5902
+ defaultStatAggregate: params.evalDef.defaultStatAggregate ?? params.globalDefaultStatAggregate,
5867
5903
  charts: appendDefaultCharts({
5868
5904
  charts: params.evalDef.charts,
5869
5905
  globalRemove: params.globalRemove,
@@ -6072,6 +6108,122 @@ async function loadEvalModule(filePath, sourceFingerprint = void 0) {
6072
6108
  await import(moduleUrl.href);
6073
6109
  }
6074
6110
  //#endregion
6111
+ //#region ../runner/src/moduleIsolation.ts
6112
+ const isolationParam = "agent-evals-isolate";
6113
+ const pathSegmentSeparatorPattern = /[\\/]+/;
6114
+ const isolationStorage = new AsyncLocalStorage();
6115
+ const activeIsolationRoots = /* @__PURE__ */ new Map();
6116
+ const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
6117
+ let hooksRegistered = false;
6118
+ const requireFromRunner = createRequire(import.meta.url);
6119
+ const agentPackageUrlBySpecifier = new Map([
6120
+ "@ls-stack/agent-eval",
6121
+ "@agent-evals/sdk",
6122
+ "@agent-evals/shared",
6123
+ "@agent-evals/runner",
6124
+ "@agent-evals/runner/run-child"
6125
+ ].flatMap((specifier) => {
6126
+ try {
6127
+ return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
6128
+ } catch {
6129
+ return [];
6130
+ }
6131
+ }));
6132
+ const agentPackageDirectoryPaths = [...new Set([...agentPackageUrlBySpecifier.values()].map((packageUrl) => dirname(fileURLToPath(packageUrl))))];
6133
+ function isAgentEvalsPackageSpecifier(specifier) {
6134
+ return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
6135
+ }
6136
+ function getIsolationKeyFromParent(parentURL) {
6137
+ if (!parentURL?.startsWith("file:")) return null;
6138
+ const value = new URL(parentURL).searchParams.get(isolationParam);
6139
+ return activeIsolationRoots.has(value ?? "") ? value : null;
6140
+ }
6141
+ function isIsolatableFile(url, workspaceRoot) {
6142
+ if (url.protocol !== "file:") return false;
6143
+ return isIsolatableFilePath(fileURLToPath(url), workspaceRoot);
6144
+ }
6145
+ function isIsolatableFilePath(filePath, workspaceRoot) {
6146
+ if (isAgentEvalsPackageFilePath(filePath)) return false;
6147
+ const relativePath = relative(workspaceRoot, filePath);
6148
+ if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
6149
+ return !relativePath.split(pathSegmentSeparatorPattern).includes(".agent-evals");
6150
+ }
6151
+ function isAgentEvalsPackageFilePath(filePath) {
6152
+ return agentPackageDirectoryPaths.some((packageDirectoryPath) => {
6153
+ const packageRelativePath = relative(packageDirectoryPath, filePath);
6154
+ return packageRelativePath === "" || !packageRelativePath.startsWith("..") && !isAbsolute(packageRelativePath);
6155
+ });
6156
+ }
6157
+ function addIsolationParam(url, key) {
6158
+ const moduleUrl = new URL(url);
6159
+ if (moduleUrl.searchParams.get(isolationParam) === key) return url;
6160
+ moduleUrl.searchParams.set(isolationParam, key);
6161
+ return moduleUrl.href;
6162
+ }
6163
+ function registerModuleIsolationHooks() {
6164
+ if (hooksRegistered) return;
6165
+ hooksRegistered = true;
6166
+ registerHooks({ resolve(specifier, context, nextResolve) {
6167
+ const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
6168
+ if (agentPackageUrl !== void 0) return {
6169
+ url: agentPackageUrl,
6170
+ shortCircuit: true
6171
+ };
6172
+ const resolved = nextResolve(specifier, context);
6173
+ if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
6174
+ const activeContext = isolationStorage.getStore();
6175
+ const inferredKey = getIsolationKeyFromParent(context.parentURL);
6176
+ const isolationKey = activeContext?.key ?? inferredKey;
6177
+ if (isolationKey === null) return resolved;
6178
+ const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
6179
+ if (workspaceRoot === void 0) return resolved;
6180
+ if (!isIsolatableFile(new URL(resolved.url), workspaceRoot)) return resolved;
6181
+ return {
6182
+ ...resolved,
6183
+ url: addIsolationParam(resolved.url, isolationKey)
6184
+ };
6185
+ } });
6186
+ }
6187
+ function clearWorkspaceRequireCacheOnce(context) {
6188
+ if (clearedRequireCacheKeys.has(context.key)) return;
6189
+ clearedRequireCacheKeys.add(context.key);
6190
+ for (const filePath of Object.keys(requireFromRunner.cache)) if (isIsolatableFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
6191
+ }
6192
+ /**
6193
+ * Execute module loading and eval code with fresh module URLs.
6194
+ *
6195
+ * Node does not expose an ESM cache reset API, so the runner appends a
6196
+ * scoped query parameter to workspace and dependency file imports. CommonJS
6197
+ * modules use `require.cache` behind ESM imports, so isolatable entries are
6198
+ * cleared once per scope. Agent Evals package imports are left alone so SDK
6199
+ * singletons, such as the eval registry, remain shared.
6200
+ */
6201
+ async function runWithModuleIsolation(context, fn) {
6202
+ registerModuleIsolationHooks();
6203
+ clearWorkspaceRequireCacheOnce(context);
6204
+ activeIsolationRoots.set(context.key, context.workspaceRoot);
6205
+ return await isolationStorage.run(context, fn);
6206
+ }
6207
+ //#endregion
6208
+ //#region ../runner/src/evalRegistryLoader.ts
6209
+ async function loadIsolatedEvalRegistry(params) {
6210
+ return await runWithEvalRegistry(async (registry) => {
6211
+ await runWithModuleIsolation(params.moduleIsolation, async () => {
6212
+ await runInEvalRuntimeScope(params.runtimeScope, async () => {
6213
+ await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
6214
+ });
6215
+ });
6216
+ return registry;
6217
+ });
6218
+ }
6219
+ async function useIsolatedEvalDefinition(params) {
6220
+ const entry = (await loadIsolatedEvalRegistry(params)).get(params.evalId);
6221
+ if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`);
6222
+ return await entry.use(async (evalDef) => {
6223
+ return await params.use(evalDef);
6224
+ });
6225
+ }
6226
+ //#endregion
6075
6227
  //#region ../runner/src/freshness.ts
6076
6228
  /**
6077
6229
  * Derive eval freshness from the latest run, current eval-file fingerprint,
@@ -6872,96 +7024,6 @@ function stripTerminalControlCodes(value) {
6872
7024
  return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
6873
7025
  }
6874
7026
  //#endregion
6875
- //#region ../runner/src/moduleIsolation.ts
6876
- const isolationParam = "agent-evals-isolate";
6877
- const pathSegmentSeparatorPattern = /[\\/]+/;
6878
- const isolationStorage = new AsyncLocalStorage();
6879
- const activeIsolationRoots = /* @__PURE__ */ new Map();
6880
- const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
6881
- let hooksRegistered = false;
6882
- const requireFromRunner = createRequire(import.meta.url);
6883
- const agentPackageUrlBySpecifier = new Map([
6884
- "@ls-stack/agent-eval",
6885
- "@agent-evals/sdk",
6886
- "@agent-evals/shared",
6887
- "@agent-evals/runner",
6888
- "@agent-evals/runner/run-child"
6889
- ].flatMap((specifier) => {
6890
- try {
6891
- return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
6892
- } catch {
6893
- return [];
6894
- }
6895
- }));
6896
- function isAgentEvalsPackageSpecifier(specifier) {
6897
- return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
6898
- }
6899
- function getIsolationKeyFromParent(parentURL) {
6900
- if (!parentURL?.startsWith("file:")) return null;
6901
- const value = new URL(parentURL).searchParams.get(isolationParam);
6902
- return activeIsolationRoots.has(value ?? "") ? value : null;
6903
- }
6904
- function isWorkspaceFile(url, workspaceRoot) {
6905
- if (url.protocol !== "file:") return false;
6906
- return isWorkspaceFilePath(fileURLToPath(url), workspaceRoot);
6907
- }
6908
- function isWorkspaceFilePath(filePath, workspaceRoot) {
6909
- const relativePath = relative(workspaceRoot, filePath);
6910
- if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
6911
- const segments = relativePath.split(pathSegmentSeparatorPattern);
6912
- return !segments.includes("node_modules") && !segments.includes(".agent-evals");
6913
- }
6914
- function addIsolationParam(url, key) {
6915
- const moduleUrl = new URL(url);
6916
- if (moduleUrl.searchParams.get(isolationParam) === key) return url;
6917
- moduleUrl.searchParams.set(isolationParam, key);
6918
- return moduleUrl.href;
6919
- }
6920
- function registerModuleIsolationHooks() {
6921
- if (hooksRegistered) return;
6922
- hooksRegistered = true;
6923
- registerHooks({ resolve(specifier, context, nextResolve) {
6924
- const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
6925
- if (agentPackageUrl !== void 0) return {
6926
- url: agentPackageUrl,
6927
- shortCircuit: true
6928
- };
6929
- const resolved = nextResolve(specifier, context);
6930
- if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
6931
- const activeContext = isolationStorage.getStore();
6932
- const inferredKey = getIsolationKeyFromParent(context.parentURL);
6933
- const isolationKey = activeContext?.key ?? inferredKey;
6934
- if (isolationKey === null) return resolved;
6935
- const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
6936
- if (workspaceRoot === void 0) return resolved;
6937
- if (!isWorkspaceFile(new URL(resolved.url), workspaceRoot)) return resolved;
6938
- return {
6939
- ...resolved,
6940
- url: addIsolationParam(resolved.url, isolationKey)
6941
- };
6942
- } });
6943
- }
6944
- function clearWorkspaceRequireCacheOnce(context) {
6945
- if (clearedRequireCacheKeys.has(context.key)) return;
6946
- clearedRequireCacheKeys.add(context.key);
6947
- for (const filePath of Object.keys(requireFromRunner.cache)) if (isWorkspaceFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
6948
- }
6949
- /**
6950
- * Execute module loading and eval code with fresh workspace module URLs.
6951
- *
6952
- * Node does not expose an ESM cache reset API, so the runner appends a
6953
- * run-scoped query parameter to workspace file imports. CommonJS modules use
6954
- * `require.cache` behind ESM imports, so workspace entries are cleared once per
6955
- * run. Package imports are left alone so SDK singletons, such as the eval
6956
- * registry, remain shared.
6957
- */
6958
- async function runWithModuleIsolation(context, fn) {
6959
- registerModuleIsolationHooks();
6960
- clearWorkspaceRequireCacheOnce(context);
6961
- activeIsolationRoots.set(context.key, context.workspaceRoot);
6962
- return await isolationStorage.run(context, fn);
6963
- }
6964
- //#endregion
6965
7027
  //#region ../runner/src/runExecution.ts
6966
7028
  function filterEvalCases(cases, caseIds) {
6967
7029
  if (!caseIds || caseIds.length === 0) return cases;
@@ -7432,6 +7494,30 @@ function getTargetEvalKeys(params) {
7432
7494
  }
7433
7495
  //#endregion
7434
7496
  //#region ../runner/src/runOrchestration.ts
7497
+ function toOptionalSourceFingerprint(sourceFingerprint) {
7498
+ return sourceFingerprint.length > 0 ? sourceFingerprint : void 0;
7499
+ }
7500
+ function buildCaseModuleIsolation(params) {
7501
+ return {
7502
+ key: [
7503
+ params.runId,
7504
+ params.evalKey,
7505
+ params.caseId,
7506
+ `trial-${String(params.trial)}`
7507
+ ].join(":"),
7508
+ workspaceRoot: params.workspaceRoot
7509
+ };
7510
+ }
7511
+ function buildEvalPreparationModuleIsolation(params) {
7512
+ return {
7513
+ key: [
7514
+ params.runId,
7515
+ params.evalKey,
7516
+ "prepare"
7517
+ ].join(":"),
7518
+ workspaceRoot: params.workspaceRoot
7519
+ };
7520
+ }
7435
7521
  /**
7436
7522
  * Ranks case statuses from worst to best. Used to order trial attempts so the
7437
7523
  * pessimistic (`lowestScore`) strategy can pick the worst attempt. Any
@@ -7569,14 +7655,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7569
7655
  const preparedEvals = [];
7570
7656
  const cacheMode = runState.manifest.cacheMode ?? "use";
7571
7657
  const cacheEnabled = config.cache?.enabled !== false;
7572
- const moduleIsolation = {
7573
- key: runState.manifest.id,
7574
- workspaceRoot
7575
- };
7576
7658
  const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
7577
7659
  const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
7578
7660
  for (const evalMeta of targetEvals) {
7579
7661
  const evalFilePath = evalMeta.sourceFilePath;
7662
+ const evalModuleIsolation = buildEvalPreparationModuleIsolation({
7663
+ runId: runState.manifest.id,
7664
+ evalKey: evalMeta.key,
7665
+ workspaceRoot
7666
+ });
7580
7667
  let sourceFingerprint = "";
7581
7668
  try {
7582
7669
  sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
@@ -7591,13 +7678,12 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7591
7678
  evalMeta.sourceFingerprint = null;
7592
7679
  }
7593
7680
  try {
7594
- const registry = getEvalRegistry();
7595
- await runWithModuleIsolation(moduleIsolation, async () => {
7596
- await runInEvalRuntimeScope("env", async () => {
7597
- await loadEvalModule(evalFilePath, sourceFingerprint);
7598
- });
7599
- });
7600
- const entry = registry.get(evalMeta.id);
7681
+ const entry = (await loadIsolatedEvalRegistry({
7682
+ evalFilePath,
7683
+ sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
7684
+ moduleIsolation: evalModuleIsolation,
7685
+ runtimeScope: "env"
7686
+ })).get(evalMeta.id);
7601
7687
  if (!entry) {
7602
7688
  evalErrors.push({
7603
7689
  evalId: evalMeta.id,
@@ -7605,7 +7691,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7605
7691
  });
7606
7692
  continue;
7607
7693
  }
7608
- await runWithModuleIsolation(moduleIsolation, async () => {
7694
+ await runWithModuleIsolation(evalModuleIsolation, async () => {
7609
7695
  await runInEvalRuntimeScope("cases", async () => {
7610
7696
  await entry.use(async (evalDef) => {
7611
7697
  const evalTagsResult = resolveEvalTags({
@@ -7668,6 +7754,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7668
7754
  evalDef,
7669
7755
  globalColumns: config.columns,
7670
7756
  globalStats: config.stats,
7757
+ globalDefaultStatAggregate: config.defaultStatAggregate,
7671
7758
  globalRemove: config.removeDefaultConfig
7672
7759
  });
7673
7760
  const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
@@ -7679,6 +7766,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7679
7766
  for (const warning of validatedCharts.warnings) console.warn(warning);
7680
7767
  evalMeta.columnDefs = declaredColumnDefs;
7681
7768
  evalMeta.stats = defaultConfig.stats;
7769
+ evalMeta.defaultStatAggregate = defaultConfig.defaultStatAggregate;
7682
7770
  evalMeta.charts = validatedCharts.charts;
7683
7771
  const evalCaseRows = [];
7684
7772
  const preparedCases = [];
@@ -7701,29 +7789,43 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7701
7789
  preparedCases.push(preparedCase);
7702
7790
  for (let trial = 0; trial < request.trials; trial++) {
7703
7791
  const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
7792
+ const caseModuleIsolation = buildCaseModuleIsolation({
7793
+ runId: runState.manifest.id,
7794
+ evalKey: evalMeta.key,
7795
+ caseId: evalCase.id,
7796
+ trial,
7797
+ workspaceRoot
7798
+ });
7704
7799
  queuedCases.push({
7705
7800
  execute: async ({ startTime, globalTraceDisplay }) => {
7706
- const { caseDetail, caseRowUpdate } = await runCase({
7707
- evalDef,
7801
+ const { caseDetail, caseRowUpdate } = await useIsolatedEvalDefinition({
7708
7802
  evalId: evalMeta.id,
7709
- evalKey: evalMeta.key,
7710
- evalCase,
7711
- globalTraceDisplay,
7712
- globalColumns: config.columns,
7713
- globalDeriveFromTracing: config.deriveFromTracing,
7714
- llmCallsConfig,
7715
- apiCallsConfig,
7716
- globalRemoveDefaultConfig: config.removeDefaultConfig,
7717
- trial,
7718
- startTime,
7719
- cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
7720
- cacheMode,
7721
- moduleIsolation,
7722
7803
  evalFilePath,
7723
- evalFileRelativePath: evalMeta.filePath,
7724
- workspaceRoot,
7725
- artifactDir: join(runDir, "artifacts"),
7726
- runId: runState.manifest.id
7804
+ sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
7805
+ moduleIsolation: caseModuleIsolation,
7806
+ runtimeScope: "env",
7807
+ use: async (isolatedEvalDef) => await runCase({
7808
+ evalDef: isolatedEvalDef,
7809
+ evalId: evalMeta.id,
7810
+ evalKey: evalMeta.key,
7811
+ evalCase,
7812
+ globalTraceDisplay,
7813
+ globalColumns: config.columns,
7814
+ globalDeriveFromTracing: config.deriveFromTracing,
7815
+ llmCallsConfig,
7816
+ apiCallsConfig,
7817
+ globalRemoveDefaultConfig: config.removeDefaultConfig,
7818
+ trial,
7819
+ startTime,
7820
+ cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
7821
+ cacheMode,
7822
+ moduleIsolation: caseModuleIsolation,
7823
+ evalFilePath,
7824
+ evalFileRelativePath: evalMeta.filePath,
7825
+ workspaceRoot,
7826
+ artifactDir: join(runDir, "artifacts"),
7827
+ runId: runState.manifest.id
7828
+ })
7727
7829
  });
7728
7830
  return {
7729
7831
  caseDetail,
@@ -7874,4 +7976,4 @@ function toLastRunStatus(status) {
7874
7976
  return status === "pending" ? null : status;
7875
7977
  }
7876
7978
  //#endregion
7877
- export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, columnDefSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, defineEval as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, getEvalRegistry as Lt, z$1 as M, evalStatsConfigSchema as Mt, buildTraceTree as N, manualInputDescriptorSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, evalChartsConfigSchema as Pt, evalTime as Q, hashCacheKeySync as R, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, 
loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
7979
+ export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, evalStatAggregateSchema as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, 
incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadIsolatedEvalRegistry as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-C0qdoRSi.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-CFQ8LZmY.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-3FrKBc9l.mjs";
2
- import "./src-BNmtaqeC.mjs";
1
+ import { n as createRunner } from "./cli-rvPrUj6S.mjs";
2
+ import "./src-DEENkbkn.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-Cn6fGL2s.mjs";
2
- import "./cli-3FrKBc9l.mjs";
1
+ import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BWyE5lRX.mjs";
2
+ import "./cli-rvPrUj6S.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.53.0",
3
+ "version": "0.55.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/sdk": "0.0.1",
37
- "@agent-evals/shared": "0.0.1"
36
+ "@agent-evals/shared": "0.0.1",
37
+ "@agent-evals/sdk": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -30,7 +30,8 @@ display rules), read the TypeScript declarations shipped with the package:
30
30
  `agent-evals.config.ts` to opt into run-all CLI behavior.
31
31
  - `agent-evals run --temporary` persists a run like normal history, but deletes
32
32
  it before the next run starts. Temporary runs appear in `show-runs` while
33
- present; normal runs are never deleted by temporary-run cleanup.
33
+ present; normal runs are never deleted by temporary-run cleanup. In the app,
34
+ the run drawer can promote a temporary run to durable history.
34
35
  - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
35
36
  place when the runner is idle. If config changes during an active run, the
36
37
  reload applies after the current run reaches a terminal state.
@@ -415,10 +416,17 @@ definition. Global `stats` in `agent-evals.config.ts` combine with eval-level
415
416
  stats. Native stat kinds include `cases`, `passRate`, `duration`, and
416
417
  `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total
417
418
  cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not
418
- LLM provider prompt-cache read tokens such as `cachedInputTokens`. Usage stats
419
+ LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit
420
+ stats use a separate aggregate control and default to `sum`; `avg` is the average
421
+ per-case hit rate, and `min`/`max`/`best`/`worst` select cases by hit rate. `duration`
422
+ aggregates per-case durations using the same modes as column stats. Usage stats
419
423
  and LLM usage charts are added by default unless removed with
420
424
  `removeDefaultConfig`. Column stats can override `format` and `numberFormat`,
421
- otherwise they inherit from the matching column. Number formats use
425
+ otherwise they inherit from the matching column. Duration and column stat
426
+ aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value),
427
+ and `worst` (lowest finite value). Use `defaultStatAggregate` in
428
+ `agent-evals.config.ts` to set the workspace-wide initial duration/column stat
429
+ mode, or set it on an eval definition to override it for that eval. Number formats use
422
430
  `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing
423
431
  zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats
424
432
  and charts support `hideIfNoValue: true`. Charts support
@@ -566,6 +574,9 @@ For true module replacement inside an eval, register `mock.module(...)` from
566
574
  Node's `--experimental-test-module-mocks` flag automatically for CLI and app
567
575
  runs. Use dynamic
568
576
  `import(...)` inside `execute` — static imports happen too early.
577
+ Each case/trial reloads the eval module graph in its own isolation scope, so
578
+ module-level mock state in workspace files and ESM dependencies does not leak
579
+ between concurrent cases.
569
580
 
570
581
  ```ts
571
582
  import { mock } from 'node:test';