@ls-stack/agent-eval 0.52.3 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-DT6cje9E.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DbVFgRO3.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-DlvYXPxG.mjs";
1
+ import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, Rt as getEvalRegistry, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-C78U4Ir0.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BEtk5skO.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-BM6LW4ou.mjs";
4
4
  export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-DT6cje9E.mjs";
1
+ import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as evalChartsConfigSchema, It as columnDefSchema, Mt as evalStatAggregateSchema, Nt as evalStatsConfigSchema, Pt as manualInputDescriptorSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C78U4Ir0.mjs";
2
2
  import { z } from "zod/v4";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -17,6 +17,7 @@ const evalMetaSchema = z.object({
17
17
  caseCount: z.number().nullable(),
18
18
  caseIds: z.array(z.string()).optional(),
19
19
  stats: evalStatsConfigSchema.optional(),
20
+ defaultStatAggregate: evalStatAggregateSchema.optional(),
20
21
  charts: evalChartsConfigSchema.optional(),
21
22
  manualInputDescriptor: manualInputDescriptorSchema.optional(),
22
23
  requiresManualInput: z.boolean().optional()
@@ -1,6 +1,6 @@
1
1
  import { createRequire, registerHooks } from "node:module";
2
- import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
3
2
  import { AsyncLocalStorage } from "node:async_hooks";
3
+ import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
4
4
  import { z, z as z$1 } from "zod/v4";
5
5
  import dayjs from "dayjs";
6
6
  import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
@@ -14,16 +14,31 @@ import { Result, resultify } from "t-result";
14
14
  import { fileURLToPath, pathToFileURL } from "node:url";
15
15
  //#region ../sdk/src/defineEval.ts
16
16
  const evalRegistry = /* @__PURE__ */ new Map();
17
+ const evalRegistryStorage = new AsyncLocalStorage();
17
18
  /** Return the in-memory registry of evals defined in the current process. */
18
19
  function getEvalRegistry() {
19
- return evalRegistry;
20
+ return evalRegistryStorage.getStore() ?? evalRegistry;
21
+ }
22
+ /**
23
+ * Execute a callback with an empty async-local eval registry.
24
+ *
25
+ * Runner internals use this when importing eval modules concurrently so
26
+ * `defineEval(...)` calls from one import cannot overwrite another import's
27
+ * registered definitions. The callback receives the scoped registry populated
28
+ * during its async execution.
29
+ */
30
+ async function runWithEvalRegistry(fn) {
31
+ const scopedRegistry = /* @__PURE__ */ new Map();
32
+ return await evalRegistryStorage.run(scopedRegistry, async () => {
33
+ return await fn(scopedRegistry);
34
+ });
20
35
  }
21
36
  /**
22
37
  * Register an eval definition with the SDK so the runner can discover it
23
38
  * after importing the eval module.
24
39
  */
25
40
  function defineEval(definition) {
26
- evalRegistry.set(definition.id, {
41
+ getEvalRegistry().set(definition.id, {
27
42
  id: definition.id,
28
43
  title: definition.title,
29
44
  use: (fn) => fn(definition)
@@ -45,13 +60,15 @@ const jsonCellSchema = z.lazy(() => z.union([
45
60
  const repoFileRefSchema = z.object({
46
61
  source: z.literal("repo"),
47
62
  path: z.string(),
48
- mimeType: z.string().optional()
63
+ mimeType: z.string().optional(),
64
+ sizeBytes: z.number().int().nonnegative().optional()
49
65
  });
50
66
  const runArtifactRefSchema = z.object({
51
67
  source: z.literal("run"),
52
68
  artifactId: z.string(),
53
69
  mimeType: z.string(),
54
- fileName: z.string().optional()
70
+ fileName: z.string().optional(),
71
+ sizeBytes: z.number().int().nonnegative().optional()
55
72
  });
56
73
  const fileRefSchema = z.union([repoFileRefSchema, runArtifactRefSchema]);
57
74
  /** Schema for numeric presentation options used by number-formatted values. */
@@ -82,6 +99,8 @@ const columnFormatSchema = z.enum([
82
99
  "markdown",
83
100
  "json",
84
101
  "image",
102
+ "html",
103
+ "pdf",
85
104
  "audio",
86
105
  "video",
87
106
  "file",
@@ -604,13 +623,18 @@ const evalFreshnessStatusSchema = z.enum([
604
623
  "stale",
605
624
  "outdated"
606
625
  ]);
607
- /** Reducer used to collapse a column's per-case values into a single stat. */
626
+ /**
627
+ * Reducer used to collapse per-case values into a single duration or column
628
+ * stat.
629
+ * `best` selects the highest finite value and `worst` selects the lowest.
630
+ */
608
631
  const evalStatAggregateSchema = z.enum([
609
632
  "avg",
610
633
  "min",
611
634
  "max",
612
635
  "sum",
613
- "last"
636
+ "best",
637
+ "worst"
614
638
  ]);
615
639
  const hideIfNoValueShape = {
616
640
  /**
@@ -620,10 +644,12 @@ const hideIfNoValueShape = {
620
644
  */
621
645
  hideIfNoValue: z.boolean().optional() };
622
646
  /**
623
- * One entry in the EvalCard stats row. Built-in kinds use latest run totals;
624
- * `cacheHits` counts Agent Eval operation-level cache hits from spans and
625
- * `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
626
- * `column` aggregates a score or numeric output column across the latest run.
647
+ * One entry in the EvalCard stats row. Built-in kinds read from the latest run;
648
+ * `duration` aggregates per-case durations, `cacheHits` counts Agent Eval
649
+ * operation-level cache hits from spans and `evalTracer.cache(...)` refs, not
650
+ * LLM provider prompt-cache read tokens. Cache hits use an independent
651
+ * aggregate mode and default to `sum`. `column` aggregates a score or numeric
652
+ * output column across the latest run.
627
653
  */
628
654
  const evalStatItemSchema = z.discriminatedUnion("kind", [
629
655
  z.object({
@@ -637,10 +663,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
637
663
  }),
638
664
  z.object({
639
665
  kind: z.literal("duration"),
666
+ aggregate: evalStatAggregateSchema.optional(),
640
667
  ...hideIfNoValueShape
641
668
  }),
642
669
  z.object({
643
670
  kind: z.literal("cacheHits"),
671
+ aggregate: evalStatAggregateSchema.optional(),
644
672
  ...hideIfNoValueShape
645
673
  }),
646
674
  z.object({
@@ -700,6 +728,11 @@ z.object({
700
728
  */
701
729
  stats: evalStatsConfigSchema.optional(),
702
730
  /**
731
+ * Initial aggregate mode used for duration and column stats on this eval
732
+ * card. Overrides workspace-level `defaultStatAggregate` when present.
733
+ */
734
+ defaultStatAggregate: evalStatAggregateSchema.optional(),
735
+ /**
703
736
  * Ordered per-eval history chart configuration for the EvalCard. Opt-in:
704
737
  * when omitted or empty, the UI renders no history chart at all.
705
738
  */
@@ -1334,6 +1367,7 @@ const agentEvalsConfigSchema = z.object({
1334
1367
  columns: evalColumnsSchema.optional(),
1335
1368
  deriveFromTracing: evalDeriveConfigSchema.optional(),
1336
1369
  stats: evalStatsConfigSchema.optional(),
1370
+ defaultStatAggregate: evalStatAggregateSchema.optional(),
1337
1371
  llmCalls: llmCallsConfigSchema.optional(),
1338
1372
  removeDefaultConfig: removeDefaultConfigSchema.optional(),
1339
1373
  apiCalls: apiCallsConfigSchema.optional(),
@@ -3372,17 +3406,20 @@ async function readManualInputFile(value, options = {}) {
3372
3406
  //#region ../sdk/src/repoFile.ts
3373
3407
  /**
3374
3408
  * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
3375
- * by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
3409
+ * by a column configured with `format: 'image' | 'html' | 'pdf' | 'audio' |
3410
+ * 'video' | 'file'`.
3376
3411
  *
3377
3412
  * @param path Relative or absolute path to the repository file.
3378
3413
  * @param mimeType Optional MIME type hint for UI rendering.
3414
+ * @param sizeBytes Optional file size hint shown by artifact cards in the UI.
3379
3415
  * @returns A repo-backed file reference suitable for file/media columns.
3380
3416
  */
3381
- function repoFile(path, mimeType) {
3417
+ function repoFile(path, mimeType, sizeBytes) {
3382
3418
  return {
3383
3419
  source: "repo",
3384
3420
  path,
3385
- mimeType
3421
+ mimeType,
3422
+ sizeBytes
3386
3423
  };
3387
3424
  }
3388
3425
  //#endregion
@@ -4917,12 +4954,13 @@ function createFsCacheStore(options) {
4917
4954
  },
4918
4955
  async lookup(namespace, keyHash) {
4919
4956
  const entry = await readCacheEntry(cacheDir, namespace, keyHash);
4920
- return entry === null ? null : await materializeExternalJsonCacheEntry(entry, externalJsonStore);
4957
+ return entry === null ? null : await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
4921
4958
  },
4922
4959
  async lookupWithDebug(namespace, keyHash) {
4923
4960
  const rawEntry = await readCacheEntry(cacheDir, namespace, keyHash);
4924
4961
  if (rawEntry === null) return null;
4925
- const entry = await materializeExternalJsonCacheEntry(rawEntry, externalJsonStore);
4962
+ const entry = await materializeExternalJsonCacheEntryOrNull(rawEntry, externalJsonStore);
4963
+ if (entry === null) return null;
4926
4964
  const debugKey = await readDebugEntry(debugDir, namespace, keyHash);
4927
4965
  const deserializedEntry = {
4928
4966
  ...entry,
@@ -5019,7 +5057,7 @@ function createBufferedCacheStore(backingStore) {
5019
5057
  externalJsonStore: backingStore.externalJsonStore,
5020
5058
  async lookup(namespace, keyHash) {
5021
5059
  const buffered = pendingEntries.get(toPendingKey(namespace, keyHash));
5022
- if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntry(buffered.entry, backingStore.externalJsonStore);
5060
+ if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntryOrNull(buffered.entry, backingStore.externalJsonStore);
5023
5061
  return backingStore.lookup(namespace, keyHash);
5024
5062
  },
5025
5063
  write(entry, debugKey) {
@@ -5266,6 +5304,10 @@ async function materializeExternalJsonCacheEntry(entry, store) {
5266
5304
  recording: cacheRecordingSchema.parse(await materializeExternalJsonValues(entry.recording, store))
5267
5305
  };
5268
5306
  }
5307
+ async function materializeExternalJsonCacheEntryOrNull(entry, store) {
5308
+ const result = await resultify(() => materializeExternalJsonCacheEntry(entry, store));
5309
+ return result.error ? null : result.value;
5310
+ }
5269
5311
  async function pruneExternalJsonBlobs(cacheDir, blobDir) {
5270
5312
  if (!existsSync(blobDir)) return;
5271
5313
  const referenced = await collectReferencedExternalJsonBlobPaths(cacheDir);
@@ -5857,6 +5899,7 @@ function resolveEvalDefaultConfig(params) {
5857
5899
  globalRemove: params.globalRemove,
5858
5900
  evalRemove
5859
5901
  }),
5902
+ defaultStatAggregate: params.evalDef.defaultStatAggregate ?? params.globalDefaultStatAggregate,
5860
5903
  charts: appendDefaultCharts({
5861
5904
  charts: params.evalDef.charts,
5862
5905
  globalRemove: params.globalRemove,
@@ -6065,6 +6108,114 @@ async function loadEvalModule(filePath, sourceFingerprint = void 0) {
6065
6108
  await import(moduleUrl.href);
6066
6109
  }
6067
6110
  //#endregion
6111
+ //#region ../runner/src/moduleIsolation.ts
6112
+ const isolationParam = "agent-evals-isolate";
6113
+ const pathSegmentSeparatorPattern = /[\\/]+/;
6114
+ const isolationStorage = new AsyncLocalStorage();
6115
+ const activeIsolationRoots = /* @__PURE__ */ new Map();
6116
+ const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
6117
+ let hooksRegistered = false;
6118
+ const requireFromRunner = createRequire(import.meta.url);
6119
+ const agentPackageUrlBySpecifier = new Map([
6120
+ "@ls-stack/agent-eval",
6121
+ "@agent-evals/sdk",
6122
+ "@agent-evals/shared",
6123
+ "@agent-evals/runner",
6124
+ "@agent-evals/runner/run-child"
6125
+ ].flatMap((specifier) => {
6126
+ try {
6127
+ return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
6128
+ } catch {
6129
+ return [];
6130
+ }
6131
+ }));
6132
+ function isAgentEvalsPackageSpecifier(specifier) {
6133
+ return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
6134
+ }
6135
+ function getIsolationKeyFromParent(parentURL) {
6136
+ if (!parentURL?.startsWith("file:")) return null;
6137
+ const value = new URL(parentURL).searchParams.get(isolationParam);
6138
+ return activeIsolationRoots.has(value ?? "") ? value : null;
6139
+ }
6140
+ function isIsolatableFile(url, workspaceRoot) {
6141
+ if (url.protocol !== "file:") return false;
6142
+ return isIsolatableFilePath(fileURLToPath(url), workspaceRoot);
6143
+ }
6144
+ function isIsolatableFilePath(filePath, workspaceRoot) {
6145
+ const relativePath = relative(workspaceRoot, filePath);
6146
+ if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
6147
+ return !relativePath.split(pathSegmentSeparatorPattern).includes(".agent-evals");
6148
+ }
6149
+ function addIsolationParam(url, key) {
6150
+ const moduleUrl = new URL(url);
6151
+ if (moduleUrl.searchParams.get(isolationParam) === key) return url;
6152
+ moduleUrl.searchParams.set(isolationParam, key);
6153
+ return moduleUrl.href;
6154
+ }
6155
+ function registerModuleIsolationHooks() {
6156
+ if (hooksRegistered) return;
6157
+ hooksRegistered = true;
6158
+ registerHooks({ resolve(specifier, context, nextResolve) {
6159
+ const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
6160
+ if (agentPackageUrl !== void 0) return {
6161
+ url: agentPackageUrl,
6162
+ shortCircuit: true
6163
+ };
6164
+ const resolved = nextResolve(specifier, context);
6165
+ if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
6166
+ const activeContext = isolationStorage.getStore();
6167
+ const inferredKey = getIsolationKeyFromParent(context.parentURL);
6168
+ const isolationKey = activeContext?.key ?? inferredKey;
6169
+ if (isolationKey === null) return resolved;
6170
+ const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
6171
+ if (workspaceRoot === void 0) return resolved;
6172
+ if (!isIsolatableFile(new URL(resolved.url), workspaceRoot)) return resolved;
6173
+ return {
6174
+ ...resolved,
6175
+ url: addIsolationParam(resolved.url, isolationKey)
6176
+ };
6177
+ } });
6178
+ }
6179
+ function clearWorkspaceRequireCacheOnce(context) {
6180
+ if (clearedRequireCacheKeys.has(context.key)) return;
6181
+ clearedRequireCacheKeys.add(context.key);
6182
+ for (const filePath of Object.keys(requireFromRunner.cache)) if (isIsolatableFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
6183
+ }
6184
+ /**
6185
+ * Execute module loading and eval code with fresh module URLs.
6186
+ *
6187
+ * Node does not expose an ESM cache reset API, so the runner appends a
6188
+ * scoped query parameter to workspace and dependency file imports. CommonJS
6189
+ * modules use `require.cache` behind ESM imports, so isolatable entries are
6190
+ * cleared once per scope. Agent Evals package imports are left alone so SDK
6191
+ * singletons, such as the eval registry, remain shared.
6192
+ */
6193
+ async function runWithModuleIsolation(context, fn) {
6194
+ registerModuleIsolationHooks();
6195
+ clearWorkspaceRequireCacheOnce(context);
6196
+ activeIsolationRoots.set(context.key, context.workspaceRoot);
6197
+ return await isolationStorage.run(context, fn);
6198
+ }
6199
+ //#endregion
6200
+ //#region ../runner/src/evalRegistryLoader.ts
6201
+ async function loadIsolatedEvalRegistry(params) {
6202
+ return await runWithEvalRegistry(async (registry) => {
6203
+ await runWithModuleIsolation(params.moduleIsolation, async () => {
6204
+ await runInEvalRuntimeScope(params.runtimeScope, async () => {
6205
+ await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
6206
+ });
6207
+ });
6208
+ return registry;
6209
+ });
6210
+ }
6211
+ async function useIsolatedEvalDefinition(params) {
6212
+ const entry = (await loadIsolatedEvalRegistry(params)).get(params.evalId);
6213
+ if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`);
6214
+ return await entry.use(async (evalDef) => {
6215
+ return await params.use(evalDef);
6216
+ });
6217
+ }
6218
+ //#endregion
6068
6219
  //#region ../runner/src/freshness.ts
6069
6220
  /**
6070
6221
  * Derive eval freshness from the latest run, current eval-file fingerprint,
@@ -6509,12 +6660,15 @@ async function persistInlineArtifact({ artifactDir, runId, caseId, outputKey, tr
6509
6660
  sanitizeSegment(outputKey),
6510
6661
  sanitizeFileName(fileName)
6511
6662
  ].join("__");
6512
- await writeFile(join(artifactDir, artifactId), new Uint8Array(await value.arrayBuffer()));
6663
+ const targetPath = join(artifactDir, artifactId);
6664
+ const bytes = new Uint8Array(await value.arrayBuffer());
6665
+ await writeFile(targetPath, bytes);
6513
6666
  return {
6514
6667
  source: "run",
6515
6668
  artifactId,
6516
6669
  mimeType,
6517
- fileName
6670
+ fileName,
6671
+ sizeBytes: bytes.byteLength
6518
6672
  };
6519
6673
  }
6520
6674
  /** Resolve a persisted run artifact path from its artifact id. */
@@ -6862,96 +7016,6 @@ function stripTerminalControlCodes(value) {
6862
7016
  return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
6863
7017
  }
6864
7018
  //#endregion
6865
- //#region ../runner/src/moduleIsolation.ts
6866
- const isolationParam = "agent-evals-isolate";
6867
- const pathSegmentSeparatorPattern = /[\\/]+/;
6868
- const isolationStorage = new AsyncLocalStorage();
6869
- const activeIsolationRoots = /* @__PURE__ */ new Map();
6870
- const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
6871
- let hooksRegistered = false;
6872
- const requireFromRunner = createRequire(import.meta.url);
6873
- const agentPackageUrlBySpecifier = new Map([
6874
- "@ls-stack/agent-eval",
6875
- "@agent-evals/sdk",
6876
- "@agent-evals/shared",
6877
- "@agent-evals/runner",
6878
- "@agent-evals/runner/run-child"
6879
- ].flatMap((specifier) => {
6880
- try {
6881
- return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
6882
- } catch {
6883
- return [];
6884
- }
6885
- }));
6886
- function isAgentEvalsPackageSpecifier(specifier) {
6887
- return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
6888
- }
6889
- function getIsolationKeyFromParent(parentURL) {
6890
- if (!parentURL?.startsWith("file:")) return null;
6891
- const value = new URL(parentURL).searchParams.get(isolationParam);
6892
- return activeIsolationRoots.has(value ?? "") ? value : null;
6893
- }
6894
- function isWorkspaceFile(url, workspaceRoot) {
6895
- if (url.protocol !== "file:") return false;
6896
- return isWorkspaceFilePath(fileURLToPath(url), workspaceRoot);
6897
- }
6898
- function isWorkspaceFilePath(filePath, workspaceRoot) {
6899
- const relativePath = relative(workspaceRoot, filePath);
6900
- if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
6901
- const segments = relativePath.split(pathSegmentSeparatorPattern);
6902
- return !segments.includes("node_modules") && !segments.includes(".agent-evals");
6903
- }
6904
- function addIsolationParam(url, key) {
6905
- const moduleUrl = new URL(url);
6906
- if (moduleUrl.searchParams.get(isolationParam) === key) return url;
6907
- moduleUrl.searchParams.set(isolationParam, key);
6908
- return moduleUrl.href;
6909
- }
6910
- function registerModuleIsolationHooks() {
6911
- if (hooksRegistered) return;
6912
- hooksRegistered = true;
6913
- registerHooks({ resolve(specifier, context, nextResolve) {
6914
- const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
6915
- if (agentPackageUrl !== void 0) return {
6916
- url: agentPackageUrl,
6917
- shortCircuit: true
6918
- };
6919
- const resolved = nextResolve(specifier, context);
6920
- if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
6921
- const activeContext = isolationStorage.getStore();
6922
- const inferredKey = getIsolationKeyFromParent(context.parentURL);
6923
- const isolationKey = activeContext?.key ?? inferredKey;
6924
- if (isolationKey === null) return resolved;
6925
- const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
6926
- if (workspaceRoot === void 0) return resolved;
6927
- if (!isWorkspaceFile(new URL(resolved.url), workspaceRoot)) return resolved;
6928
- return {
6929
- ...resolved,
6930
- url: addIsolationParam(resolved.url, isolationKey)
6931
- };
6932
- } });
6933
- }
6934
- function clearWorkspaceRequireCacheOnce(context) {
6935
- if (clearedRequireCacheKeys.has(context.key)) return;
6936
- clearedRequireCacheKeys.add(context.key);
6937
- for (const filePath of Object.keys(requireFromRunner.cache)) if (isWorkspaceFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
6938
- }
6939
- /**
6940
- * Execute module loading and eval code with fresh workspace module URLs.
6941
- *
6942
- * Node does not expose an ESM cache reset API, so the runner appends a
6943
- * run-scoped query parameter to workspace file imports. CommonJS modules use
6944
- * `require.cache` behind ESM imports, so workspace entries are cleared once per
6945
- * run. Package imports are left alone so SDK singletons, such as the eval
6946
- * registry, remain shared.
6947
- */
6948
- async function runWithModuleIsolation(context, fn) {
6949
- registerModuleIsolationHooks();
6950
- clearWorkspaceRequireCacheOnce(context);
6951
- activeIsolationRoots.set(context.key, context.workspaceRoot);
6952
- return await isolationStorage.run(context, fn);
6953
- }
6954
- //#endregion
6955
7019
  //#region ../runner/src/runExecution.ts
6956
7020
  function filterEvalCases(cases, caseIds) {
6957
7021
  if (!caseIds || caseIds.length === 0) return cases;
@@ -7422,6 +7486,30 @@ function getTargetEvalKeys(params) {
7422
7486
  }
7423
7487
  //#endregion
7424
7488
  //#region ../runner/src/runOrchestration.ts
7489
+ function toOptionalSourceFingerprint(sourceFingerprint) {
7490
+ return sourceFingerprint.length > 0 ? sourceFingerprint : void 0;
7491
+ }
7492
+ function buildCaseModuleIsolation(params) {
7493
+ return {
7494
+ key: [
7495
+ params.runId,
7496
+ params.evalKey,
7497
+ params.caseId,
7498
+ `trial-${String(params.trial)}`
7499
+ ].join(":"),
7500
+ workspaceRoot: params.workspaceRoot
7501
+ };
7502
+ }
7503
+ function buildEvalPreparationModuleIsolation(params) {
7504
+ return {
7505
+ key: [
7506
+ params.runId,
7507
+ params.evalKey,
7508
+ "prepare"
7509
+ ].join(":"),
7510
+ workspaceRoot: params.workspaceRoot
7511
+ };
7512
+ }
7425
7513
  /**
7426
7514
  * Ranks case statuses from worst to best. Used to order trial attempts so the
7427
7515
  * pessimistic (`lowestScore`) strategy can pick the worst attempt. Any
@@ -7559,14 +7647,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7559
7647
  const preparedEvals = [];
7560
7648
  const cacheMode = runState.manifest.cacheMode ?? "use";
7561
7649
  const cacheEnabled = config.cache?.enabled !== false;
7562
- const moduleIsolation = {
7563
- key: runState.manifest.id,
7564
- workspaceRoot
7565
- };
7566
7650
  const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
7567
7651
  const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
7568
7652
  for (const evalMeta of targetEvals) {
7569
7653
  const evalFilePath = evalMeta.sourceFilePath;
7654
+ const evalModuleIsolation = buildEvalPreparationModuleIsolation({
7655
+ runId: runState.manifest.id,
7656
+ evalKey: evalMeta.key,
7657
+ workspaceRoot
7658
+ });
7570
7659
  let sourceFingerprint = "";
7571
7660
  try {
7572
7661
  sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
@@ -7581,13 +7670,12 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7581
7670
  evalMeta.sourceFingerprint = null;
7582
7671
  }
7583
7672
  try {
7584
- const registry = getEvalRegistry();
7585
- await runWithModuleIsolation(moduleIsolation, async () => {
7586
- await runInEvalRuntimeScope("env", async () => {
7587
- await loadEvalModule(evalFilePath, sourceFingerprint);
7588
- });
7589
- });
7590
- const entry = registry.get(evalMeta.id);
7673
+ const entry = (await loadIsolatedEvalRegistry({
7674
+ evalFilePath,
7675
+ sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
7676
+ moduleIsolation: evalModuleIsolation,
7677
+ runtimeScope: "env"
7678
+ })).get(evalMeta.id);
7591
7679
  if (!entry) {
7592
7680
  evalErrors.push({
7593
7681
  evalId: evalMeta.id,
@@ -7595,7 +7683,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7595
7683
  });
7596
7684
  continue;
7597
7685
  }
7598
- await runWithModuleIsolation(moduleIsolation, async () => {
7686
+ await runWithModuleIsolation(evalModuleIsolation, async () => {
7599
7687
  await runInEvalRuntimeScope("cases", async () => {
7600
7688
  await entry.use(async (evalDef) => {
7601
7689
  const evalTagsResult = resolveEvalTags({
@@ -7658,6 +7746,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7658
7746
  evalDef,
7659
7747
  globalColumns: config.columns,
7660
7748
  globalStats: config.stats,
7749
+ globalDefaultStatAggregate: config.defaultStatAggregate,
7661
7750
  globalRemove: config.removeDefaultConfig
7662
7751
  });
7663
7752
  const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
@@ -7669,6 +7758,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7669
7758
  for (const warning of validatedCharts.warnings) console.warn(warning);
7670
7759
  evalMeta.columnDefs = declaredColumnDefs;
7671
7760
  evalMeta.stats = defaultConfig.stats;
7761
+ evalMeta.defaultStatAggregate = defaultConfig.defaultStatAggregate;
7672
7762
  evalMeta.charts = validatedCharts.charts;
7673
7763
  const evalCaseRows = [];
7674
7764
  const preparedCases = [];
@@ -7691,29 +7781,43 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7691
7781
  preparedCases.push(preparedCase);
7692
7782
  for (let trial = 0; trial < request.trials; trial++) {
7693
7783
  const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
7784
+ const caseModuleIsolation = buildCaseModuleIsolation({
7785
+ runId: runState.manifest.id,
7786
+ evalKey: evalMeta.key,
7787
+ caseId: evalCase.id,
7788
+ trial,
7789
+ workspaceRoot
7790
+ });
7694
7791
  queuedCases.push({
7695
7792
  execute: async ({ startTime, globalTraceDisplay }) => {
7696
- const { caseDetail, caseRowUpdate } = await runCase({
7697
- evalDef,
7793
+ const { caseDetail, caseRowUpdate } = await useIsolatedEvalDefinition({
7698
7794
  evalId: evalMeta.id,
7699
- evalKey: evalMeta.key,
7700
- evalCase,
7701
- globalTraceDisplay,
7702
- globalColumns: config.columns,
7703
- globalDeriveFromTracing: config.deriveFromTracing,
7704
- llmCallsConfig,
7705
- apiCallsConfig,
7706
- globalRemoveDefaultConfig: config.removeDefaultConfig,
7707
- trial,
7708
- startTime,
7709
- cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
7710
- cacheMode,
7711
- moduleIsolation,
7712
7795
  evalFilePath,
7713
- evalFileRelativePath: evalMeta.filePath,
7714
- workspaceRoot,
7715
- artifactDir: join(runDir, "artifacts"),
7716
- runId: runState.manifest.id
7796
+ sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
7797
+ moduleIsolation: caseModuleIsolation,
7798
+ runtimeScope: "env",
7799
+ use: async (isolatedEvalDef) => await runCase({
7800
+ evalDef: isolatedEvalDef,
7801
+ evalId: evalMeta.id,
7802
+ evalKey: evalMeta.key,
7803
+ evalCase,
7804
+ globalTraceDisplay,
7805
+ globalColumns: config.columns,
7806
+ globalDeriveFromTracing: config.deriveFromTracing,
7807
+ llmCallsConfig,
7808
+ apiCallsConfig,
7809
+ globalRemoveDefaultConfig: config.removeDefaultConfig,
7810
+ trial,
7811
+ startTime,
7812
+ cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
7813
+ cacheMode,
7814
+ moduleIsolation: caseModuleIsolation,
7815
+ evalFilePath,
7816
+ evalFileRelativePath: evalMeta.filePath,
7817
+ workspaceRoot,
7818
+ artifactDir: join(runDir, "artifacts"),
7819
+ runId: runState.manifest.id
7820
+ })
7717
7821
  });
7718
7822
  return {
7719
7823
  caseDetail,
@@ -7864,4 +7968,4 @@ function toLastRunStatus(status) {
7864
7968
  return status === "pending" ? null : status;
7865
7969
  }
7866
7970
  //#endregion
7867
- export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, columnDefSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, defineEval as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, getEvalRegistry as Lt, z$1 as M, evalStatsConfigSchema as Mt, buildTraceTree as N, manualInputDescriptorSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, evalChartsConfigSchema as Pt, evalTime as Q, hashCacheKeySync as R, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
7971
+ export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, evalStatAggregateSchema as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadIsolatedEvalRegistry as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-DbVFgRO3.mjs";
2
- import "./src-DlvYXPxG.mjs";
1
+ import { n as createRunner } from "./cli-BEtk5skO.mjs";
2
+ import "./src-BM6LW4ou.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-WRQdfG0r.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-5y6nEBZM.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-DT6cje9E.mjs";
2
- import "./cli-DbVFgRO3.mjs";
1
+ import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-C78U4Ir0.mjs";
2
+ import "./cli-BEtk5skO.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {