@ls-stack/agent-eval 0.53.0 → 0.55.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-D0a57pVo.mjs → app-CunZ8Dku.mjs} +12 -4
- package/dist/apps/web/dist/assets/index-2I-eWzVL.css +1 -0
- package/dist/apps/web/dist/assets/index-CvsPmlHl.js +377 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-3FrKBc9l.mjs → cli-rvPrUj6S.mjs} +51 -11
- package/dist/index.d.mts +305 -133
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -1
- package/dist/{runOrchestration-Cn6fGL2s.mjs → runOrchestration-BWyE5lRX.mjs} +236 -134
- package/dist/{runner-Dsqj431i.mjs → runner-C2fvjKZP.mjs} +1 -1
- package/dist/{runner-C0qdoRSi.mjs → runner-CFQ8LZmY.mjs} +2 -2
- package/dist/{src-BNmtaqeC.mjs → src-DEENkbkn.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +14 -3
- package/dist/apps/web/dist/assets/index-RNejIyap.js +0 -375
- package/dist/apps/web/dist/assets/index-vaLgWG8j.css +0 -1
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey,
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
1
|
+
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, Rt as getEvalRegistry, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-BWyE5lRX.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-rvPrUj6S.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-DEENkbkn.mjs";
|
|
4
4
|
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as
|
|
1
|
+
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as evalChartsConfigSchema, It as columnDefSchema, Mt as evalStatAggregateSchema, Nt as evalStatsConfigSchema, Pt as manualInputDescriptorSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BWyE5lRX.mjs";
|
|
2
2
|
import { z } from "zod/v4";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -17,6 +17,7 @@ const evalMetaSchema = z.object({
|
|
|
17
17
|
caseCount: z.number().nullable(),
|
|
18
18
|
caseIds: z.array(z.string()).optional(),
|
|
19
19
|
stats: evalStatsConfigSchema.optional(),
|
|
20
|
+
defaultStatAggregate: evalStatAggregateSchema.optional(),
|
|
20
21
|
charts: evalChartsConfigSchema.optional(),
|
|
21
22
|
manualInputDescriptor: manualInputDescriptorSchema.optional(),
|
|
22
23
|
requiresManualInput: z.boolean().optional()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { createRequire, registerHooks } from "node:module";
|
|
2
|
-
import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
|
|
3
2
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
3
|
+
import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
|
|
4
4
|
import { z, z as z$1 } from "zod/v4";
|
|
5
5
|
import dayjs from "dayjs";
|
|
6
6
|
import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
|
|
@@ -14,16 +14,31 @@ import { Result, resultify } from "t-result";
|
|
|
14
14
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
15
15
|
//#region ../sdk/src/defineEval.ts
|
|
16
16
|
const evalRegistry = /* @__PURE__ */ new Map();
|
|
17
|
+
const evalRegistryStorage = new AsyncLocalStorage();
|
|
17
18
|
/** Return the in-memory registry of evals defined in the current process. */
|
|
18
19
|
function getEvalRegistry() {
|
|
19
|
-
return evalRegistry;
|
|
20
|
+
return evalRegistryStorage.getStore() ?? evalRegistry;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Execute a callback with an empty async-local eval registry.
|
|
24
|
+
*
|
|
25
|
+
* Runner internals use this when importing eval modules concurrently so
|
|
26
|
+
* `defineEval(...)` calls from one import cannot overwrite another import's
|
|
27
|
+
* registered definitions. The callback receives the scoped registry populated
|
|
28
|
+
* during its async execution.
|
|
29
|
+
*/
|
|
30
|
+
async function runWithEvalRegistry(fn) {
|
|
31
|
+
const scopedRegistry = /* @__PURE__ */ new Map();
|
|
32
|
+
return await evalRegistryStorage.run(scopedRegistry, async () => {
|
|
33
|
+
return await fn(scopedRegistry);
|
|
34
|
+
});
|
|
20
35
|
}
|
|
21
36
|
/**
|
|
22
37
|
* Register an eval definition with the SDK so the runner can discover it
|
|
23
38
|
* after importing the eval module.
|
|
24
39
|
*/
|
|
25
40
|
function defineEval(definition) {
|
|
26
|
-
|
|
41
|
+
getEvalRegistry().set(definition.id, {
|
|
27
42
|
id: definition.id,
|
|
28
43
|
title: definition.title,
|
|
29
44
|
use: (fn) => fn(definition)
|
|
@@ -608,13 +623,18 @@ const evalFreshnessStatusSchema = z.enum([
|
|
|
608
623
|
"stale",
|
|
609
624
|
"outdated"
|
|
610
625
|
]);
|
|
611
|
-
/**
|
|
626
|
+
/**
|
|
627
|
+
* Reducer used to collapse per-case values into a single duration or column
|
|
628
|
+
* stat.
|
|
629
|
+
* `best` selects the highest finite value and `worst` selects the lowest.
|
|
630
|
+
*/
|
|
612
631
|
const evalStatAggregateSchema = z.enum([
|
|
613
632
|
"avg",
|
|
614
633
|
"min",
|
|
615
634
|
"max",
|
|
616
635
|
"sum",
|
|
617
|
-
"
|
|
636
|
+
"best",
|
|
637
|
+
"worst"
|
|
618
638
|
]);
|
|
619
639
|
const hideIfNoValueShape = {
|
|
620
640
|
/**
|
|
@@ -624,10 +644,12 @@ const hideIfNoValueShape = {
|
|
|
624
644
|
*/
|
|
625
645
|
hideIfNoValue: z.boolean().optional() };
|
|
626
646
|
/**
|
|
627
|
-
* One entry in the EvalCard stats row. Built-in kinds
|
|
628
|
-
* `
|
|
629
|
-
* `evalTracer.cache(...)` refs, not
|
|
630
|
-
*
|
|
647
|
+
* One entry in the EvalCard stats row. Built-in kinds read from the latest run;
|
|
648
|
+
* `duration` aggregates per-case durations, `cacheHits` counts Agent Eval
|
|
649
|
+
* operation-level cache hits from spans and `evalTracer.cache(...)` refs, not
|
|
650
|
+
* LLM provider prompt-cache read tokens. Cache hits use an independent
|
|
651
|
+
* aggregate mode and default to `sum`. `column` aggregates a score or numeric
|
|
652
|
+
* output column across the latest run.
|
|
631
653
|
*/
|
|
632
654
|
const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
633
655
|
z.object({
|
|
@@ -641,10 +663,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
641
663
|
}),
|
|
642
664
|
z.object({
|
|
643
665
|
kind: z.literal("duration"),
|
|
666
|
+
aggregate: evalStatAggregateSchema.optional(),
|
|
644
667
|
...hideIfNoValueShape
|
|
645
668
|
}),
|
|
646
669
|
z.object({
|
|
647
670
|
kind: z.literal("cacheHits"),
|
|
671
|
+
aggregate: evalStatAggregateSchema.optional(),
|
|
648
672
|
...hideIfNoValueShape
|
|
649
673
|
}),
|
|
650
674
|
z.object({
|
|
@@ -704,6 +728,11 @@ z.object({
|
|
|
704
728
|
*/
|
|
705
729
|
stats: evalStatsConfigSchema.optional(),
|
|
706
730
|
/**
|
|
731
|
+
* Initial aggregate mode used for duration and column stats on this eval
|
|
732
|
+
* card. Overrides workspace-level `defaultStatAggregate` when present.
|
|
733
|
+
*/
|
|
734
|
+
defaultStatAggregate: evalStatAggregateSchema.optional(),
|
|
735
|
+
/**
|
|
707
736
|
* Ordered per-eval history chart configuration for the EvalCard. Opt-in:
|
|
708
737
|
* when omitted or empty, the UI renders no history chart at all.
|
|
709
738
|
*/
|
|
@@ -1338,6 +1367,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1338
1367
|
columns: evalColumnsSchema.optional(),
|
|
1339
1368
|
deriveFromTracing: evalDeriveConfigSchema.optional(),
|
|
1340
1369
|
stats: evalStatsConfigSchema.optional(),
|
|
1370
|
+
defaultStatAggregate: evalStatAggregateSchema.optional(),
|
|
1341
1371
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
1342
1372
|
removeDefaultConfig: removeDefaultConfigSchema.optional(),
|
|
1343
1373
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
@@ -4924,12 +4954,13 @@ function createFsCacheStore(options) {
|
|
|
4924
4954
|
},
|
|
4925
4955
|
async lookup(namespace, keyHash) {
|
|
4926
4956
|
const entry = await readCacheEntry(cacheDir, namespace, keyHash);
|
|
4927
|
-
return entry === null ? null : await
|
|
4957
|
+
return entry === null ? null : await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
|
|
4928
4958
|
},
|
|
4929
4959
|
async lookupWithDebug(namespace, keyHash) {
|
|
4930
4960
|
const rawEntry = await readCacheEntry(cacheDir, namespace, keyHash);
|
|
4931
4961
|
if (rawEntry === null) return null;
|
|
4932
|
-
const entry = await
|
|
4962
|
+
const entry = await materializeExternalJsonCacheEntryOrNull(rawEntry, externalJsonStore);
|
|
4963
|
+
if (entry === null) return null;
|
|
4933
4964
|
const debugKey = await readDebugEntry(debugDir, namespace, keyHash);
|
|
4934
4965
|
const deserializedEntry = {
|
|
4935
4966
|
...entry,
|
|
@@ -5026,7 +5057,7 @@ function createBufferedCacheStore(backingStore) {
|
|
|
5026
5057
|
externalJsonStore: backingStore.externalJsonStore,
|
|
5027
5058
|
async lookup(namespace, keyHash) {
|
|
5028
5059
|
const buffered = pendingEntries.get(toPendingKey(namespace, keyHash));
|
|
5029
|
-
if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await
|
|
5060
|
+
if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntryOrNull(buffered.entry, backingStore.externalJsonStore);
|
|
5030
5061
|
return backingStore.lookup(namespace, keyHash);
|
|
5031
5062
|
},
|
|
5032
5063
|
write(entry, debugKey) {
|
|
@@ -5273,6 +5304,10 @@ async function materializeExternalJsonCacheEntry(entry, store) {
|
|
|
5273
5304
|
recording: cacheRecordingSchema.parse(await materializeExternalJsonValues(entry.recording, store))
|
|
5274
5305
|
};
|
|
5275
5306
|
}
|
|
5307
|
+
async function materializeExternalJsonCacheEntryOrNull(entry, store) {
|
|
5308
|
+
const result = await resultify(() => materializeExternalJsonCacheEntry(entry, store));
|
|
5309
|
+
return result.error ? null : result.value;
|
|
5310
|
+
}
|
|
5276
5311
|
async function pruneExternalJsonBlobs(cacheDir, blobDir) {
|
|
5277
5312
|
if (!existsSync(blobDir)) return;
|
|
5278
5313
|
const referenced = await collectReferencedExternalJsonBlobPaths(cacheDir);
|
|
@@ -5864,6 +5899,7 @@ function resolveEvalDefaultConfig(params) {
|
|
|
5864
5899
|
globalRemove: params.globalRemove,
|
|
5865
5900
|
evalRemove
|
|
5866
5901
|
}),
|
|
5902
|
+
defaultStatAggregate: params.evalDef.defaultStatAggregate ?? params.globalDefaultStatAggregate,
|
|
5867
5903
|
charts: appendDefaultCharts({
|
|
5868
5904
|
charts: params.evalDef.charts,
|
|
5869
5905
|
globalRemove: params.globalRemove,
|
|
@@ -6072,6 +6108,122 @@ async function loadEvalModule(filePath, sourceFingerprint = void 0) {
|
|
|
6072
6108
|
await import(moduleUrl.href);
|
|
6073
6109
|
}
|
|
6074
6110
|
//#endregion
|
|
6111
|
+
//#region ../runner/src/moduleIsolation.ts
|
|
6112
|
+
const isolationParam = "agent-evals-isolate";
|
|
6113
|
+
const pathSegmentSeparatorPattern = /[\\/]+/;
|
|
6114
|
+
const isolationStorage = new AsyncLocalStorage();
|
|
6115
|
+
const activeIsolationRoots = /* @__PURE__ */ new Map();
|
|
6116
|
+
const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
|
|
6117
|
+
let hooksRegistered = false;
|
|
6118
|
+
const requireFromRunner = createRequire(import.meta.url);
|
|
6119
|
+
const agentPackageUrlBySpecifier = new Map([
|
|
6120
|
+
"@ls-stack/agent-eval",
|
|
6121
|
+
"@agent-evals/sdk",
|
|
6122
|
+
"@agent-evals/shared",
|
|
6123
|
+
"@agent-evals/runner",
|
|
6124
|
+
"@agent-evals/runner/run-child"
|
|
6125
|
+
].flatMap((specifier) => {
|
|
6126
|
+
try {
|
|
6127
|
+
return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
|
|
6128
|
+
} catch {
|
|
6129
|
+
return [];
|
|
6130
|
+
}
|
|
6131
|
+
}));
|
|
6132
|
+
const agentPackageDirectoryPaths = [...new Set([...agentPackageUrlBySpecifier.values()].map((packageUrl) => dirname(fileURLToPath(packageUrl))))];
|
|
6133
|
+
function isAgentEvalsPackageSpecifier(specifier) {
|
|
6134
|
+
return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
|
|
6135
|
+
}
|
|
6136
|
+
function getIsolationKeyFromParent(parentURL) {
|
|
6137
|
+
if (!parentURL?.startsWith("file:")) return null;
|
|
6138
|
+
const value = new URL(parentURL).searchParams.get(isolationParam);
|
|
6139
|
+
return activeIsolationRoots.has(value ?? "") ? value : null;
|
|
6140
|
+
}
|
|
6141
|
+
function isIsolatableFile(url, workspaceRoot) {
|
|
6142
|
+
if (url.protocol !== "file:") return false;
|
|
6143
|
+
return isIsolatableFilePath(fileURLToPath(url), workspaceRoot);
|
|
6144
|
+
}
|
|
6145
|
+
function isIsolatableFilePath(filePath, workspaceRoot) {
|
|
6146
|
+
if (isAgentEvalsPackageFilePath(filePath)) return false;
|
|
6147
|
+
const relativePath = relative(workspaceRoot, filePath);
|
|
6148
|
+
if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
|
|
6149
|
+
return !relativePath.split(pathSegmentSeparatorPattern).includes(".agent-evals");
|
|
6150
|
+
}
|
|
6151
|
+
function isAgentEvalsPackageFilePath(filePath) {
|
|
6152
|
+
return agentPackageDirectoryPaths.some((packageDirectoryPath) => {
|
|
6153
|
+
const packageRelativePath = relative(packageDirectoryPath, filePath);
|
|
6154
|
+
return packageRelativePath === "" || !packageRelativePath.startsWith("..") && !isAbsolute(packageRelativePath);
|
|
6155
|
+
});
|
|
6156
|
+
}
|
|
6157
|
+
function addIsolationParam(url, key) {
|
|
6158
|
+
const moduleUrl = new URL(url);
|
|
6159
|
+
if (moduleUrl.searchParams.get(isolationParam) === key) return url;
|
|
6160
|
+
moduleUrl.searchParams.set(isolationParam, key);
|
|
6161
|
+
return moduleUrl.href;
|
|
6162
|
+
}
|
|
6163
|
+
function registerModuleIsolationHooks() {
|
|
6164
|
+
if (hooksRegistered) return;
|
|
6165
|
+
hooksRegistered = true;
|
|
6166
|
+
registerHooks({ resolve(specifier, context, nextResolve) {
|
|
6167
|
+
const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
|
|
6168
|
+
if (agentPackageUrl !== void 0) return {
|
|
6169
|
+
url: agentPackageUrl,
|
|
6170
|
+
shortCircuit: true
|
|
6171
|
+
};
|
|
6172
|
+
const resolved = nextResolve(specifier, context);
|
|
6173
|
+
if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
|
|
6174
|
+
const activeContext = isolationStorage.getStore();
|
|
6175
|
+
const inferredKey = getIsolationKeyFromParent(context.parentURL);
|
|
6176
|
+
const isolationKey = activeContext?.key ?? inferredKey;
|
|
6177
|
+
if (isolationKey === null) return resolved;
|
|
6178
|
+
const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
|
|
6179
|
+
if (workspaceRoot === void 0) return resolved;
|
|
6180
|
+
if (!isIsolatableFile(new URL(resolved.url), workspaceRoot)) return resolved;
|
|
6181
|
+
return {
|
|
6182
|
+
...resolved,
|
|
6183
|
+
url: addIsolationParam(resolved.url, isolationKey)
|
|
6184
|
+
};
|
|
6185
|
+
} });
|
|
6186
|
+
}
|
|
6187
|
+
function clearWorkspaceRequireCacheOnce(context) {
|
|
6188
|
+
if (clearedRequireCacheKeys.has(context.key)) return;
|
|
6189
|
+
clearedRequireCacheKeys.add(context.key);
|
|
6190
|
+
for (const filePath of Object.keys(requireFromRunner.cache)) if (isIsolatableFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
|
|
6191
|
+
}
|
|
6192
|
+
/**
|
|
6193
|
+
* Execute module loading and eval code with fresh module URLs.
|
|
6194
|
+
*
|
|
6195
|
+
* Node does not expose an ESM cache reset API, so the runner appends a
|
|
6196
|
+
* scoped query parameter to workspace and dependency file imports. CommonJS
|
|
6197
|
+
* modules use `require.cache` behind ESM imports, so isolatable entries are
|
|
6198
|
+
* cleared once per scope. Agent Evals package imports are left alone so SDK
|
|
6199
|
+
* singletons, such as the eval registry, remain shared.
|
|
6200
|
+
*/
|
|
6201
|
+
async function runWithModuleIsolation(context, fn) {
|
|
6202
|
+
registerModuleIsolationHooks();
|
|
6203
|
+
clearWorkspaceRequireCacheOnce(context);
|
|
6204
|
+
activeIsolationRoots.set(context.key, context.workspaceRoot);
|
|
6205
|
+
return await isolationStorage.run(context, fn);
|
|
6206
|
+
}
|
|
6207
|
+
//#endregion
|
|
6208
|
+
//#region ../runner/src/evalRegistryLoader.ts
|
|
6209
|
+
async function loadIsolatedEvalRegistry(params) {
|
|
6210
|
+
return await runWithEvalRegistry(async (registry) => {
|
|
6211
|
+
await runWithModuleIsolation(params.moduleIsolation, async () => {
|
|
6212
|
+
await runInEvalRuntimeScope(params.runtimeScope, async () => {
|
|
6213
|
+
await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
|
|
6214
|
+
});
|
|
6215
|
+
});
|
|
6216
|
+
return registry;
|
|
6217
|
+
});
|
|
6218
|
+
}
|
|
6219
|
+
async function useIsolatedEvalDefinition(params) {
|
|
6220
|
+
const entry = (await loadIsolatedEvalRegistry(params)).get(params.evalId);
|
|
6221
|
+
if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`);
|
|
6222
|
+
return await entry.use(async (evalDef) => {
|
|
6223
|
+
return await params.use(evalDef);
|
|
6224
|
+
});
|
|
6225
|
+
}
|
|
6226
|
+
//#endregion
|
|
6075
6227
|
//#region ../runner/src/freshness.ts
|
|
6076
6228
|
/**
|
|
6077
6229
|
* Derive eval freshness from the latest run, current eval-file fingerprint,
|
|
@@ -6872,96 +7024,6 @@ function stripTerminalControlCodes(value) {
|
|
|
6872
7024
|
return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
|
|
6873
7025
|
}
|
|
6874
7026
|
//#endregion
|
|
6875
|
-
//#region ../runner/src/moduleIsolation.ts
|
|
6876
|
-
const isolationParam = "agent-evals-isolate";
|
|
6877
|
-
const pathSegmentSeparatorPattern = /[\\/]+/;
|
|
6878
|
-
const isolationStorage = new AsyncLocalStorage();
|
|
6879
|
-
const activeIsolationRoots = /* @__PURE__ */ new Map();
|
|
6880
|
-
const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
|
|
6881
|
-
let hooksRegistered = false;
|
|
6882
|
-
const requireFromRunner = createRequire(import.meta.url);
|
|
6883
|
-
const agentPackageUrlBySpecifier = new Map([
|
|
6884
|
-
"@ls-stack/agent-eval",
|
|
6885
|
-
"@agent-evals/sdk",
|
|
6886
|
-
"@agent-evals/shared",
|
|
6887
|
-
"@agent-evals/runner",
|
|
6888
|
-
"@agent-evals/runner/run-child"
|
|
6889
|
-
].flatMap((specifier) => {
|
|
6890
|
-
try {
|
|
6891
|
-
return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
|
|
6892
|
-
} catch {
|
|
6893
|
-
return [];
|
|
6894
|
-
}
|
|
6895
|
-
}));
|
|
6896
|
-
function isAgentEvalsPackageSpecifier(specifier) {
|
|
6897
|
-
return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
|
|
6898
|
-
}
|
|
6899
|
-
function getIsolationKeyFromParent(parentURL) {
|
|
6900
|
-
if (!parentURL?.startsWith("file:")) return null;
|
|
6901
|
-
const value = new URL(parentURL).searchParams.get(isolationParam);
|
|
6902
|
-
return activeIsolationRoots.has(value ?? "") ? value : null;
|
|
6903
|
-
}
|
|
6904
|
-
function isWorkspaceFile(url, workspaceRoot) {
|
|
6905
|
-
if (url.protocol !== "file:") return false;
|
|
6906
|
-
return isWorkspaceFilePath(fileURLToPath(url), workspaceRoot);
|
|
6907
|
-
}
|
|
6908
|
-
function isWorkspaceFilePath(filePath, workspaceRoot) {
|
|
6909
|
-
const relativePath = relative(workspaceRoot, filePath);
|
|
6910
|
-
if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
|
|
6911
|
-
const segments = relativePath.split(pathSegmentSeparatorPattern);
|
|
6912
|
-
return !segments.includes("node_modules") && !segments.includes(".agent-evals");
|
|
6913
|
-
}
|
|
6914
|
-
function addIsolationParam(url, key) {
|
|
6915
|
-
const moduleUrl = new URL(url);
|
|
6916
|
-
if (moduleUrl.searchParams.get(isolationParam) === key) return url;
|
|
6917
|
-
moduleUrl.searchParams.set(isolationParam, key);
|
|
6918
|
-
return moduleUrl.href;
|
|
6919
|
-
}
|
|
6920
|
-
function registerModuleIsolationHooks() {
|
|
6921
|
-
if (hooksRegistered) return;
|
|
6922
|
-
hooksRegistered = true;
|
|
6923
|
-
registerHooks({ resolve(specifier, context, nextResolve) {
|
|
6924
|
-
const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
|
|
6925
|
-
if (agentPackageUrl !== void 0) return {
|
|
6926
|
-
url: agentPackageUrl,
|
|
6927
|
-
shortCircuit: true
|
|
6928
|
-
};
|
|
6929
|
-
const resolved = nextResolve(specifier, context);
|
|
6930
|
-
if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
|
|
6931
|
-
const activeContext = isolationStorage.getStore();
|
|
6932
|
-
const inferredKey = getIsolationKeyFromParent(context.parentURL);
|
|
6933
|
-
const isolationKey = activeContext?.key ?? inferredKey;
|
|
6934
|
-
if (isolationKey === null) return resolved;
|
|
6935
|
-
const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
|
|
6936
|
-
if (workspaceRoot === void 0) return resolved;
|
|
6937
|
-
if (!isWorkspaceFile(new URL(resolved.url), workspaceRoot)) return resolved;
|
|
6938
|
-
return {
|
|
6939
|
-
...resolved,
|
|
6940
|
-
url: addIsolationParam(resolved.url, isolationKey)
|
|
6941
|
-
};
|
|
6942
|
-
} });
|
|
6943
|
-
}
|
|
6944
|
-
function clearWorkspaceRequireCacheOnce(context) {
|
|
6945
|
-
if (clearedRequireCacheKeys.has(context.key)) return;
|
|
6946
|
-
clearedRequireCacheKeys.add(context.key);
|
|
6947
|
-
for (const filePath of Object.keys(requireFromRunner.cache)) if (isWorkspaceFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
|
|
6948
|
-
}
|
|
6949
|
-
/**
|
|
6950
|
-
* Execute module loading and eval code with fresh workspace module URLs.
|
|
6951
|
-
*
|
|
6952
|
-
* Node does not expose an ESM cache reset API, so the runner appends a
|
|
6953
|
-
* run-scoped query parameter to workspace file imports. CommonJS modules use
|
|
6954
|
-
* `require.cache` behind ESM imports, so workspace entries are cleared once per
|
|
6955
|
-
* run. Package imports are left alone so SDK singletons, such as the eval
|
|
6956
|
-
* registry, remain shared.
|
|
6957
|
-
*/
|
|
6958
|
-
async function runWithModuleIsolation(context, fn) {
|
|
6959
|
-
registerModuleIsolationHooks();
|
|
6960
|
-
clearWorkspaceRequireCacheOnce(context);
|
|
6961
|
-
activeIsolationRoots.set(context.key, context.workspaceRoot);
|
|
6962
|
-
return await isolationStorage.run(context, fn);
|
|
6963
|
-
}
|
|
6964
|
-
//#endregion
|
|
6965
7027
|
//#region ../runner/src/runExecution.ts
|
|
6966
7028
|
function filterEvalCases(cases, caseIds) {
|
|
6967
7029
|
if (!caseIds || caseIds.length === 0) return cases;
|
|
@@ -7432,6 +7494,30 @@ function getTargetEvalKeys(params) {
|
|
|
7432
7494
|
}
|
|
7433
7495
|
//#endregion
|
|
7434
7496
|
//#region ../runner/src/runOrchestration.ts
|
|
7497
|
+
function toOptionalSourceFingerprint(sourceFingerprint) {
|
|
7498
|
+
return sourceFingerprint.length > 0 ? sourceFingerprint : void 0;
|
|
7499
|
+
}
|
|
7500
|
+
function buildCaseModuleIsolation(params) {
|
|
7501
|
+
return {
|
|
7502
|
+
key: [
|
|
7503
|
+
params.runId,
|
|
7504
|
+
params.evalKey,
|
|
7505
|
+
params.caseId,
|
|
7506
|
+
`trial-${String(params.trial)}`
|
|
7507
|
+
].join(":"),
|
|
7508
|
+
workspaceRoot: params.workspaceRoot
|
|
7509
|
+
};
|
|
7510
|
+
}
|
|
7511
|
+
function buildEvalPreparationModuleIsolation(params) {
|
|
7512
|
+
return {
|
|
7513
|
+
key: [
|
|
7514
|
+
params.runId,
|
|
7515
|
+
params.evalKey,
|
|
7516
|
+
"prepare"
|
|
7517
|
+
].join(":"),
|
|
7518
|
+
workspaceRoot: params.workspaceRoot
|
|
7519
|
+
};
|
|
7520
|
+
}
|
|
7435
7521
|
/**
|
|
7436
7522
|
* Ranks case statuses from worst to best. Used to order trial attempts so the
|
|
7437
7523
|
* pessimistic (`lowestScore`) strategy can pick the worst attempt. Any
|
|
@@ -7569,14 +7655,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7569
7655
|
const preparedEvals = [];
|
|
7570
7656
|
const cacheMode = runState.manifest.cacheMode ?? "use";
|
|
7571
7657
|
const cacheEnabled = config.cache?.enabled !== false;
|
|
7572
|
-
const moduleIsolation = {
|
|
7573
|
-
key: runState.manifest.id,
|
|
7574
|
-
workspaceRoot
|
|
7575
|
-
};
|
|
7576
7658
|
const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
|
|
7577
7659
|
const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
7578
7660
|
for (const evalMeta of targetEvals) {
|
|
7579
7661
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
7662
|
+
const evalModuleIsolation = buildEvalPreparationModuleIsolation({
|
|
7663
|
+
runId: runState.manifest.id,
|
|
7664
|
+
evalKey: evalMeta.key,
|
|
7665
|
+
workspaceRoot
|
|
7666
|
+
});
|
|
7580
7667
|
let sourceFingerprint = "";
|
|
7581
7668
|
try {
|
|
7582
7669
|
sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
|
|
@@ -7591,13 +7678,12 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7591
7678
|
evalMeta.sourceFingerprint = null;
|
|
7592
7679
|
}
|
|
7593
7680
|
try {
|
|
7594
|
-
const
|
|
7595
|
-
|
|
7596
|
-
|
|
7597
|
-
|
|
7598
|
-
|
|
7599
|
-
});
|
|
7600
|
-
const entry = registry.get(evalMeta.id);
|
|
7681
|
+
const entry = (await loadIsolatedEvalRegistry({
|
|
7682
|
+
evalFilePath,
|
|
7683
|
+
sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
|
|
7684
|
+
moduleIsolation: evalModuleIsolation,
|
|
7685
|
+
runtimeScope: "env"
|
|
7686
|
+
})).get(evalMeta.id);
|
|
7601
7687
|
if (!entry) {
|
|
7602
7688
|
evalErrors.push({
|
|
7603
7689
|
evalId: evalMeta.id,
|
|
@@ -7605,7 +7691,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7605
7691
|
});
|
|
7606
7692
|
continue;
|
|
7607
7693
|
}
|
|
7608
|
-
await runWithModuleIsolation(
|
|
7694
|
+
await runWithModuleIsolation(evalModuleIsolation, async () => {
|
|
7609
7695
|
await runInEvalRuntimeScope("cases", async () => {
|
|
7610
7696
|
await entry.use(async (evalDef) => {
|
|
7611
7697
|
const evalTagsResult = resolveEvalTags({
|
|
@@ -7668,6 +7754,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7668
7754
|
evalDef,
|
|
7669
7755
|
globalColumns: config.columns,
|
|
7670
7756
|
globalStats: config.stats,
|
|
7757
|
+
globalDefaultStatAggregate: config.defaultStatAggregate,
|
|
7671
7758
|
globalRemove: config.removeDefaultConfig
|
|
7672
7759
|
});
|
|
7673
7760
|
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
@@ -7679,6 +7766,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7679
7766
|
for (const warning of validatedCharts.warnings) console.warn(warning);
|
|
7680
7767
|
evalMeta.columnDefs = declaredColumnDefs;
|
|
7681
7768
|
evalMeta.stats = defaultConfig.stats;
|
|
7769
|
+
evalMeta.defaultStatAggregate = defaultConfig.defaultStatAggregate;
|
|
7682
7770
|
evalMeta.charts = validatedCharts.charts;
|
|
7683
7771
|
const evalCaseRows = [];
|
|
7684
7772
|
const preparedCases = [];
|
|
@@ -7701,29 +7789,43 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7701
7789
|
preparedCases.push(preparedCase);
|
|
7702
7790
|
for (let trial = 0; trial < request.trials; trial++) {
|
|
7703
7791
|
const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
|
|
7792
|
+
const caseModuleIsolation = buildCaseModuleIsolation({
|
|
7793
|
+
runId: runState.manifest.id,
|
|
7794
|
+
evalKey: evalMeta.key,
|
|
7795
|
+
caseId: evalCase.id,
|
|
7796
|
+
trial,
|
|
7797
|
+
workspaceRoot
|
|
7798
|
+
});
|
|
7704
7799
|
queuedCases.push({
|
|
7705
7800
|
execute: async ({ startTime, globalTraceDisplay }) => {
|
|
7706
|
-
const { caseDetail, caseRowUpdate } = await
|
|
7707
|
-
evalDef,
|
|
7801
|
+
const { caseDetail, caseRowUpdate } = await useIsolatedEvalDefinition({
|
|
7708
7802
|
evalId: evalMeta.id,
|
|
7709
|
-
evalKey: evalMeta.key,
|
|
7710
|
-
evalCase,
|
|
7711
|
-
globalTraceDisplay,
|
|
7712
|
-
globalColumns: config.columns,
|
|
7713
|
-
globalDeriveFromTracing: config.deriveFromTracing,
|
|
7714
|
-
llmCallsConfig,
|
|
7715
|
-
apiCallsConfig,
|
|
7716
|
-
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
7717
|
-
trial,
|
|
7718
|
-
startTime,
|
|
7719
|
-
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
7720
|
-
cacheMode,
|
|
7721
|
-
moduleIsolation,
|
|
7722
7803
|
evalFilePath,
|
|
7723
|
-
|
|
7724
|
-
|
|
7725
|
-
|
|
7726
|
-
|
|
7804
|
+
sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
|
|
7805
|
+
moduleIsolation: caseModuleIsolation,
|
|
7806
|
+
runtimeScope: "env",
|
|
7807
|
+
use: async (isolatedEvalDef) => await runCase({
|
|
7808
|
+
evalDef: isolatedEvalDef,
|
|
7809
|
+
evalId: evalMeta.id,
|
|
7810
|
+
evalKey: evalMeta.key,
|
|
7811
|
+
evalCase,
|
|
7812
|
+
globalTraceDisplay,
|
|
7813
|
+
globalColumns: config.columns,
|
|
7814
|
+
globalDeriveFromTracing: config.deriveFromTracing,
|
|
7815
|
+
llmCallsConfig,
|
|
7816
|
+
apiCallsConfig,
|
|
7817
|
+
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
7818
|
+
trial,
|
|
7819
|
+
startTime,
|
|
7820
|
+
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
7821
|
+
cacheMode,
|
|
7822
|
+
moduleIsolation: caseModuleIsolation,
|
|
7823
|
+
evalFilePath,
|
|
7824
|
+
evalFileRelativePath: evalMeta.filePath,
|
|
7825
|
+
workspaceRoot,
|
|
7826
|
+
artifactDir: join(runDir, "artifacts"),
|
|
7827
|
+
runId: runState.manifest.id
|
|
7828
|
+
})
|
|
7727
7829
|
});
|
|
7728
7830
|
return {
|
|
7729
7831
|
caseDetail,
|
|
@@ -7874,4 +7976,4 @@ function toLastRunStatus(status) {
|
|
|
7874
7976
|
return status === "pending" ? null : status;
|
|
7875
7977
|
}
|
|
7876
7978
|
//#endregion
|
|
7877
|
-
export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F,
|
|
7979
|
+
export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, evalStatAggregateSchema as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadIsolatedEvalRegistry as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-CFQ8LZmY.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-rvPrUj6S.mjs";
|
|
2
|
+
import "./src-DEENkbkn.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BWyE5lRX.mjs";
|
|
2
|
+
import "./cli-rvPrUj6S.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.55.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -30,7 +30,8 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
30
30
|
`agent-evals.config.ts` to opt into run-all CLI behavior.
|
|
31
31
|
- `agent-evals run --temporary` persists a run like normal history, but deletes
|
|
32
32
|
it before the next run starts. Temporary runs appear in `show-runs` while
|
|
33
|
-
present; normal runs are never deleted by temporary-run cleanup.
|
|
33
|
+
present; normal runs are never deleted by temporary-run cleanup. In the app,
|
|
34
|
+
the run drawer can promote a temporary run to durable history.
|
|
34
35
|
- `agent-evals app` watches `agent-evals.config.ts` and reloads config in
|
|
35
36
|
place when the runner is idle. If config changes during an active run, the
|
|
36
37
|
reload applies after the current run reaches a terminal state.
|
|
@@ -415,10 +416,17 @@ definition. Global `stats` in `agent-evals.config.ts` combine with eval-level
|
|
|
415
416
|
stats. Native stat kinds include `cases`, `passRate`, `duration`, and
|
|
416
417
|
`cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total
|
|
417
418
|
cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not
|
|
418
|
-
LLM provider prompt-cache read tokens such as `cachedInputTokens`.
|
|
419
|
+
LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit
|
|
420
|
+
stats use a separate aggregate control and default to `sum`; `avg` is average
|
|
421
|
+
per-case hit rate, and min/max/best/worst select cases by hit rate. `duration`
|
|
422
|
+
aggregates per-case durations using the same modes as column stats. Usage stats
|
|
419
423
|
and LLM usage charts are added by default unless removed with
|
|
420
424
|
`removeDefaultConfig`. Column stats can override `format` and `numberFormat`,
|
|
421
|
-
otherwise they inherit from the matching column.
|
|
425
|
+
otherwise they inherit from the matching column. Duration and column stat
|
|
426
|
+
aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value),
|
|
427
|
+
and `worst` (lowest finite value). Use `defaultStatAggregate` in
|
|
428
|
+
`agent-evals.config.ts` to set the workspace-wide initial duration/column stat
|
|
429
|
+
mode, or on an eval definition to override it for that eval. Number formats use
|
|
422
430
|
`maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing
|
|
423
431
|
zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats
|
|
424
432
|
and charts support `hideIfNoValue: true`. Charts support
|
|
@@ -566,6 +574,9 @@ For true module replacement inside an eval, register `mock.module(...)` from
|
|
|
566
574
|
Node's `--experimental-test-module-mocks` flag automatically for CLI and app
|
|
567
575
|
runs. Use dynamic
|
|
568
576
|
`import(...)` inside `execute` — static imports happen too early.
|
|
577
|
+
Each case/trial reloads the eval module graph in its own isolation scope, so
|
|
578
|
+
module-level mock state in workspace files and ESM dependencies does not leak
|
|
579
|
+
between concurrent cases.
|
|
569
580
|
|
|
570
581
|
```ts
|
|
571
582
|
import { mock } from 'node:test';
|