@ls-stack/agent-eval 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DKWm1oxc.mjs → app-B8e-oWYc.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-9hO8NpgZ.js +117 -0
- package/dist/apps/web/dist/assets/{index-BVnLr79e.css → index-MARPw1bH.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-CMiCEQ-3.mjs → cli-BmrtjQj_.mjs} +155 -99
- package/dist/index.d.mts +40 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +41 -3
- package/dist/{runOrchestration-H0pSUl3I.mjs → runOrchestration-BDyNrRQT.mjs} +271 -124
- package/dist/{runner-DLnj18MO.mjs → runner-CsZqhbiA.mjs} +2 -2
- package/dist/{runner-Dx1sMCbh.mjs → runner-DABFPXkx.mjs} +1 -1
- package/dist/src-CEAJYN_X.mjs +3 -0
- package/package.json +5 -4
- package/skills/agent-eval/SKILL.md +408 -0
- package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +0 -117
- package/dist/src-BgGL7DDp.mjs +0 -3
|
@@ -46,7 +46,9 @@ function repoFile(path, mimeType) {
|
|
|
46
46
|
//#endregion
|
|
47
47
|
//#region ../sdk/src/runtime.ts
|
|
48
48
|
const scopeStorage = new AsyncLocalStorage();
|
|
49
|
+
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
49
50
|
let activeEvalScopeCount = 0;
|
|
51
|
+
let activeEvalRuntimeScopeCount = 0;
|
|
50
52
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
51
53
|
var EvalAssertionError = class extends Error {
|
|
52
54
|
constructor(message) {
|
|
@@ -60,13 +62,16 @@ function getCurrentScope() {
|
|
|
60
62
|
return scopeStorage.getStore();
|
|
61
63
|
}
|
|
62
64
|
/**
|
|
63
|
-
* Return
|
|
65
|
+
* Return the current eval runner phase for this async execution.
|
|
64
66
|
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
+
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
68
|
+
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
69
|
+
* case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
|
|
70
|
+
* while validating outputs, and `scorer` while computing scores.
|
|
67
71
|
*/
|
|
68
72
|
function isInEvalScope() {
	// The counter is a cheap guard: when no runtime scope is active anywhere,
	// skip the AsyncLocalStorage lookup entirely.
	const hasActiveRuntimeScope = activeEvalRuntimeScopeCount !== 0;
	if (!hasActiveRuntimeScope) return null;
	// A missing store (this async context is outside any runtime scope) is
	// normalized to `null`.
	const currentPhase = runtimeScopeStorage.getStore();
	return currentPhase ?? null;
}
|
|
71
76
|
function registerBackgroundJobInScope(scope, promise) {
|
|
72
77
|
const trackedPromise = promise.then(() => {
|
|
@@ -122,6 +127,31 @@ function getEvalCaseInput(path = void 0) {
|
|
|
122
127
|
function setScopeCacheContext(scope, context) {
|
|
123
128
|
scope.cacheContext = context;
|
|
124
129
|
}
|
|
130
|
+
/**
 * Run `fn` with `runtimeScope` installed as the store of
 * `runtimeScopeStorage`, so `isInEvalScope()` reports that phase while the
 * callback executes.
 *
 * The active-scope counter is incremented before the callback starts and
 * decremented in `finally`, so it is restored whether `fn` resolves or throws.
 */
async function runInEvalRuntimeScope(runtimeScope, fn) {
	activeEvalRuntimeScopeCount += 1;
	try {
		const outcome = await runtimeScopeStorage.run(runtimeScope, fn);
		return outcome;
	} finally {
		activeEvalRuntimeScopeCount -= 1;
	}
}
|
|
139
|
+
/**
 * Run `fn` with an already-created case `scope` as the store of
 * `scopeStorage` and with `runtimeScope` as the reported runner phase.
 *
 * Runner-internal helper for phases that run after `execute` but still need
 * the completed case scope (outputs, trace, assertions, input helpers).
 * The active case-scope counter is restored in `finally`, so it is balanced
 * even when `fn` rejects.
 */
async function runInExistingEvalScope(scope, runtimeScope, fn) {
	const runWithPhase = async () => {
		return await runInEvalRuntimeScope(runtimeScope, fn);
	};
	activeEvalScopeCount += 1;
	try {
		const outcome = await scopeStorage.run(scope, runWithPhase);
		return outcome;
	} finally {
		activeEvalScopeCount -= 1;
	}
}
|
|
125
155
|
/**
|
|
126
156
|
* Execute a callback inside a fresh eval case scope and capture its outputs,
|
|
127
157
|
* trace data, and terminal error state.
|
|
@@ -144,29 +174,24 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
144
174
|
caseCacheRefs: [],
|
|
145
175
|
pendingBackgroundJobs: /* @__PURE__ */ new Set()
|
|
146
176
|
};
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
}
|
|
166
|
-
});
|
|
167
|
-
} finally {
|
|
168
|
-
activeEvalScopeCount--;
|
|
169
|
-
}
|
|
177
|
+
return await runInExistingEvalScope(scope, options.runtimeScope ?? "eval", async () => {
|
|
178
|
+
try {
|
|
179
|
+
const result = await fn();
|
|
180
|
+
if (options.waitForBackgroundJobs !== false) await drainBackgroundJobs(scope);
|
|
181
|
+
return {
|
|
182
|
+
result,
|
|
183
|
+
scope,
|
|
184
|
+
error: void 0
|
|
185
|
+
};
|
|
186
|
+
} catch (error) {
|
|
187
|
+
if (options.waitForBackgroundJobs !== false) await drainBackgroundJobs(scope);
|
|
188
|
+
return {
|
|
189
|
+
result: void 0,
|
|
190
|
+
scope,
|
|
191
|
+
error: error instanceof Error ? error : new Error(String(error))
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
});
|
|
170
195
|
}
|
|
171
196
|
/**
|
|
172
197
|
* Return the next deterministic ID for the active eval case execution.
|
|
@@ -3888,6 +3913,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
3888
3913
|
trialSelection: trialSelectionModeSchema.optional(),
|
|
3889
3914
|
concurrency: z.number().optional(),
|
|
3890
3915
|
staleAfterDays: z.number().optional(),
|
|
3916
|
+
allowCliRunAll: z.boolean().optional(),
|
|
3891
3917
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
3892
3918
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
3893
3919
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
@@ -4856,6 +4882,7 @@ const defaultConfig = {
|
|
|
4856
4882
|
trialSelection: "lowestScore",
|
|
4857
4883
|
concurrency: 2,
|
|
4858
4884
|
staleAfterDays: 14,
|
|
4885
|
+
allowCliRunAll: false,
|
|
4859
4886
|
traceDisplay: { attributes: [{
|
|
4860
4887
|
path: "input",
|
|
4861
4888
|
label: "Input",
|
|
@@ -4886,6 +4913,98 @@ async function loadConfig() {
|
|
|
4886
4913
|
}
|
|
4887
4914
|
}
|
|
4888
4915
|
//#endregion
|
|
4916
|
+
//#region ../runner/src/discovery.ts
|
|
4917
|
+
/** Captures the first quoted `id: '...'` / `id: "..."` value found in an object's text. */
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
/** Captures the first quoted `title: '...'` / `title: "..."` value found in an object's text. */
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
/**
 * Statically scan a source file for `defineEval(...)` calls and collect the
 * `id` (and optional `title`) of each one without executing the module.
 *
 * @param filePath Path recorded on every returned meta object.
 * @param content Source text of the file to scan.
 * @returns One `{ filePath, id, title? }` entry per `defineEval` call whose
 * object text contains a quoted `id`, in source order.
 */
function parseEvalMetas(filePath, content) {
	const marker = "defineEval";
	const metas = [];
	let searchIndex = 0;
	while (searchIndex < content.length) {
		const defineEvalIndex = content.indexOf(marker, searchIndex);
		if (defineEvalIndex === -1) break;
		const extracted = extractDefineEvalObject(content, defineEvalIndex);
		if (!extracted) {
			// Not a call site (import, comment mention, longer identifier, or an
			// unterminated object): step past this occurrence and keep looking.
			searchIndex = defineEvalIndex + marker.length;
			continue;
		}
		const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
		if (id !== void 0) {
			const result = {
				filePath,
				id
			};
			const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
			if (title !== void 0) result.title = title;
			metas.push(result);
		}
		searchIndex = extracted.nextIndex;
	}
	return metas;
}
/** Return the first index at or after `index` that is not whitespace. */
function skipDefineEvalWhitespace(content, index) {
	let cursor = index;
	while (cursor < content.length && /\s/.test(content[cursor])) cursor++;
	return cursor;
}
/**
 * Given the index of a `<` that opens a type-argument list, return the index
 * just past its matching `>`, or `-1` when it is never closed. The `>` of an
 * arrow (`=>`) is not counted as a closer.
 */
function skipDefineEvalTypeArguments(content, openIndex) {
	let depth = 0;
	for (let index = openIndex; index < content.length; index++) {
		const currentChar = content[index];
		if (currentChar === "<") depth++;
		else if (currentChar === ">" && content[index - 1] !== "=") {
			depth--;
			if (depth === 0) return index + 1;
		}
	}
	return -1;
}
/**
 * Extract the text of the first object literal passed to the `defineEval`
 * call that starts at `defineEvalIndex`.
 *
 * The occurrence only counts as a call when `defineEval` is a whole
 * identifier and is followed (after optional whitespace and an optional
 * `<...>` type-argument list) by `(`. This keeps import specifiers, comment
 * mentions, and names such as `defineEvalHelper` from latching onto the next
 * unrelated call in the file.
 *
 * Braces inside line comments, block comments, and quoted strings are
 * ignored while balancing. Template literals are treated as plain quoted
 * strings, so a backtick nested inside a `${}` interpolation ends the string
 * early as far as this scanner is concerned.
 *
 * @param content Source text being scanned.
 * @param defineEvalIndex Index in `content` where the text `defineEval` starts.
 * @returns `{ nextIndex, objectText }` where `objectText` spans the balanced
 * `{...}` and `nextIndex` is the index just past it, or `undefined` when no
 * call or no balanced object is found.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
	const identifierCharPattern = /[\w$]/;
	const precedingChar = content[defineEvalIndex - 1];
	if (precedingChar !== void 0 && identifierCharPattern.test(precedingChar)) return void 0;
	let cursor = defineEvalIndex + "defineEval".length;
	const followingChar = content[cursor];
	if (followingChar !== void 0 && identifierCharPattern.test(followingChar)) return void 0;
	cursor = skipDefineEvalWhitespace(content, cursor);
	if (content[cursor] === "<") {
		cursor = skipDefineEvalTypeArguments(content, cursor);
		if (cursor === -1) return void 0;
		cursor = skipDefineEvalWhitespace(content, cursor);
	}
	if (content[cursor] !== "(") return void 0;
	const openParenIndex = cursor;
	const objectStartIndex = content.indexOf("{", openParenIndex);
	if (objectStartIndex === -1) return void 0;
	let depth = 0;
	let quote;
	let inBlockComment = false;
	let inLineComment = false;
	let isEscaped = false;
	for (let index = objectStartIndex; index < content.length; index++) {
		const currentChar = content[index];
		const nextChar = content[index + 1];
		if (inLineComment) {
			if (currentChar === "\n") inLineComment = false;
			continue;
		}
		if (inBlockComment) {
			if (currentChar === "*" && nextChar === "/") {
				inBlockComment = false;
				// Consume the closing `/` as well.
				index++;
			}
			continue;
		}
		if (quote) {
			if (isEscaped) {
				// The character after a backslash never terminates the string.
				isEscaped = false;
				continue;
			}
			if (currentChar === "\\") {
				isEscaped = true;
				continue;
			}
			if (currentChar === quote) quote = void 0;
			continue;
		}
		if (currentChar === "/" && nextChar === "/") {
			inLineComment = true;
			index++;
			continue;
		}
		if (currentChar === "/" && nextChar === "*") {
			inBlockComment = true;
			index++;
			continue;
		}
		if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
			quote = currentChar;
			continue;
		}
		if (currentChar === "{") {
			depth++;
			continue;
		}
		if (currentChar === "}") {
			depth--;
			if (depth === 0) return {
				nextIndex: index + 1,
				objectText: content.slice(objectStartIndex, index + 1)
			};
		}
	}
	// Reached end of input without closing the object literal.
	return void 0;
}
|
|
5007
|
+
//#endregion
|
|
4889
5008
|
//#region ../runner/src/evalModuleLoader.ts
|
|
4890
5009
|
/**
|
|
4891
5010
|
* Import one eval module with a cache key derived from its current source so
|
|
@@ -5280,6 +5399,7 @@ const isolationParam = "agent-evals-isolate";
|
|
|
5280
5399
|
const pathSegmentSeparatorPattern = /[\\/]+/;
|
|
5281
5400
|
const isolationStorage = new AsyncLocalStorage();
|
|
5282
5401
|
const activeIsolationRoots = /* @__PURE__ */ new Map();
|
|
5402
|
+
const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
|
|
5283
5403
|
let hooksRegistered = false;
|
|
5284
5404
|
const requireFromRunner = createRequire(import.meta.url);
|
|
5285
5405
|
const agentPackageUrlBySpecifier = new Map([
|
|
@@ -5305,7 +5425,10 @@ function getIsolationKeyFromParent(parentURL) {
|
|
|
5305
5425
|
}
|
|
5306
5426
|
function isWorkspaceFile(url, workspaceRoot) {
	// Only `file:` URLs can map to a workspace path; anything else is rejected
	// before attempting the URL-to-path conversion.
	const isFileUrl = url.protocol === "file:";
	if (!isFileUrl) return false;
	const filePath = fileURLToPath(url);
	return isWorkspaceFilePath(filePath, workspaceRoot);
}
|
|
5430
|
+
function isWorkspaceFilePath(filePath, workspaceRoot) {
|
|
5431
|
+
const relativePath = relative(workspaceRoot, filePath);
|
|
5309
5432
|
if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
|
|
5310
5433
|
const segments = relativePath.split(pathSegmentSeparatorPattern);
|
|
5311
5434
|
return !segments.includes("node_modules") && !segments.includes(".agent-evals");
|
|
@@ -5340,15 +5463,23 @@ function registerModuleIsolationHooks() {
|
|
|
5340
5463
|
};
|
|
5341
5464
|
} });
|
|
5342
5465
|
}
|
|
5466
|
+
/**
 * Drop workspace files from the runner's `require.cache`, at most once per
 * isolation key. Later calls with a key that was already processed are no-ops.
 */
function clearWorkspaceRequireCacheOnce(context) {
	const alreadyCleared = clearedRequireCacheKeys.has(context.key);
	if (alreadyCleared) return;
	clearedRequireCacheKeys.add(context.key);
	const requireCache = requireFromRunner.cache;
	for (const cachedPath of Object.keys(requireCache)) {
		// Entries that `isWorkspaceFilePath` rejects are left cached.
		if (!isWorkspaceFilePath(cachedPath, context.workspaceRoot)) continue;
		delete requireCache[cachedPath];
	}
}
|
|
5343
5471
|
/**
|
|
5344
5472
|
* Execute module loading and eval code with fresh workspace module URLs.
|
|
5345
5473
|
*
|
|
5346
5474
|
* Node does not expose an ESM cache reset API, so the runner appends a
|
|
5347
|
-
* run-scoped query parameter to workspace file imports.
|
|
5348
|
-
*
|
|
5475
|
+
* run-scoped query parameter to workspace file imports. CommonJS modules use
|
|
5476
|
+
* `require.cache` behind ESM imports, so workspace entries are cleared once per
|
|
5477
|
+
* run. Package imports are left alone so SDK singletons, such as the eval
|
|
5478
|
+
* registry, remain shared.
|
|
5349
5479
|
*/
|
|
5350
5480
|
async function runWithModuleIsolation(context, fn) {
	// Setup order: install the loader hooks, clear stale CommonJS entries for
	// this run key, then publish the workspace root for that key.
	registerModuleIsolationHooks();
	clearWorkspaceRequireCacheOnce(context);
	const { key, workspaceRoot } = context;
	activeIsolationRoots.set(key, workspaceRoot);
	// `fn` runs with `context` as the store of `isolationStorage`.
	const outcome = await isolationStorage.run(context, fn);
	return outcome;
}
|
|
@@ -5476,20 +5607,26 @@ async function runCase(params) {
|
|
|
5476
5607
|
const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
|
|
5477
5608
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5478
5609
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5479
|
-
if (!nonAssertError && evalDef.deriveFromTracing)
|
|
5480
|
-
const
|
|
5481
|
-
|
|
5482
|
-
|
|
5483
|
-
|
|
5484
|
-
|
|
5485
|
-
|
|
5486
|
-
|
|
5487
|
-
|
|
5488
|
-
|
|
5489
|
-
|
|
5610
|
+
if (!nonAssertError && evalDef.deriveFromTracing) {
|
|
5611
|
+
const { deriveFromTracing } = evalDef;
|
|
5612
|
+
try {
|
|
5613
|
+
const derived = await runInExistingEvalScope(scope, "derive", async () => {
|
|
5614
|
+
return await callWithUnknownResult(deriveFromTracing, [{
|
|
5615
|
+
trace: traceTree,
|
|
5616
|
+
input: evalCase.input,
|
|
5617
|
+
case: evalCase
|
|
5618
|
+
}]);
|
|
5619
|
+
});
|
|
5620
|
+
if (!isRecord(derived)) throw new Error("deriveFromTracing must return an object");
|
|
5621
|
+
for (const [key, value] of Object.entries(derived)) if (!(key in scope.outputs)) scope.outputs[key] = value;
|
|
5622
|
+
} catch (e) {
|
|
5623
|
+
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
5624
|
+
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
5625
|
+
}
|
|
5490
5626
|
}
|
|
5491
5627
|
if (!nonAssertError && evalDef.outputsSchema) {
|
|
5492
|
-
const
|
|
5628
|
+
const { outputsSchema } = evalDef;
|
|
5629
|
+
const parsedOutputs = await runInExistingEvalScope(scope, "outputsSchema", () => outputsSchema.safeParse(getOutputsSchemaInput(outputsSchema, scope.outputs)));
|
|
5493
5630
|
if (parsedOutputs.success) scope.outputs = {
|
|
5494
5631
|
...scope.outputs,
|
|
5495
5632
|
...parsedOutputs.data
|
|
@@ -5511,6 +5648,7 @@ async function runCase(params) {
|
|
|
5511
5648
|
}, {
|
|
5512
5649
|
input: evalCase.input,
|
|
5513
5650
|
idPrefix: `${scopedIdPrefix}-score-${toStableIdSegment(key)}`,
|
|
5651
|
+
runtimeScope: "scorer",
|
|
5514
5652
|
cacheContext: cacheAdapter ? {
|
|
5515
5653
|
adapter: cacheAdapter,
|
|
5516
5654
|
mode: cacheMode,
|
|
@@ -5791,12 +5929,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5791
5929
|
} catch {
|
|
5792
5930
|
codeFingerprint = "";
|
|
5793
5931
|
}
|
|
5794
|
-
if (codeFingerprint.length > 0)
|
|
5795
|
-
|
|
5932
|
+
if (codeFingerprint.length > 0) {
|
|
5933
|
+
runState.manifest.evalSourceFingerprints[evalMeta.id] = codeFingerprint;
|
|
5934
|
+
evalMeta.sourceFingerprint = codeFingerprint;
|
|
5935
|
+
} else {
|
|
5936
|
+
delete runState.manifest.evalSourceFingerprints[evalMeta.id];
|
|
5937
|
+
evalMeta.sourceFingerprint = null;
|
|
5938
|
+
}
|
|
5796
5939
|
try {
|
|
5797
5940
|
const registry = getEvalRegistry();
|
|
5798
5941
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5799
|
-
await
|
|
5942
|
+
await runInEvalRuntimeScope("env", async () => {
|
|
5943
|
+
await loadEvalModule(evalFilePath, codeFingerprint);
|
|
5944
|
+
});
|
|
5800
5945
|
});
|
|
5801
5946
|
const entry = registry.get(evalMeta.id);
|
|
5802
5947
|
if (!entry) {
|
|
@@ -5807,87 +5952,89 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5807
5952
|
continue;
|
|
5808
5953
|
}
|
|
5809
5954
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5810
|
-
await
|
|
5811
|
-
|
|
5812
|
-
|
|
5813
|
-
|
|
5814
|
-
|
|
5815
|
-
|
|
5816
|
-
|
|
5817
|
-
|
|
5818
|
-
|
|
5819
|
-
|
|
5820
|
-
|
|
5821
|
-
|
|
5822
|
-
|
|
5823
|
-
|
|
5824
|
-
|
|
5825
|
-
|
|
5826
|
-
|
|
5827
|
-
|
|
5828
|
-
|
|
5829
|
-
|
|
5830
|
-
|
|
5831
|
-
preparedEvals.push(preparedEval);
|
|
5832
|
-
for (const evalCase of cases) {
|
|
5833
|
-
const trialResults = [];
|
|
5834
|
-
const preparedCase = {
|
|
5835
|
-
caseId: evalCase.id,
|
|
5836
|
-
trialResults,
|
|
5837
|
-
finalized: false
|
|
5955
|
+
await runInEvalRuntimeScope("cases", async () => {
|
|
5956
|
+
await entry.use(async (evalDef) => {
|
|
5957
|
+
const cases = filterEvalCases(resolveRunnableEvalCases({
|
|
5958
|
+
cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
|
|
5959
|
+
evalId: evalMeta.id
|
|
5960
|
+
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
5961
|
+
runState.summary.totalCases += cases.length;
|
|
5962
|
+
const accumulatedColumns = /* @__PURE__ */ new Map();
|
|
5963
|
+
const evalCaseRows = [];
|
|
5964
|
+
const preparedCases = [];
|
|
5965
|
+
const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
|
|
5966
|
+
const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
|
|
5967
|
+
const preparedEval = {
|
|
5968
|
+
evalMeta,
|
|
5969
|
+
accumulatedColumns,
|
|
5970
|
+
evalCaseRows,
|
|
5971
|
+
preparedCases,
|
|
5972
|
+
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
5973
|
+
mergeColumns: (columns) => {
|
|
5974
|
+
mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
5975
|
+
}
|
|
5838
5976
|
};
|
|
5839
|
-
|
|
5840
|
-
for (
|
|
5841
|
-
const
|
|
5842
|
-
|
|
5843
|
-
|
|
5844
|
-
|
|
5845
|
-
|
|
5846
|
-
|
|
5847
|
-
|
|
5848
|
-
|
|
5849
|
-
|
|
5850
|
-
|
|
5851
|
-
|
|
5852
|
-
|
|
5853
|
-
|
|
5854
|
-
moduleIsolation,
|
|
5855
|
-
evalFilePath,
|
|
5856
|
-
workspaceRoot,
|
|
5857
|
-
artifactDir: join(runDir, "artifacts"),
|
|
5858
|
-
runId: runState.manifest.id
|
|
5859
|
-
});
|
|
5860
|
-
return {
|
|
5861
|
-
caseDetail,
|
|
5862
|
-
caseRow: {
|
|
5863
|
-
caseId: evalCase.id,
|
|
5977
|
+
preparedEvals.push(preparedEval);
|
|
5978
|
+
for (const evalCase of cases) {
|
|
5979
|
+
const trialResults = [];
|
|
5980
|
+
const preparedCase = {
|
|
5981
|
+
caseId: evalCase.id,
|
|
5982
|
+
trialResults,
|
|
5983
|
+
finalized: false
|
|
5984
|
+
};
|
|
5985
|
+
preparedCases.push(preparedCase);
|
|
5986
|
+
for (let trial = 0; trial < request.trials; trial++) {
|
|
5987
|
+
const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
|
|
5988
|
+
queuedCases.push({
|
|
5989
|
+
execute: async ({ startTime, globalTraceDisplay }) => {
|
|
5990
|
+
const { caseDetail, caseRowUpdate } = await runCase({
|
|
5991
|
+
evalDef,
|
|
5864
5992
|
evalId: evalMeta.id,
|
|
5865
|
-
|
|
5866
|
-
|
|
5867
|
-
|
|
5868
|
-
|
|
5869
|
-
|
|
5870
|
-
|
|
5871
|
-
|
|
5872
|
-
|
|
5873
|
-
|
|
5874
|
-
|
|
5875
|
-
|
|
5876
|
-
|
|
5877
|
-
|
|
5878
|
-
|
|
5879
|
-
|
|
5880
|
-
|
|
5881
|
-
|
|
5882
|
-
|
|
5883
|
-
|
|
5884
|
-
|
|
5885
|
-
|
|
5886
|
-
|
|
5887
|
-
|
|
5888
|
-
|
|
5993
|
+
evalCase,
|
|
5994
|
+
globalTraceDisplay,
|
|
5995
|
+
trial,
|
|
5996
|
+
startTime,
|
|
5997
|
+
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
5998
|
+
cacheMode,
|
|
5999
|
+
codeFingerprint,
|
|
6000
|
+
moduleIsolation,
|
|
6001
|
+
evalFilePath,
|
|
6002
|
+
workspaceRoot,
|
|
6003
|
+
artifactDir: join(runDir, "artifacts"),
|
|
6004
|
+
runId: runState.manifest.id
|
|
6005
|
+
});
|
|
6006
|
+
return {
|
|
6007
|
+
caseDetail,
|
|
6008
|
+
caseRow: {
|
|
6009
|
+
caseId: evalCase.id,
|
|
6010
|
+
evalId: evalMeta.id,
|
|
6011
|
+
status: caseRowUpdate.status ?? "pending",
|
|
6012
|
+
latencyMs: caseRowUpdate.latencyMs ?? null,
|
|
6013
|
+
columns: caseRowUpdate.columns ?? {},
|
|
6014
|
+
trial
|
|
6015
|
+
}
|
|
6016
|
+
};
|
|
6017
|
+
},
|
|
6018
|
+
onComplete: async ({ caseDetail, caseRow }) => {
|
|
6019
|
+
trialResults.push({
|
|
6020
|
+
caseDetail,
|
|
6021
|
+
caseRow,
|
|
6022
|
+
bufferedCacheStore
|
|
6023
|
+
});
|
|
6024
|
+
if (trialResults.length !== request.trials) return;
|
|
6025
|
+
await finalizePreparedCase({
|
|
6026
|
+
runState,
|
|
6027
|
+
runDir,
|
|
6028
|
+
preparedEval,
|
|
6029
|
+
preparedCase,
|
|
6030
|
+
onCaseFinished,
|
|
6031
|
+
emitEvent
|
|
6032
|
+
});
|
|
6033
|
+
}
|
|
6034
|
+
});
|
|
6035
|
+
}
|
|
5889
6036
|
}
|
|
5890
|
-
}
|
|
6037
|
+
});
|
|
5891
6038
|
});
|
|
5892
6039
|
});
|
|
5893
6040
|
} catch (error) {
|
|
@@ -5995,4 +6142,4 @@ function toLastRunStatus(status) {
|
|
|
5995
6142
|
return status === "pending" ? null : status;
|
|
5996
6143
|
}
|
|
5997
6144
|
//#endregion
|
|
5998
|
-
export {
|
|
6145
|
+
export { caseRowSchema as $, appendToEvalOutput as $t, getEvalTitle as A, traceDisplayConfigSchema as At, apiCallMetricFormatSchema as B, fileRefSchema as Bt, createRunRequestSchema as C, serializedCacheSpanSchema as Ct, extractApiCalls as D, traceAttributeDisplayInputSchema as Dt, extractCacheHits as E, traceAttributeDisplayFormatSchema as Et, runManifestSchema as F, traceSpanWarningSchema as Ft, llmCallMetricPlacementSchema as G, z$1 as Gt, apiCallMetricSchema as H, numberDisplayOptionsSchema as Ht, runSummarySchema as I, cellValueSchema as It, resolveApiCallsConfig as J, evalSpan as Jt, llmCallMetricSchema as K, buildTraceTree as Kt, DEFAULT_API_CALLS_CONFIG as L, columnDefSchema as Lt, deriveScopedSummaryFromCases as M, traceSpanErrorSchema as Mt, deriveStatusFromCaseRows as N, traceSpanKindSchema as Nt, extractLlmCalls as O, traceAttributeDisplayPlacementSchema as Ot, deriveStatusFromChildStatuses as P, traceSpanSchema as Pt, caseDetailSchema as Q, EvalAssertionError as Qt, DEFAULT_LLM_CALLS_CONFIG as R, columnFormatSchema as Rt, createFsCacheStore as S, cacheStatusSchema as St, sseEnvelopeSchema as T, traceCacheRefSchema as Tt, apiCallsConfigSchema as U, repoFileRefSchema as Ut, apiCallMetricPlacementSchema as V, jsonCellSchema as Vt, llmCallMetricFormatSchema as W, runArtifactRefSchema as Wt, trialSelectionModeSchema as X, hashCacheKey as Xt, resolveLlmCallsConfig as Y, evalTracer as Yt, assertionFailureSchema as Z, hashCacheKeySync as Zt, loadEvalModule as _, cacheListItemSchema as _t, loadPersistedRunSnapshot as a, mergeEvalOutput as an, scoreTraceSchema as at, buildDeclaredColumnDefs as b, cacheRecordingOpSchema as bt, persistCaseDetail as c, runInEvalScope as cn, evalChartBuiltinMetricSchema as ct, recomputePersistedCaseStatus as d, setScopeCacheContext as dn, evalChartMetricSchema as dt, evalAssert as en, evalFreshnessStatusSchema as et, runTouchesEval as f, startEvalBackgroundJob as fn, evalChartTooltipExtraSchema as ft, setLatestRunInfoMap as g, 
cacheFileSchema as gt, getTargetEvalIds as h, getEvalRegistry as hn, cacheEntrySchema as ht, getLatestRunInfos as i, isInEvalScope as in, evalSummarySchema as it, getEvalDisplayStatus as j, traceDisplayInputConfigSchema as jt, getNestedAttribute as k, traceAttributeDisplaySchema as kt, persistRunState as l, runInExistingEvalScope as ln, evalChartColorSchema as lt, buildEvalSummary as m, defineEval as mn, evalChartsConfigSchema as mt, generateRunId as n, getEvalCaseInput as nn, evalStatItemSchema as nt, loadPersistedRunSnapshots as o, nextEvalId as on, evalChartAggregateSchema as ot, resolveArtifactPath as p, repoFile as pn, evalChartTypeSchema as pt, llmCallsConfigSchema as q, captureEvalSpanError as qt, getLastRunStatuses as r, incrementEvalOutput as rn, evalStatsConfigSchema as rt, nextShortIdFromSnapshots as s, runInEvalRuntimeScope as sn, evalChartAxisSchema as st, executeRun as t, getCurrentScope as tn, evalStatAggregateSchema as tt, recomputeEvalStatusesInRuns as u, setEvalOutput as un, evalChartConfigSchema as ut, parseEvalMetas as v, cacheModeSchema as vt, updateManualScoreRequestSchema as w, spanCacheOptionsSchema as wt, normalizeScoreDef as x, cacheRecordingSchema as xt, loadConfig as y, cacheOperationTypeSchema as yt, agentEvalsConfigSchema as z, columnKindSchema as zt };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-BmrtjQj_.mjs";
|
|
2
|
+
import "./src-CEAJYN_X.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-CsZqhbiA.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.16.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
}
|
|
14
14
|
},
|
|
15
15
|
"files": [
|
|
16
|
-
"dist"
|
|
16
|
+
"dist",
|
|
17
|
+
"skills"
|
|
17
18
|
],
|
|
18
19
|
"tsdown": {
|
|
19
20
|
"clean": true,
|
|
@@ -58,8 +59,8 @@
|
|
|
58
59
|
"@types/node": "^24.7.2",
|
|
59
60
|
"typescript": "^5.9.2",
|
|
60
61
|
"@agent-evals/runner": "0.0.1",
|
|
61
|
-
"@agent-evals/
|
|
62
|
-
"@agent-evals/
|
|
62
|
+
"@agent-evals/shared": "0.0.1",
|
|
63
|
+
"@agent-evals/sdk": "0.0.1"
|
|
63
64
|
},
|
|
64
65
|
"scripts": {
|
|
65
66
|
"build": "pnpm --filter @agent-evals/web build && tsdown",
|