@ls-stack/agent-eval 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Dg3qYVku.mjs → app-B7FUWsVm.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +117 -0
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +8 -3
- package/dist/{cli-lOZdhO2D.mjs → cli-B-sCTyz8.mjs} +17 -98
- package/dist/index.d.mts +26 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +41 -3
- package/dist/{runOrchestration-H0pSUl3I.mjs → runOrchestration-B3fYtpKo.mjs} +269 -124
- package/dist/{runner-BK1KX2SA.mjs → runner-Dt-Ynv6s.mjs} +1 -1
- package/dist/{runner-CmbmfBG2.mjs → runner-vunKoSBu.mjs} +2 -2
- package/dist/src-jaOlXwb5.mjs +3 -0
- package/package.json +3 -2
- package/skills/agent-eval/SKILL.md +400 -0
- package/dist/apps/web/dist/assets/index-Drw0IpOd.js +0 -117
- package/dist/src-Btb9RCYD.mjs +0 -3
|
@@ -46,7 +46,9 @@ function repoFile(path, mimeType) {
|
|
|
46
46
|
//#endregion
|
|
47
47
|
//#region ../sdk/src/runtime.ts
|
|
48
48
|
const scopeStorage = new AsyncLocalStorage();
|
|
49
|
+
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
49
50
|
let activeEvalScopeCount = 0;
|
|
51
|
+
let activeEvalRuntimeScopeCount = 0;
|
|
50
52
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
51
53
|
var EvalAssertionError = class extends Error {
|
|
52
54
|
constructor(message) {
|
|
@@ -60,13 +62,16 @@ function getCurrentScope() {
|
|
|
60
62
|
return scopeStorage.getStore();
|
|
61
63
|
}
|
|
62
64
|
/**
 * Report which eval runner phase owns the current async execution.
 *
 * Yields `null` outside eval-owned work, `env` while the runner is loading
 * eval modules for a run, `cases` while generating cases, `eval` while running
 * case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
 * while validating outputs, and `scorer` while computing scores.
 */
function isInEvalScope() {
	// Fast path: no runtime scope is active anywhere, so skip the store lookup.
	const hasActiveRuntimeScope = activeEvalRuntimeScopeCount !== 0;
	return hasActiveRuntimeScope ? (runtimeScopeStorage.getStore() ?? null) : null;
}
|
|
71
76
|
function registerBackgroundJobInScope(scope, promise) {
|
|
72
77
|
const trackedPromise = promise.then(() => {
|
|
@@ -122,6 +127,31 @@ function getEvalCaseInput(path = void 0) {
|
|
|
122
127
|
function setScopeCacheContext(scope, context) {
|
|
123
128
|
scope.cacheContext = context;
|
|
124
129
|
}
|
|
130
|
+
/**
 * Run `fn` with `runtimeScope` stored as the phase that `isInEvalScope()`
 * reports for the duration of the call.
 *
 * The active-scope counter is incremented before the callback starts and
 * decremented in `finally`, so it is restored whether `fn` resolves or throws.
 */
async function runInEvalRuntimeScope(runtimeScope, fn) {
	activeEvalRuntimeScopeCount += 1;
	try {
		const outcome = await runtimeScopeStorage.run(runtimeScope, fn);
		return outcome;
	} finally {
		activeEvalRuntimeScopeCount -= 1;
	}
}
|
|
139
|
+
/**
 * Run `fn` against an already-created case scope under a given runner phase.
 *
 * Runner-internal helper for post-execute phases that still need access to the
 * completed case scope through output, trace, assertion, and input helpers.
 * The case scope is installed first, then the runtime phase is layered inside
 * it; the active-scope counter is restored in `finally`.
 */
async function runInExistingEvalScope(scope, runtimeScope, fn) {
	const runPhase = async () => await runInEvalRuntimeScope(runtimeScope, fn);
	activeEvalScopeCount += 1;
	try {
		const outcome = await scopeStorage.run(scope, runPhase);
		return outcome;
	} finally {
		activeEvalScopeCount -= 1;
	}
}
|
|
125
155
|
/**
|
|
126
156
|
* Execute a callback inside a fresh eval case scope and capture its outputs,
|
|
127
157
|
* trace data, and terminal error state.
|
|
@@ -144,29 +174,24 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
144
174
|
caseCacheRefs: [],
|
|
145
175
|
pendingBackgroundJobs: /* @__PURE__ */ new Set()
|
|
146
176
|
};
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
}
|
|
166
|
-
});
|
|
167
|
-
} finally {
|
|
168
|
-
activeEvalScopeCount--;
|
|
169
|
-
}
|
|
177
|
+
return await runInExistingEvalScope(scope, options.runtimeScope ?? "eval", async () => {
|
|
178
|
+
try {
|
|
179
|
+
const result = await fn();
|
|
180
|
+
if (options.waitForBackgroundJobs !== false) await drainBackgroundJobs(scope);
|
|
181
|
+
return {
|
|
182
|
+
result,
|
|
183
|
+
scope,
|
|
184
|
+
error: void 0
|
|
185
|
+
};
|
|
186
|
+
} catch (error) {
|
|
187
|
+
if (options.waitForBackgroundJobs !== false) await drainBackgroundJobs(scope);
|
|
188
|
+
return {
|
|
189
|
+
result: void 0,
|
|
190
|
+
scope,
|
|
191
|
+
error: error instanceof Error ? error : new Error(String(error))
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
});
|
|
170
195
|
}
|
|
171
196
|
/**
|
|
172
197
|
* Return the next deterministic ID for the active eval case execution.
|
|
@@ -4886,6 +4911,98 @@ async function loadConfig() {
|
|
|
4886
4911
|
}
|
|
4887
4912
|
}
|
|
4888
4913
|
//#endregion
|
|
4914
|
+
//#region ../runner/src/discovery.ts
|
|
4915
|
+
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
const defineEvalCalleeName = "defineEval";
// Locate the start of an `id` / `title` key that begins a property of the
// outermost object (right after `{` or `,`). Applied to the masked text only.
const evalTopLevelIdKeyRegex = /[{,]\s*(?=id\s*:)/g;
const evalTopLevelTitleKeyRegex = /[{,]\s*(?=title\s*:)/g;
const defineEvalIdentifierCharRegex = /[\w$]/;
const defineEvalWhitespaceRegex = /\s/;
/**
 * Statically discover eval metadata in a source file without executing it.
 *
 * Scans `content` for `defineEval...(` calls, extracts the first object
 * literal that follows the call's opening parenthesis, and reads its
 * string-literal `id` (required) and `title` (optional) properties.
 *
 * Only properties of the outermost object are considered, so `id` / `title`
 * keys inside nested objects (for example case definitions), comments, or
 * string contents are never mistaken for the eval's own metadata. An object
 * whose `id` is not a plain string literal produces no entry.
 *
 * @param filePath Path recorded on every returned meta entry.
 * @param content Source text of the file to scan.
 * @returns Array of `{ filePath, id, title? }` in source order.
 */
function parseEvalMetas(filePath, content) {
	const metas = [];
	let searchIndex = 0;
	while (searchIndex < content.length) {
		const defineEvalIndex = content.indexOf(defineEvalCalleeName, searchIndex);
		if (defineEvalIndex === -1) break;
		const extracted = extractDefineEvalObject(content, defineEvalIndex);
		if (!extracted) {
			// Not a call (import specifier, comment mention, ...): step past the name.
			searchIndex = defineEvalIndex + defineEvalCalleeName.length;
			continue;
		}
		const id = readTopLevelEvalProperty(extracted, evalTopLevelIdKeyRegex, evalIdMatchRegex);
		if (id !== void 0) {
			const result = {
				filePath,
				id
			};
			const title = readTopLevelEvalProperty(extracted, evalTopLevelTitleKeyRegex, evalTitleMatchRegex);
			if (title !== void 0) result.title = title;
			metas.push(result);
		}
		searchIndex = extracted.nextIndex;
	}
	return metas;
}
/**
 * Read a string-literal property from the outermost level of an extracted
 * object.
 *
 * `keyRegex` finds candidate key positions in the masked `topLevelText`;
 * `valueRegex` must then match the raw `objectText` exactly at that position.
 * Candidates whose value is not a non-empty string literal are skipped.
 */
function readTopLevelEvalProperty(extracted, keyRegex, valueRegex) {
	// `matchAll` works on a clone of the global regex, so no `lastIndex` state leaks.
	for (const keyMatch of extracted.topLevelText.matchAll(keyRegex)) {
		const keyIndex = keyMatch.index + keyMatch[0].length;
		const valueMatch = valueRegex.exec(extracted.objectText.slice(keyIndex));
		if (valueMatch?.index === 0) return valueMatch[1];
	}
}
/** Return the index of the first non-whitespace character at or after `startIndex`. */
function skipDefineEvalWhitespace(content, startIndex) {
	let cursor = startIndex;
	while (cursor < content.length && defineEvalWhitespaceRegex.test(content[cursor])) cursor++;
	return cursor;
}
/**
 * Find the `(` that makes the name ending at `startIndex` a call expression.
 *
 * Accepts a longer identifier (`defineEvalFoo`), whitespace, and a balanced
 * TypeScript type-argument list (`<A, B>`) before the parenthesis.
 * Returns -1 when the name is not directly called.
 */
function findDefineEvalOpenParen(content, startIndex) {
	let cursor = startIndex;
	while (cursor < content.length && defineEvalIdentifierCharRegex.test(content[cursor])) cursor++;
	cursor = skipDefineEvalWhitespace(content, cursor);
	if (content[cursor] === "<") {
		let angleDepth = 0;
		while (cursor < content.length) {
			const currentChar = content[cursor];
			if (currentChar === "<") angleDepth++;
			else if (currentChar === ">" && content[cursor - 1] !== "=") angleDepth--;
			cursor++;
			if (angleDepth === 0) break;
		}
		cursor = skipDefineEvalWhitespace(content, cursor);
	}
	return content[cursor] === "(" ? cursor : -1;
}
/**
 * Extract the first object literal that follows a `defineEval(` call.
 *
 * Braces are balanced while skipping line comments, block comments, and
 * quoted strings (`"`, `'`, and backtick, honoring backslash escapes).
 * Regex literals and `${}` template interpolations are not tokenized
 * separately.
 *
 * @param content Full source text.
 * @param defineEvalIndex Index where the `defineEval` name starts.
 * @returns `undefined` when the name is not called or the braces never
 * balance; otherwise `{ nextIndex, objectText, topLevelText }`, where
 * `topLevelText` mirrors `objectText` index-for-index (minus the closing
 * brace) with every character that is not outermost-level code replaced by a
 * space.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
	const openParenIndex = findDefineEvalOpenParen(content, defineEvalIndex + defineEvalCalleeName.length);
	if (openParenIndex === -1) return void 0;
	const objectStartIndex = content.indexOf("{", openParenIndex);
	if (objectStartIndex === -1) return void 0;
	const topLevelChars = [];
	let depth = 0;
	let quote;
	let inBlockComment = false;
	let inLineComment = false;
	let isEscaped = false;
	for (let index = objectStartIndex; index < content.length; index++) {
		const currentChar = content[index];
		const nextChar = content[index + 1];
		if (inLineComment) {
			if (currentChar === "\n") inLineComment = false;
			topLevelChars.push(" ");
			continue;
		}
		if (inBlockComment) {
			if (currentChar === "*" && nextChar === "/") {
				inBlockComment = false;
				index++;
				topLevelChars.push(" ", " ");
			} else topLevelChars.push(" ");
			continue;
		}
		if (quote) {
			// String contents, including the delimiters, are always masked.
			topLevelChars.push(" ");
			if (isEscaped) {
				isEscaped = false;
				continue;
			}
			if (currentChar === "\\") {
				isEscaped = true;
				continue;
			}
			if (currentChar === quote) quote = void 0;
			continue;
		}
		if (currentChar === "/" && (nextChar === "/" || nextChar === "*")) {
			if (nextChar === "/") inLineComment = true;
			else inBlockComment = true;
			index++;
			topLevelChars.push(" ", " ");
			continue;
		}
		if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
			quote = currentChar;
			topLevelChars.push(" ");
			continue;
		}
		if (currentChar === "{") {
			depth++;
			// Keep only the outermost brace so the first property has an anchor.
			topLevelChars.push(depth === 1 ? currentChar : " ");
			continue;
		}
		if (currentChar === "}") {
			depth--;
			if (depth === 0) return {
				nextIndex: index + 1,
				objectText: content.slice(objectStartIndex, index + 1),
				topLevelText: topLevelChars.join("")
			};
			topLevelChars.push(" ");
			continue;
		}
		topLevelChars.push(depth === 1 ? currentChar : " ");
	}
}
|
|
5005
|
+
//#endregion
|
|
4889
5006
|
//#region ../runner/src/evalModuleLoader.ts
|
|
4890
5007
|
/**
|
|
4891
5008
|
* Import one eval module with a cache key derived from its current source so
|
|
@@ -5280,6 +5397,7 @@ const isolationParam = "agent-evals-isolate";
|
|
|
5280
5397
|
const pathSegmentSeparatorPattern = /[\\/]+/;
|
|
5281
5398
|
const isolationStorage = new AsyncLocalStorage();
|
|
5282
5399
|
const activeIsolationRoots = /* @__PURE__ */ new Map();
|
|
5400
|
+
const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
|
|
5283
5401
|
let hooksRegistered = false;
|
|
5284
5402
|
const requireFromRunner = createRequire(import.meta.url);
|
|
5285
5403
|
const agentPackageUrlBySpecifier = new Map([
|
|
@@ -5305,7 +5423,10 @@ function getIsolationKeyFromParent(parentURL) {
|
|
|
5305
5423
|
}
|
|
5306
5424
|
/**
 * Tell whether a module URL refers to a workspace file.
 *
 * Anything that is not a `file:` URL is rejected outright; `file:` URLs are
 * converted to a filesystem path and checked by `isWorkspaceFilePath`.
 */
function isWorkspaceFile(url, workspaceRoot) {
	const isFileUrl = url.protocol === "file:";
	return isFileUrl ? isWorkspaceFilePath(fileURLToPath(url), workspaceRoot) : false;
}
|
|
5428
|
+
function isWorkspaceFilePath(filePath, workspaceRoot) {
|
|
5429
|
+
const relativePath = relative(workspaceRoot, filePath);
|
|
5309
5430
|
if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
|
|
5310
5431
|
const segments = relativePath.split(pathSegmentSeparatorPattern);
|
|
5311
5432
|
return !segments.includes("node_modules") && !segments.includes(".agent-evals");
|
|
@@ -5340,15 +5461,23 @@ function registerModuleIsolationHooks() {
|
|
|
5340
5461
|
};
|
|
5341
5462
|
} });
|
|
5342
5463
|
}
|
|
5464
|
+
/**
 * Drop workspace modules from the runner's `require.cache`, at most once per
 * isolation key.
 *
 * The key is recorded before clearing, so later calls with the same
 * `context.key` return immediately. Cache entries outside the workspace (as
 * decided by `isWorkspaceFilePath`) are left in place.
 */
function clearWorkspaceRequireCacheOnce(context) {
	const alreadyCleared = clearedRequireCacheKeys.has(context.key);
	if (alreadyCleared) return;
	clearedRequireCacheKeys.add(context.key);
	const cachedPaths = Object.keys(requireFromRunner.cache);
	for (const cachedPath of cachedPaths) {
		if (!isWorkspaceFilePath(cachedPath, context.workspaceRoot)) continue;
		delete requireFromRunner.cache[cachedPath];
	}
}
|
|
5343
5469
|
/**
 * Run module loading and eval code so workspace modules are loaded fresh.
 *
 * Node does not expose an ESM cache reset API, so the runner appends a
 * run-scoped query parameter to workspace file imports. CommonJS modules use
 * `require.cache` behind ESM imports, so workspace entries are cleared once per
 * run. Package imports are left alone so SDK singletons, such as the eval
 * registry, remain shared.
 */
async function runWithModuleIsolation(context, fn) {
	// Order matters: hooks first, then the one-time CommonJS cache purge,
	// then the root registration for this isolation key.
	registerModuleIsolationHooks();
	clearWorkspaceRequireCacheOnce(context);
	const { key, workspaceRoot } = context;
	activeIsolationRoots.set(key, workspaceRoot);
	const outcome = await isolationStorage.run(context, fn);
	return outcome;
}
|
|
@@ -5476,20 +5605,26 @@ async function runCase(params) {
|
|
|
5476
5605
|
const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
|
|
5477
5606
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5478
5607
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5479
|
-
if (!nonAssertError && evalDef.deriveFromTracing)
|
|
5480
|
-
const
|
|
5481
|
-
|
|
5482
|
-
|
|
5483
|
-
|
|
5484
|
-
|
|
5485
|
-
|
|
5486
|
-
|
|
5487
|
-
|
|
5488
|
-
|
|
5489
|
-
|
|
5608
|
+
if (!nonAssertError && evalDef.deriveFromTracing) {
|
|
5609
|
+
const { deriveFromTracing } = evalDef;
|
|
5610
|
+
try {
|
|
5611
|
+
const derived = await runInExistingEvalScope(scope, "derive", async () => {
|
|
5612
|
+
return await callWithUnknownResult(deriveFromTracing, [{
|
|
5613
|
+
trace: traceTree,
|
|
5614
|
+
input: evalCase.input,
|
|
5615
|
+
case: evalCase
|
|
5616
|
+
}]);
|
|
5617
|
+
});
|
|
5618
|
+
if (!isRecord(derived)) throw new Error("deriveFromTracing must return an object");
|
|
5619
|
+
for (const [key, value] of Object.entries(derived)) if (!(key in scope.outputs)) scope.outputs[key] = value;
|
|
5620
|
+
} catch (e) {
|
|
5621
|
+
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
5622
|
+
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
5623
|
+
}
|
|
5490
5624
|
}
|
|
5491
5625
|
if (!nonAssertError && evalDef.outputsSchema) {
|
|
5492
|
-
const
|
|
5626
|
+
const { outputsSchema } = evalDef;
|
|
5627
|
+
const parsedOutputs = await runInExistingEvalScope(scope, "outputsSchema", () => outputsSchema.safeParse(getOutputsSchemaInput(outputsSchema, scope.outputs)));
|
|
5493
5628
|
if (parsedOutputs.success) scope.outputs = {
|
|
5494
5629
|
...scope.outputs,
|
|
5495
5630
|
...parsedOutputs.data
|
|
@@ -5511,6 +5646,7 @@ async function runCase(params) {
|
|
|
5511
5646
|
}, {
|
|
5512
5647
|
input: evalCase.input,
|
|
5513
5648
|
idPrefix: `${scopedIdPrefix}-score-${toStableIdSegment(key)}`,
|
|
5649
|
+
runtimeScope: "scorer",
|
|
5514
5650
|
cacheContext: cacheAdapter ? {
|
|
5515
5651
|
adapter: cacheAdapter,
|
|
5516
5652
|
mode: cacheMode,
|
|
@@ -5791,12 +5927,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5791
5927
|
} catch {
|
|
5792
5928
|
codeFingerprint = "";
|
|
5793
5929
|
}
|
|
5794
|
-
if (codeFingerprint.length > 0)
|
|
5795
|
-
|
|
5930
|
+
if (codeFingerprint.length > 0) {
|
|
5931
|
+
runState.manifest.evalSourceFingerprints[evalMeta.id] = codeFingerprint;
|
|
5932
|
+
evalMeta.sourceFingerprint = codeFingerprint;
|
|
5933
|
+
} else {
|
|
5934
|
+
delete runState.manifest.evalSourceFingerprints[evalMeta.id];
|
|
5935
|
+
evalMeta.sourceFingerprint = null;
|
|
5936
|
+
}
|
|
5796
5937
|
try {
|
|
5797
5938
|
const registry = getEvalRegistry();
|
|
5798
5939
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5799
|
-
await
|
|
5940
|
+
await runInEvalRuntimeScope("env", async () => {
|
|
5941
|
+
await loadEvalModule(evalFilePath, codeFingerprint);
|
|
5942
|
+
});
|
|
5800
5943
|
});
|
|
5801
5944
|
const entry = registry.get(evalMeta.id);
|
|
5802
5945
|
if (!entry) {
|
|
@@ -5807,87 +5950,89 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5807
5950
|
continue;
|
|
5808
5951
|
}
|
|
5809
5952
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5810
|
-
await
|
|
5811
|
-
|
|
5812
|
-
|
|
5813
|
-
|
|
5814
|
-
|
|
5815
|
-
|
|
5816
|
-
|
|
5817
|
-
|
|
5818
|
-
|
|
5819
|
-
|
|
5820
|
-
|
|
5821
|
-
|
|
5822
|
-
|
|
5823
|
-
|
|
5824
|
-
|
|
5825
|
-
|
|
5826
|
-
|
|
5827
|
-
|
|
5828
|
-
|
|
5829
|
-
|
|
5830
|
-
|
|
5831
|
-
preparedEvals.push(preparedEval);
|
|
5832
|
-
for (const evalCase of cases) {
|
|
5833
|
-
const trialResults = [];
|
|
5834
|
-
const preparedCase = {
|
|
5835
|
-
caseId: evalCase.id,
|
|
5836
|
-
trialResults,
|
|
5837
|
-
finalized: false
|
|
5953
|
+
await runInEvalRuntimeScope("cases", async () => {
|
|
5954
|
+
await entry.use(async (evalDef) => {
|
|
5955
|
+
const cases = filterEvalCases(resolveRunnableEvalCases({
|
|
5956
|
+
cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
|
|
5957
|
+
evalId: evalMeta.id
|
|
5958
|
+
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
5959
|
+
runState.summary.totalCases += cases.length;
|
|
5960
|
+
const accumulatedColumns = /* @__PURE__ */ new Map();
|
|
5961
|
+
const evalCaseRows = [];
|
|
5962
|
+
const preparedCases = [];
|
|
5963
|
+
const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
|
|
5964
|
+
const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
|
|
5965
|
+
const preparedEval = {
|
|
5966
|
+
evalMeta,
|
|
5967
|
+
accumulatedColumns,
|
|
5968
|
+
evalCaseRows,
|
|
5969
|
+
preparedCases,
|
|
5970
|
+
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
5971
|
+
mergeColumns: (columns) => {
|
|
5972
|
+
mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
5973
|
+
}
|
|
5838
5974
|
};
|
|
5839
|
-
|
|
5840
|
-
for (
|
|
5841
|
-
const
|
|
5842
|
-
|
|
5843
|
-
|
|
5844
|
-
|
|
5845
|
-
|
|
5846
|
-
|
|
5847
|
-
|
|
5848
|
-
|
|
5849
|
-
|
|
5850
|
-
|
|
5851
|
-
|
|
5852
|
-
|
|
5853
|
-
|
|
5854
|
-
moduleIsolation,
|
|
5855
|
-
evalFilePath,
|
|
5856
|
-
workspaceRoot,
|
|
5857
|
-
artifactDir: join(runDir, "artifacts"),
|
|
5858
|
-
runId: runState.manifest.id
|
|
5859
|
-
});
|
|
5860
|
-
return {
|
|
5861
|
-
caseDetail,
|
|
5862
|
-
caseRow: {
|
|
5863
|
-
caseId: evalCase.id,
|
|
5975
|
+
preparedEvals.push(preparedEval);
|
|
5976
|
+
for (const evalCase of cases) {
|
|
5977
|
+
const trialResults = [];
|
|
5978
|
+
const preparedCase = {
|
|
5979
|
+
caseId: evalCase.id,
|
|
5980
|
+
trialResults,
|
|
5981
|
+
finalized: false
|
|
5982
|
+
};
|
|
5983
|
+
preparedCases.push(preparedCase);
|
|
5984
|
+
for (let trial = 0; trial < request.trials; trial++) {
|
|
5985
|
+
const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
|
|
5986
|
+
queuedCases.push({
|
|
5987
|
+
execute: async ({ startTime, globalTraceDisplay }) => {
|
|
5988
|
+
const { caseDetail, caseRowUpdate } = await runCase({
|
|
5989
|
+
evalDef,
|
|
5864
5990
|
evalId: evalMeta.id,
|
|
5865
|
-
|
|
5866
|
-
|
|
5867
|
-
|
|
5868
|
-
|
|
5869
|
-
|
|
5870
|
-
|
|
5871
|
-
|
|
5872
|
-
|
|
5873
|
-
|
|
5874
|
-
|
|
5875
|
-
|
|
5876
|
-
|
|
5877
|
-
|
|
5878
|
-
|
|
5879
|
-
|
|
5880
|
-
|
|
5881
|
-
|
|
5882
|
-
|
|
5883
|
-
|
|
5884
|
-
|
|
5885
|
-
|
|
5886
|
-
|
|
5887
|
-
|
|
5888
|
-
|
|
5991
|
+
evalCase,
|
|
5992
|
+
globalTraceDisplay,
|
|
5993
|
+
trial,
|
|
5994
|
+
startTime,
|
|
5995
|
+
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
5996
|
+
cacheMode,
|
|
5997
|
+
codeFingerprint,
|
|
5998
|
+
moduleIsolation,
|
|
5999
|
+
evalFilePath,
|
|
6000
|
+
workspaceRoot,
|
|
6001
|
+
artifactDir: join(runDir, "artifacts"),
|
|
6002
|
+
runId: runState.manifest.id
|
|
6003
|
+
});
|
|
6004
|
+
return {
|
|
6005
|
+
caseDetail,
|
|
6006
|
+
caseRow: {
|
|
6007
|
+
caseId: evalCase.id,
|
|
6008
|
+
evalId: evalMeta.id,
|
|
6009
|
+
status: caseRowUpdate.status ?? "pending",
|
|
6010
|
+
latencyMs: caseRowUpdate.latencyMs ?? null,
|
|
6011
|
+
columns: caseRowUpdate.columns ?? {},
|
|
6012
|
+
trial
|
|
6013
|
+
}
|
|
6014
|
+
};
|
|
6015
|
+
},
|
|
6016
|
+
onComplete: async ({ caseDetail, caseRow }) => {
|
|
6017
|
+
trialResults.push({
|
|
6018
|
+
caseDetail,
|
|
6019
|
+
caseRow,
|
|
6020
|
+
bufferedCacheStore
|
|
6021
|
+
});
|
|
6022
|
+
if (trialResults.length !== request.trials) return;
|
|
6023
|
+
await finalizePreparedCase({
|
|
6024
|
+
runState,
|
|
6025
|
+
runDir,
|
|
6026
|
+
preparedEval,
|
|
6027
|
+
preparedCase,
|
|
6028
|
+
onCaseFinished,
|
|
6029
|
+
emitEvent
|
|
6030
|
+
});
|
|
6031
|
+
}
|
|
6032
|
+
});
|
|
6033
|
+
}
|
|
5889
6034
|
}
|
|
5890
|
-
}
|
|
6035
|
+
});
|
|
5891
6036
|
});
|
|
5892
6037
|
});
|
|
5893
6038
|
} catch (error) {
|
|
@@ -5995,4 +6140,4 @@ function toLastRunStatus(status) {
|
|
|
5995
6140
|
return status === "pending" ? null : status;
|
|
5996
6141
|
}
|
|
5997
6142
|
//#endregion
|
|
5998
|
-
export {
|
|
6143
|
+
export { caseRowSchema as $, appendToEvalOutput as $t, getEvalTitle as A, traceDisplayConfigSchema as At, apiCallMetricFormatSchema as B, fileRefSchema as Bt, createRunRequestSchema as C, serializedCacheSpanSchema as Ct, extractApiCalls as D, traceAttributeDisplayInputSchema as Dt, extractCacheHits as E, traceAttributeDisplayFormatSchema as Et, runManifestSchema as F, traceSpanWarningSchema as Ft, llmCallMetricPlacementSchema as G, z$1 as Gt, apiCallMetricSchema as H, numberDisplayOptionsSchema as Ht, runSummarySchema as I, cellValueSchema as It, resolveApiCallsConfig as J, evalSpan as Jt, llmCallMetricSchema as K, buildTraceTree as Kt, DEFAULT_API_CALLS_CONFIG as L, columnDefSchema as Lt, deriveScopedSummaryFromCases as M, traceSpanErrorSchema as Mt, deriveStatusFromCaseRows as N, traceSpanKindSchema as Nt, extractLlmCalls as O, traceAttributeDisplayPlacementSchema as Ot, deriveStatusFromChildStatuses as P, traceSpanSchema as Pt, caseDetailSchema as Q, EvalAssertionError as Qt, DEFAULT_LLM_CALLS_CONFIG as R, columnFormatSchema as Rt, createFsCacheStore as S, cacheStatusSchema as St, sseEnvelopeSchema as T, traceCacheRefSchema as Tt, apiCallsConfigSchema as U, repoFileRefSchema as Ut, apiCallMetricPlacementSchema as V, jsonCellSchema as Vt, llmCallMetricFormatSchema as W, runArtifactRefSchema as Wt, trialSelectionModeSchema as X, hashCacheKey as Xt, resolveLlmCallsConfig as Y, evalTracer as Yt, assertionFailureSchema as Z, hashCacheKeySync as Zt, loadEvalModule as _, cacheListItemSchema as _t, loadPersistedRunSnapshot as a, mergeEvalOutput as an, scoreTraceSchema as at, buildDeclaredColumnDefs as b, cacheRecordingOpSchema as bt, persistCaseDetail as c, runInEvalScope as cn, evalChartBuiltinMetricSchema as ct, recomputePersistedCaseStatus as d, setScopeCacheContext as dn, evalChartMetricSchema as dt, evalAssert as en, evalFreshnessStatusSchema as et, runTouchesEval as f, startEvalBackgroundJob as fn, evalChartTooltipExtraSchema as ft, setLatestRunInfoMap as g, 
cacheFileSchema as gt, getTargetEvalIds as h, getEvalRegistry as hn, cacheEntrySchema as ht, getLatestRunInfos as i, isInEvalScope as in, evalSummarySchema as it, getEvalDisplayStatus as j, traceDisplayInputConfigSchema as jt, getNestedAttribute as k, traceAttributeDisplaySchema as kt, persistRunState as l, runInExistingEvalScope as ln, evalChartColorSchema as lt, buildEvalSummary as m, defineEval as mn, evalChartsConfigSchema as mt, generateRunId as n, getEvalCaseInput as nn, evalStatItemSchema as nt, loadPersistedRunSnapshots as o, nextEvalId as on, evalChartAggregateSchema as ot, resolveArtifactPath as p, repoFile as pn, evalChartTypeSchema as pt, llmCallsConfigSchema as q, captureEvalSpanError as qt, getLastRunStatuses as r, incrementEvalOutput as rn, evalStatsConfigSchema as rt, nextShortIdFromSnapshots as s, runInEvalRuntimeScope as sn, evalChartAxisSchema as st, executeRun as t, getCurrentScope as tn, evalStatAggregateSchema as tt, recomputeEvalStatusesInRuns as u, setEvalOutput as un, evalChartConfigSchema as ut, parseEvalMetas as v, cacheModeSchema as vt, updateManualScoreRequestSchema as w, spanCacheOptionsSchema as wt, normalizeScoreDef as x, cacheRecordingSchema as xt, loadConfig as y, cacheOperationTypeSchema as yt, agentEvalsConfigSchema as z, columnKindSchema as zt };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-vunKoSBu.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-B-sCTyz8.mjs";
|
|
2
|
+
import "./src-jaOlXwb5.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.15.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
}
|
|
14
14
|
},
|
|
15
15
|
"files": [
|
|
16
|
-
"dist"
|
|
16
|
+
"dist",
|
|
17
|
+
"skills"
|
|
17
18
|
],
|
|
18
19
|
"tsdown": {
|
|
19
20
|
"clean": true,
|