@ls-stack/agent-eval 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DYRmucgj.mjs → app-DS3j_AyX.mjs} +6 -3
- package/dist/apps/web/dist/assets/index-DNsZjOms.css +1 -0
- package/dist/apps/web/dist/assets/index-DqR1YaMG.js +118 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Be0x8CS3.mjs → cli-ETfZ15RB.mjs} +151 -42
- package/dist/index.d.mts +61 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +28 -18
- package/dist/{runOrchestration-D697g6Qe.mjs → runOrchestration-B31SV_Bq.mjs} +222 -87
- package/dist/{runner-jSujaSKt.mjs → runner-B2f2TEjp.mjs} +1 -1
- package/dist/{runner-B4SosWgD.mjs → runner-cj1TkR-H.mjs} +2 -2
- package/dist/src-CyNb2ycA.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +6 -1
- package/dist/apps/web/dist/assets/index-KbbX3NYr.js +0 -118
- package/dist/apps/web/dist/assets/index-r0dVFK0B.css +0 -1
- package/dist/src-D6cettg0.mjs +0 -3
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-DqR1YaMG.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-DNsZjOms.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle,
|
|
1
|
+
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -8,6 +8,34 @@ import { existsSync } from "node:fs";
|
|
|
8
8
|
import { resultify } from "t-result";
|
|
9
9
|
import { fileURLToPath } from "node:url";
|
|
10
10
|
import { spawn, spawnSync } from "node:child_process";
|
|
11
|
+
//#region ../runner/src/evalSummaries.ts
|
|
12
|
+
/**
 * Assemble the summary object returned to the API/UI for a single discovered eval.
 *
 * @param params - Inputs for the summary.
 * @param params.meta - Discovered eval metadata; its `sourceFingerprint` is used for the
 *   freshness check and is left out of the returned summary.
 * @param params.config - Workspace config; `staleAfterDays` falls back to 14 when nullish.
 * @param params.gitState - Current git state; `commitSha` is reported as `currentCommitSha`.
 * @param params.latestRun - Latest run info for the eval, or undefined when there is none.
 * @param params.lastRunStatus - Last run status, passed through unchanged.
 * @returns The remaining metadata merged with freshness flags and latest-run details.
 */
function buildEvalSummary(params) {
	const latestRun = params.latestRun;
	const gitState = params.gitState;
	// Separate the fingerprint from the fields that are exposed on the summary.
	const { sourceFingerprint: currentEvalSourceFingerprint, ...publicMeta } = params.meta;
	const staleAfterDays = params.config.staleAfterDays ?? 14;
	const { stale, outdated, freshnessStatus } = deriveEvalFreshness({
		latestRun,
		gitState,
		currentEvalSourceFingerprint,
		staleAfterDays
	});
	return {
		...publicMeta,
		stale,
		outdated,
		freshnessStatus,
		// Absent runs are reported as explicit nulls rather than undefined.
		latestRunAt: latestRun?.startedAt ?? null,
		latestRunCommitSha: latestRun?.commitSha ?? null,
		currentCommitSha: gitState.commitSha,
		lastRunStatus: params.lastRunStatus
	};
}
|
|
33
|
+
/**
 * Record one latest-run snapshot under every targeted eval key.
 *
 * The `evalIds` parameter keeps its historical name, but its entries are used verbatim as
 * map keys: the runner passes the result of `getTargetEvalKeys(...)` and later reads the
 * map with `meta.key`, so callers must supply the same key form they will look up with.
 *
 * @param params - Inputs for the update.
 * @param params.latestRunInfoMap - Map that is mutated in place.
 * @param params.evalIds - Iterable of map keys to write; an empty iterable is a no-op.
 * @param params.info - Snapshot to store; the same object reference is shared by all entries.
 */
function setLatestRunInfoMap(params) {
	const { latestRunInfoMap, evalIds, info } = params;
	for (const evalKey of evalIds) latestRunInfoMap.set(evalKey, info);
}
|
|
38
|
+
//#endregion
|
|
11
39
|
//#region ../runner/src/gitState.ts
|
|
12
40
|
function runGitCommand(workspaceRoot, args) {
|
|
13
41
|
const result = spawnSync("git", args, {
|
|
@@ -142,16 +170,16 @@ function handleRunChildMessage(params) {
|
|
|
142
170
|
handleRunChildEvent(runState, message.event, managerContext);
|
|
143
171
|
}
|
|
144
172
|
function upsertFinishedCase(runState, caseDetail, caseRow) {
|
|
145
|
-
const existingIndex = runState.cases.findIndex((row) => row
|
|
173
|
+
const existingIndex = runState.cases.findIndex((row) => getCaseRowCaseKey(row) === getCaseRowCaseKey(caseRow) && row.trial === caseRow.trial);
|
|
146
174
|
if (existingIndex === -1) runState.cases.push(caseRow);
|
|
147
175
|
else runState.cases[existingIndex] = caseRow;
|
|
148
|
-
runState.caseDetails.set(caseDetail.caseId, caseDetail);
|
|
176
|
+
runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
|
|
149
177
|
}
|
|
150
178
|
function applyChildEvalMetas(evals, childMetas) {
|
|
151
179
|
for (const childMeta of childMetas) {
|
|
152
|
-
const evalMeta = evals.get(childMeta.
|
|
180
|
+
const evalMeta = evals.get(childMeta.key);
|
|
153
181
|
if (evalMeta === void 0) {
|
|
154
|
-
evals.set(childMeta.
|
|
182
|
+
evals.set(childMeta.key, childMeta);
|
|
155
183
|
continue;
|
|
156
184
|
}
|
|
157
185
|
evalMeta.columnDefs = childMeta.columnDefs;
|
|
@@ -266,6 +294,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
266
294
|
let llmCallsConfig = resolveLlmCallsConfig(void 0);
|
|
267
295
|
let apiCallsConfig = resolveApiCallsConfig(void 0);
|
|
268
296
|
const evals = /* @__PURE__ */ new Map();
|
|
297
|
+
let discoveryIssues = [];
|
|
269
298
|
const runs = /* @__PURE__ */ new Map();
|
|
270
299
|
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
271
300
|
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
@@ -279,7 +308,13 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
279
308
|
return relative(workspaceRoot, filePath).replaceAll("\\", "/");
|
|
280
309
|
}
|
|
281
310
|
function getSortedEvalMetas() {
|
|
282
|
-
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
311
|
+
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath) || a.id.localeCompare(b.id));
|
|
312
|
+
}
|
|
313
|
+
function resolveEvalMeta(evalRef) {
|
|
314
|
+
const exactMatch = evals.get(evalRef);
|
|
315
|
+
if (exactMatch !== void 0) return exactMatch;
|
|
316
|
+
const matches = getSortedEvalMetas().filter((ev) => ev.id === evalRef);
|
|
317
|
+
return matches.length === 1 ? matches[0] : void 0;
|
|
283
318
|
}
|
|
284
319
|
function getSourceFingerprint(source) {
|
|
285
320
|
return createHash("sha256").update(source).digest("hex");
|
|
@@ -312,12 +347,12 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
312
347
|
async clearCache(filter) {
|
|
313
348
|
await cacheStore.clear(filter);
|
|
314
349
|
},
|
|
315
|
-
async recomputeStatusesForEval(
|
|
316
|
-
const evalMeta =
|
|
350
|
+
async recomputeStatusesForEval(evalKey) {
|
|
351
|
+
const evalMeta = resolveEvalMeta(evalKey);
|
|
317
352
|
if (!evalMeta) return { updatedRuns: 0 };
|
|
318
353
|
const registry = getEvalRegistry();
|
|
319
354
|
await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
|
|
320
|
-
const entry = registry.get(
|
|
355
|
+
const entry = registry.get(evalMeta.id);
|
|
321
356
|
if (!entry) return { updatedRuns: 0 };
|
|
322
357
|
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
323
358
|
entry.use((evalDef) => {
|
|
@@ -329,22 +364,25 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
329
364
|
});
|
|
330
365
|
const updatedRuns = await recomputeEvalStatusesInRuns({
|
|
331
366
|
runs: runs.values(),
|
|
332
|
-
|
|
333
|
-
|
|
367
|
+
evalKey: evalMeta.key,
|
|
368
|
+
evalId: evalMeta.id,
|
|
369
|
+
evalExists: evals.has(evalMeta.key),
|
|
334
370
|
scoreThresholds,
|
|
335
371
|
persistCaseDetail
|
|
336
372
|
});
|
|
337
373
|
emitDiscoveryEvent();
|
|
338
374
|
return { updatedRuns };
|
|
339
375
|
},
|
|
340
|
-
async cleanRunsForEval(
|
|
376
|
+
async cleanRunsForEval(evalKey) {
|
|
377
|
+
const evalMeta = resolveEvalMeta(evalKey);
|
|
341
378
|
let deletedRuns = 0;
|
|
342
379
|
for (const [runId, run] of [...runs]) {
|
|
343
380
|
if (!runTouchesEval({
|
|
344
381
|
target: run.manifest.target,
|
|
345
382
|
caseRows: run.cases,
|
|
346
|
-
|
|
347
|
-
|
|
383
|
+
evalKey: evalMeta?.key ?? evalKey,
|
|
384
|
+
evalId: evalMeta?.id,
|
|
385
|
+
evalExists: evalMeta !== void 0
|
|
348
386
|
})) continue;
|
|
349
387
|
if (run.manifest.status === "running") continue;
|
|
350
388
|
runs.delete(runId);
|
|
@@ -367,12 +405,12 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
367
405
|
updated: false,
|
|
368
406
|
reason: "Run is still running"
|
|
369
407
|
};
|
|
370
|
-
const caseRow = run.cases.find((row) => row.caseId === caseId);
|
|
408
|
+
const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
|
|
371
409
|
if (!caseRow) return {
|
|
372
410
|
updated: false,
|
|
373
411
|
reason: "Case not found"
|
|
374
412
|
};
|
|
375
|
-
const evalMeta = evals.get(caseRow
|
|
413
|
+
const evalMeta = evals.get(getCaseRowEvalKey(caseRow));
|
|
376
414
|
if (!evalMeta) return {
|
|
377
415
|
updated: false,
|
|
378
416
|
reason: "Eval not found"
|
|
@@ -381,7 +419,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
381
419
|
updated: false,
|
|
382
420
|
reason: "Manual score not found"
|
|
383
421
|
};
|
|
384
|
-
const caseDetail = run.caseDetails.get(
|
|
422
|
+
const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
|
|
385
423
|
if (!caseDetail) return {
|
|
386
424
|
updated: false,
|
|
387
425
|
reason: "Case detail not found"
|
|
@@ -435,22 +473,25 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
435
473
|
meta,
|
|
436
474
|
config,
|
|
437
475
|
gitState,
|
|
438
|
-
latestRun: latestRunInfoMap.get(meta.
|
|
439
|
-
lastRunStatus: lastRunStatusMap.get(meta.
|
|
476
|
+
latestRun: latestRunInfoMap.get(meta.key),
|
|
477
|
+
lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
|
|
440
478
|
}));
|
|
441
479
|
return result;
|
|
442
480
|
},
|
|
443
481
|
getEval(id) {
|
|
444
|
-
const meta =
|
|
482
|
+
const meta = resolveEvalMeta(id);
|
|
445
483
|
if (!meta) return void 0;
|
|
446
484
|
return buildEvalSummary({
|
|
447
485
|
meta,
|
|
448
486
|
config,
|
|
449
487
|
gitState: readGitWorktreeState(workspaceRoot),
|
|
450
|
-
latestRun: latestRunInfoMap.get(meta.
|
|
451
|
-
lastRunStatus: lastRunStatusMap.get(meta.
|
|
488
|
+
latestRun: latestRunInfoMap.get(meta.key),
|
|
489
|
+
lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
|
|
452
490
|
});
|
|
453
491
|
},
|
|
492
|
+
getDiscoveryIssues() {
|
|
493
|
+
return discoveryIssues;
|
|
494
|
+
},
|
|
454
495
|
async refreshDiscovery() {
|
|
455
496
|
const patterns = config.include;
|
|
456
497
|
const discovered = [];
|
|
@@ -462,16 +503,25 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
462
503
|
discovered.push(...files);
|
|
463
504
|
}
|
|
464
505
|
evals.clear();
|
|
506
|
+
discoveryIssues = [];
|
|
465
507
|
for (const filePath of discovered) try {
|
|
466
508
|
const content = await readFile(filePath, "utf-8");
|
|
467
|
-
const
|
|
509
|
+
const discovery = parseEvalDiscovery(filePath, content);
|
|
510
|
+
const discoveredMetas = discovery.metas;
|
|
511
|
+
discoveryIssues.push(...discovery.issues.map((issue) => ({
|
|
512
|
+
...issue,
|
|
513
|
+
filePath: toWorkspaceRelativePath(issue.filePath),
|
|
514
|
+
message: `Duplicate eval id "${issue.evalId}" in ${toWorkspaceRelativePath(issue.filePath)}. Eval ids must be unique within one file.`
|
|
515
|
+
})));
|
|
468
516
|
const sourceFingerprint = getSourceFingerprint(content);
|
|
469
517
|
const registry = getEvalRegistry();
|
|
518
|
+
let moduleLoaded = false;
|
|
470
519
|
try {
|
|
471
520
|
await loadEvalModule(filePath, sourceFingerprint);
|
|
521
|
+
moduleLoaded = true;
|
|
472
522
|
} catch {}
|
|
473
523
|
for (const meta of discoveredMetas) {
|
|
474
|
-
const discoveredEntry = registry.get(meta.id);
|
|
524
|
+
const discoveredEntry = moduleLoaded ? registry.get(meta.id) : void 0;
|
|
475
525
|
const title = meta.title;
|
|
476
526
|
let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
|
|
477
527
|
let stats;
|
|
@@ -491,10 +541,16 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
491
541
|
for (const warning of validated.warnings) console.warn(warning);
|
|
492
542
|
charts = validated.charts;
|
|
493
543
|
});
|
|
494
|
-
|
|
544
|
+
const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
|
|
545
|
+
const key = buildEvalKey({
|
|
546
|
+
filePath: relativeFilePath,
|
|
547
|
+
evalId: meta.id
|
|
548
|
+
});
|
|
549
|
+
evals.set(key, {
|
|
550
|
+
key,
|
|
495
551
|
id: meta.id,
|
|
496
552
|
title,
|
|
497
|
-
filePath:
|
|
553
|
+
filePath: relativeFilePath,
|
|
498
554
|
sourceFilePath: meta.filePath,
|
|
499
555
|
sourceFingerprint,
|
|
500
556
|
columnDefs,
|
|
@@ -549,10 +605,9 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
549
605
|
runs.set(runId, runState);
|
|
550
606
|
setLatestRunInfoMap({
|
|
551
607
|
latestRunInfoMap,
|
|
552
|
-
evalIds:
|
|
608
|
+
evalIds: getTargetEvalKeys({
|
|
553
609
|
request,
|
|
554
|
-
|
|
555
|
-
knownEvalIds: new Set(evals.keys())
|
|
610
|
+
sortedEvals: getSortedEvalMetas()
|
|
556
611
|
}),
|
|
557
612
|
info: {
|
|
558
613
|
status: "running",
|
|
@@ -633,7 +688,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
633
688
|
getCaseDetail(runId, caseId) {
|
|
634
689
|
const run = runs.get(runId);
|
|
635
690
|
if (!run) return void 0;
|
|
636
|
-
return run.caseDetails.get(caseId);
|
|
691
|
+
return run.caseDetails.get(caseId) ?? run.caseDetails.get(getCaseRowCaseKey(run.cases.find((caseRow) => getCaseRowCaseKey(caseRow) === caseId || caseRow.caseId === caseId) ?? { caseId }));
|
|
637
692
|
},
|
|
638
693
|
subscribe(runId, listener) {
|
|
639
694
|
const run = runs.get(runId);
|
|
@@ -799,6 +854,7 @@ function parseArgs(argv) {
|
|
|
799
854
|
helpTopic: "global",
|
|
800
855
|
unknownHelpTarget: void 0,
|
|
801
856
|
evalIds: [],
|
|
857
|
+
files: [],
|
|
802
858
|
caseIds: [],
|
|
803
859
|
trials: 1,
|
|
804
860
|
json: false,
|
|
@@ -834,6 +890,9 @@ function parseArgs(argv) {
|
|
|
834
890
|
else if (arg === "--eval" && next) {
|
|
835
891
|
args.evalIds.push(...next.split(","));
|
|
836
892
|
i++;
|
|
893
|
+
} else if (arg === "--file" && next) {
|
|
894
|
+
args.files.push(...next.split(","));
|
|
895
|
+
i++;
|
|
837
896
|
} else if (arg === "--case" && next) {
|
|
838
897
|
args.caseIds.push(...next.split(","));
|
|
839
898
|
i++;
|
|
@@ -899,6 +958,28 @@ async function runCli(argv) {
|
|
|
899
958
|
function isCliCommand(command) {
|
|
900
959
|
return command === "app" || command === "list" || command === "run" || command === "show-runs" || command === "cache" || command === "help";
|
|
901
960
|
}
|
|
961
|
+
/**
 * Escape every regular-expression metacharacter in `value` so it matches literally.
 *
 * `*` is escaped as well, so the helper is safe for arbitrary text and not only for
 * characters that the glob translator has already filtered.
 *
 * @param {string} value - Text to embed in a regular expression.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(value) {
	return value.replace(/[|\\{}()[\]^$+*?.]/g, "\\$&");
}

/**
 * Translate a path glob into an anchored regular expression.
 *
 * Supported syntax:
 * - a `**` segment followed by `/` matches zero or more whole directories, so
 *   `src/**` + `/x.ts` matches both `src/x.ts` and `src/a/b/x.ts`;
 * - any other `**` matches any characters, including `/`;
 * - `*` matches any run of characters within one path segment;
 * - `?` matches exactly one character other than `/`.
 * Backslashes in the pattern are treated as path separators, so there is no escape syntax.
 *
 * @param {string} pattern - Glob pattern using `/` or `\` as the separator.
 * @returns {RegExp} Expression that matches complete forward-slash paths.
 */
function globToRegex(pattern) {
	const normalized = pattern.replaceAll("\\", "/");
	let regex = "^";
	for (let i = 0; i < normalized.length; i++) {
		const char = normalized[i];
		if (char === "*" && normalized[i + 1] === "*") {
			const startsSegment = i === 0 || normalized[i - 1] === "/";
			if (startsSegment && normalized[i + 2] === "/") {
				// Make the directory prefix optional so the globstar can also match
				// zero directories; consume both stars and the following slash.
				regex += "(?:.*/)?";
				i += 2;
			} else {
				regex += ".*";
				i += 1;
			}
		} else if (char === "*") regex += "[^/]*";
		else if (char === "?") regex += "[^/]";
		else regex += escapeRegex(char);
	}
	return new RegExp(`${regex}$`);
}

/**
 * Check whether a workspace-relative file path is selected by a path or glob pattern.
 *
 * Backslashes in the pattern are normalized to `/`, and leading `./` prefixes are ignored
 * on both sides so `./evals/a.eval.ts` selects `evals/a.eval.ts`.
 *
 * @param {string} pattern - Literal path or glob supplied by the user.
 * @param {string} filePath - Forward-slash file path to test.
 * @returns {boolean} True when the path equals the pattern or matches it as a glob.
 */
function fileMatches(pattern, filePath) {
	const stripCurrentDir = (path) => path.replace(/^(?:\.\/)+/, "");
	const normalizedPattern = stripCurrentDir(pattern.replaceAll("\\", "/"));
	const normalizedPath = stripCurrentDir(filePath);
	return normalizedPattern === normalizedPath || globToRegex(normalizedPattern).test(normalizedPath);
}
|
|
902
983
|
function loadWorkspaceEnv() {
|
|
903
984
|
const envPath = resolve(process.cwd(), ".env");
|
|
904
985
|
if (!existsSync(envPath)) return true;
|
|
@@ -959,8 +1040,8 @@ async function commandApp(args) {
|
|
|
959
1040
|
const { serve } = await import("@hono/node-server");
|
|
960
1041
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
961
1042
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
962
|
-
const appModule = await import("./app-
|
|
963
|
-
const runnerModule = await import("./runner-
|
|
1043
|
+
const appModule = await import("./app-DS3j_AyX.mjs");
|
|
1044
|
+
const runnerModule = await import("./runner-B2f2TEjp.mjs");
|
|
964
1045
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
965
1046
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
966
1047
|
await runnerModule.initRunner();
|
|
@@ -973,9 +1054,16 @@ async function commandApp(args) {
|
|
|
973
1054
|
async function commandList(args_) {
|
|
974
1055
|
const runner = createRunner({ watchForChanges: false });
|
|
975
1056
|
await runner.init();
|
|
1057
|
+
const discoveryIssues = runner.getDiscoveryIssues();
|
|
1058
|
+
if (discoveryIssues.length > 0) {
|
|
1059
|
+
console.error("Discovery errors:\n");
|
|
1060
|
+
for (const issue of discoveryIssues) console.error(` ${issue.message}`);
|
|
1061
|
+
console.error("");
|
|
1062
|
+
}
|
|
976
1063
|
const evals = runner.getEvals();
|
|
977
1064
|
if (evals.length === 0) {
|
|
978
1065
|
console.info("No eval files found.");
|
|
1066
|
+
if (discoveryIssues.length > 0) process.exit(1);
|
|
979
1067
|
return;
|
|
980
1068
|
}
|
|
981
1069
|
console.info("Discovered evals:\n");
|
|
@@ -994,12 +1082,13 @@ async function commandList(args_) {
|
|
|
994
1082
|
if (ev.caseCount !== null) console.info(` cases: ${String(ev.caseCount)}`);
|
|
995
1083
|
console.info("");
|
|
996
1084
|
}
|
|
1085
|
+
if (discoveryIssues.length > 0) process.exit(1);
|
|
997
1086
|
}
|
|
998
1087
|
async function commandRun(args) {
|
|
999
1088
|
const runner = createRunner({ watchForChanges: false });
|
|
1000
1089
|
await runner.init();
|
|
1001
|
-
if (args.evalIds.length === 0 && args.caseIds.length === 0 && !runner.getAllowCliRunAll()) {
|
|
1002
|
-
console.error("This workspace disables running all evals from the CLI. Pass --eval <id
|
|
1090
|
+
if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
|
|
1091
|
+
console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
|
|
1003
1092
|
process.exit(1);
|
|
1004
1093
|
return;
|
|
1005
1094
|
}
|
|
@@ -1013,10 +1102,15 @@ async function commandRun(args) {
|
|
|
1013
1102
|
const target = args.caseIds.length > 0 ? {
|
|
1014
1103
|
mode: "caseIds",
|
|
1015
1104
|
caseIds: args.caseIds,
|
|
1016
|
-
evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
|
|
1105
|
+
evalIds: args.evalIds.length > 0 ? args.evalIds : void 0,
|
|
1106
|
+
files: args.files.length > 0 ? args.files : void 0
|
|
1017
1107
|
} : args.evalIds.length > 0 ? {
|
|
1018
1108
|
mode: "evalIds",
|
|
1019
|
-
evalIds: args.evalIds
|
|
1109
|
+
evalIds: args.evalIds,
|
|
1110
|
+
files: args.files.length > 0 ? args.files : void 0
|
|
1111
|
+
} : args.files.length > 0 ? {
|
|
1112
|
+
mode: "evalIds",
|
|
1113
|
+
files: args.files
|
|
1020
1114
|
} : { mode: "all" };
|
|
1021
1115
|
const run = await runner.startRun({
|
|
1022
1116
|
target,
|
|
@@ -1046,8 +1140,12 @@ async function commandRun(args) {
|
|
|
1046
1140
|
console.info(`Errors: ${String(summary.errorCases)}`);
|
|
1047
1141
|
if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
|
|
1048
1142
|
if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
|
|
1143
|
+
if (summary.errorMessage !== null) {
|
|
1144
|
+
console.info("");
|
|
1145
|
+
console.info(summary.errorMessage);
|
|
1146
|
+
}
|
|
1049
1147
|
}
|
|
1050
|
-
if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
|
|
1148
|
+
if (summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
|
|
1051
1149
|
}
|
|
1052
1150
|
async function commandShowRuns(args) {
|
|
1053
1151
|
const runner = createRunner({ watchForChanges: false });
|
|
@@ -1101,8 +1199,9 @@ async function commandCache(args) {
|
|
|
1101
1199
|
return;
|
|
1102
1200
|
}
|
|
1103
1201
|
if (args.subcommand === "clear") {
|
|
1104
|
-
if (args.evalIds.length > 0) {
|
|
1105
|
-
|
|
1202
|
+
if (args.evalIds.length > 0 || args.files.length > 0) {
|
|
1203
|
+
const evalIds = runner.getEvals().filter((ev) => (args.evalIds.length === 0 || args.evalIds.includes(ev.id)) && (args.files.length === 0 || args.files.some((file) => fileMatches(file, ev.filePath)))).map((ev) => ev.id);
|
|
1204
|
+
for (const evalId of evalIds) {
|
|
1106
1205
|
const entries = await runner.listCache();
|
|
1107
1206
|
const prefix = `${evalId}__`;
|
|
1108
1207
|
const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
|
|
@@ -1111,7 +1210,7 @@ async function commandCache(args) {
|
|
|
1111
1210
|
key: entry.key
|
|
1112
1211
|
});
|
|
1113
1212
|
}
|
|
1114
|
-
console.info(`Cleared cache entries for: ${
|
|
1213
|
+
console.info(`Cleared cache entries for: ${evalIds.join(", ")}`);
|
|
1115
1214
|
return;
|
|
1116
1215
|
}
|
|
1117
1216
|
if (args.all) {
|
|
@@ -1130,6 +1229,9 @@ function getSortedRunSnapshots(runner) {
|
|
|
1130
1229
|
}
|
|
1131
1230
|
function buildRunFileIndex(workspaceRoot, run) {
|
|
1132
1231
|
const runDir = join(workspaceRoot, ".agent-evals", "runs", run.manifest.id);
|
|
1232
|
+
const caseIdCounts = /* @__PURE__ */ new Map();
|
|
1233
|
+
for (const caseRow of run.cases) caseIdCounts.set(caseRow.caseId, (caseIdCounts.get(caseRow.caseId) ?? 0) + 1);
|
|
1234
|
+
const seenCaseIds = /* @__PURE__ */ new Set();
|
|
1133
1235
|
return {
|
|
1134
1236
|
id: run.manifest.id,
|
|
1135
1237
|
shortId: run.manifest.shortId,
|
|
@@ -1147,10 +1249,16 @@ function buildRunFileIndex(workspaceRoot, run) {
|
|
|
1147
1249
|
tracesDir: join(runDir, "traces")
|
|
1148
1250
|
},
|
|
1149
1251
|
cases: run.cases.map((caseRow) => {
|
|
1150
|
-
const
|
|
1252
|
+
const duplicateCaseIdCount = caseIdCounts.get(caseRow.caseId) ?? 0;
|
|
1253
|
+
const hasPreviousCaseWithId = seenCaseIds.has(caseRow.caseId);
|
|
1254
|
+
const fileId = duplicateCaseIdCount > 1 && hasPreviousCaseWithId ? caseRow.caseKey ?? caseRow.caseId : caseRow.caseId;
|
|
1255
|
+
seenCaseIds.add(caseRow.caseId);
|
|
1256
|
+
const fileName = `${encodeURIComponent(fileId)}.json`;
|
|
1151
1257
|
return {
|
|
1152
1258
|
caseId: caseRow.caseId,
|
|
1259
|
+
caseKey: caseRow.caseKey,
|
|
1153
1260
|
evalId: caseRow.evalId,
|
|
1261
|
+
evalKey: caseRow.evalKey,
|
|
1154
1262
|
status: caseRow.status,
|
|
1155
1263
|
files: {
|
|
1156
1264
|
caseDetail: join(runDir, "case-details", fileName),
|
|
@@ -1262,7 +1370,8 @@ Usage:
|
|
|
1262
1370
|
|
|
1263
1371
|
Flags:
|
|
1264
1372
|
--eval <id> Run specific eval(s) (comma-separated)
|
|
1265
|
-
--
|
|
1373
|
+
--file <path|glob> Run eval files matching path/glob (comma-separated)
|
|
1374
|
+
--case <id> Run case(s); combine with --file/--eval if ambiguous
|
|
1266
1375
|
--trials <n> Number of trials per case
|
|
1267
1376
|
--inspect[=host:port] Run with the Node.js inspector enabled
|
|
1268
1377
|
--inspect-brk[=host:port] Enable inspector and pause before startup
|
package/dist/index.d.mts
CHANGED
|
@@ -457,6 +457,7 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
457
457
|
type EvalStatsConfig = z$1.infer<typeof evalStatsConfigSchema>;
|
|
458
458
|
/** Schema summarizing a discovered eval for list and overview screens. */
|
|
459
459
|
declare const evalSummarySchema: z$1.ZodObject<{
|
|
460
|
+
key: z$1.ZodDefault<z$1.ZodString>;
|
|
460
461
|
id: z$1.ZodString;
|
|
461
462
|
title: z$1.ZodOptional<z$1.ZodString>;
|
|
462
463
|
filePath: z$1.ZodString;
|
|
@@ -635,6 +636,8 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
635
636
|
type EvalSummary = z$1.infer<typeof evalSummarySchema>;
|
|
636
637
|
/** Schema for one case row in an eval run result table. */
|
|
637
638
|
declare const caseRowSchema: z$1.ZodObject<{
|
|
639
|
+
evalKey: z$1.ZodOptional<z$1.ZodString>;
|
|
640
|
+
caseKey: z$1.ZodOptional<z$1.ZodString>;
|
|
638
641
|
caseId: z$1.ZodString;
|
|
639
642
|
evalId: z$1.ZodString;
|
|
640
643
|
status: z$1.ZodEnum<{
|
|
@@ -796,6 +799,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
796
799
|
type ScoreTrace = z$1.infer<typeof scoreTraceSchema>;
|
|
797
800
|
/** Schema for the detailed payload shown when opening a specific case. */
|
|
798
801
|
declare const caseDetailSchema: z$1.ZodObject<{
|
|
802
|
+
evalKey: z$1.ZodOptional<z$1.ZodString>;
|
|
803
|
+
caseKey: z$1.ZodOptional<z$1.ZodString>;
|
|
799
804
|
caseId: z$1.ZodString;
|
|
800
805
|
evalId: z$1.ZodString;
|
|
801
806
|
status: z$1.ZodEnum<{
|
|
@@ -1009,6 +1014,43 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
1009
1014
|
}, z$1.core.$strip>;
|
|
1010
1015
|
/** Full case payload including inputs, trace, outputs, and failures. */
|
|
1011
1016
|
type CaseDetail = z$1.infer<typeof caseDetailSchema>;
|
|
1017
|
+
/** Schema for discovery problems that should be shown before running evals. */
|
|
1018
|
+
declare const discoveryIssueSchema: z$1.ZodObject<{
|
|
1019
|
+
type: z$1.ZodEnum<{
|
|
1020
|
+
"duplicate-eval-id": "duplicate-eval-id";
|
|
1021
|
+
}>;
|
|
1022
|
+
severity: z$1.ZodEnum<{
|
|
1023
|
+
error: "error";
|
|
1024
|
+
}>;
|
|
1025
|
+
filePath: z$1.ZodString;
|
|
1026
|
+
evalId: z$1.ZodString;
|
|
1027
|
+
message: z$1.ZodString;
|
|
1028
|
+
}, z$1.core.$strip>;
|
|
1029
|
+
/** Discovery problem found while scanning eval files. */
|
|
1030
|
+
type DiscoveryIssue = z$1.infer<typeof discoveryIssueSchema>;
|
|
1031
|
+
//#endregion
|
|
1032
|
+
//#region ../shared/src/evalIdentity.d.ts
|
|
1033
|
+
/** Build the stable identity for one eval inside a workspace. */
|
|
1034
|
+
declare function buildEvalKey(params: {
|
|
1035
|
+
filePath: string;
|
|
1036
|
+
evalId: string;
|
|
1037
|
+
}): string;
|
|
1038
|
+
/** Build the stable identity for one eval case inside a workspace. */
|
|
1039
|
+
declare function buildCaseKey(params: {
|
|
1040
|
+
filePath: string;
|
|
1041
|
+
evalId: string;
|
|
1042
|
+
caseId: string;
|
|
1043
|
+
}): string;
|
|
1044
|
+
/** Return the collision-safe eval key stored on a row, falling back for legacy data. */
|
|
1045
|
+
declare function getCaseRowEvalKey(row: {
|
|
1046
|
+
evalKey?: string;
|
|
1047
|
+
evalId: string;
|
|
1048
|
+
}): string;
|
|
1049
|
+
/** Return the collision-safe case key stored on a row, falling back for legacy data. */
|
|
1050
|
+
declare function getCaseRowCaseKey(row: {
|
|
1051
|
+
caseKey?: string;
|
|
1052
|
+
caseId: string;
|
|
1053
|
+
}): string;
|
|
1012
1054
|
//#endregion
|
|
1013
1055
|
//#region ../shared/src/schemas/chart.d.ts
|
|
1014
1056
|
/** Chart type rendered for a single eval history chart. */
|
|
@@ -1338,6 +1380,8 @@ declare const runManifestSchema: z$1.ZodObject<{
|
|
|
1338
1380
|
evalIds: "evalIds";
|
|
1339
1381
|
caseIds: "caseIds";
|
|
1340
1382
|
}>;
|
|
1383
|
+
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1384
|
+
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1341
1385
|
evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1342
1386
|
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1343
1387
|
}, z$1.core.$strip>;
|
|
@@ -2772,6 +2816,8 @@ declare const createRunRequestSchema: z$1.ZodObject<{
|
|
|
2772
2816
|
evalIds: "evalIds";
|
|
2773
2817
|
caseIds: "caseIds";
|
|
2774
2818
|
}>;
|
|
2819
|
+
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2820
|
+
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2775
2821
|
evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2776
2822
|
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2777
2823
|
}, z$1.core.$strip>;
|
|
@@ -3564,7 +3610,8 @@ type CacheClearFilter = {
|
|
|
3564
3610
|
type EvalRunner = {
|
|
3565
3611
|
/** Load workspace config, discover evals, and start file watching when enabled. */init(): Promise<void>; /** Return the currently discovered eval summaries for the active workspace. */
|
|
3566
3612
|
getEvals(): EvalSummary[]; /** Look up one discovered eval by id. */
|
|
3567
|
-
getEval(id: string): EvalSummary | undefined; /**
|
|
3613
|
+
getEval(id: string): EvalSummary | undefined; /** Return discovery errors that should be shown before running evals. */
|
|
3614
|
+
getDiscoveryIssues(): DiscoveryIssue[]; /** Re-scan configured eval files and emit a discovery update to listeners. */
|
|
3568
3615
|
refreshDiscovery(): Promise<void>;
|
|
3569
3616
|
startRun(request: CreateRunRequest): Promise<{
|
|
3570
3617
|
manifest: RunManifest;
|
|
@@ -3617,11 +3664,19 @@ type EvalRunner = {
|
|
|
3617
3664
|
* Remove cache entries matching `filter`, or all entries when no filter is
|
|
3618
3665
|
* supplied.
|
|
3619
3666
|
*/
|
|
3620
|
-
clearCache(filter?: CacheClearFilter): Promise<void>;
|
|
3621
|
-
|
|
3667
|
+
clearCache(filter?: CacheClearFilter): Promise<void>;
|
|
3668
|
+
/**
|
|
3669
|
+
* Recompute persisted case and run statuses for terminal runs touching one
|
|
3670
|
+
* eval. Accepts the exact eval key, with a legacy fallback for unique eval ids.
|
|
3671
|
+
*/
|
|
3672
|
+
recomputeStatusesForEval(evalKey: string): Promise<{
|
|
3622
3673
|
updatedRuns: number;
|
|
3623
|
-
}>;
|
|
3624
|
-
|
|
3674
|
+
}>;
|
|
3675
|
+
/**
|
|
3676
|
+
* Delete terminal persisted runs that touch one eval from memory and disk.
|
|
3677
|
+
* Accepts the exact eval key, with a legacy fallback for unique eval ids.
|
|
3678
|
+
*/
|
|
3679
|
+
cleanRunsForEval(evalKey: string): Promise<{
|
|
3625
3680
|
deletedRuns: number;
|
|
3626
3681
|
}>; /** Persist a UI-authored manual score for one case and recompute affected summaries. */
|
|
3627
3682
|
updateManualScore(params: {
|
|
@@ -3667,4 +3722,4 @@ declare function createRunner({
|
|
|
3667
3722
|
*/
|
|
3668
3723
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3669
3724
|
//#endregion
|
|
3670
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type 
NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, 
evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3725
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type 
LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, 
evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as llmCallsConfigSchema, $t as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, 
runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as 
columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-B31SV_Bq.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-ETfZ15RB.mjs";
|
|
3
|
+
import "./src-CyNb2ycA.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|