@ls-stack/agent-eval 0.36.0 → 0.37.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BlNzXWDM.mjs → app-C7ON9Wdh.mjs} +39 -4
- package/dist/apps/web/dist/assets/index-BiwYbMem.js +140 -0
- package/dist/apps/web/dist/assets/{index-D0rC5MSS.css → index-CKdoOah2.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Dg3abrOv.mjs → cli-CwGcJYWe.mjs} +57 -8
- package/dist/index.d.mts +41 -35
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +44 -2
- package/dist/{runOrchestration-V1TxX8es.mjs → runOrchestration-C4o5TcIu.mjs} +42 -7
- package/dist/{runner-BCs5rzej.mjs → runner-BTH8m_Er.mjs} +2 -2
- package/dist/{runner-znY6PY1M.mjs → runner-LqeHPID6.mjs} +1 -1
- package/dist/src--13_4uDG.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +9 -1
- package/dist/apps/web/dist/assets/index-BYtcGddU.js +0 -140
- package/dist/src-DBypR4TV.mjs +0 -3
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { At as evalStatsConfigSchema, C as parseEvalDiscovery, En as columnDefSchema, J as runManifestSchema, M as createRunRequestSchema, Qn as configureEvalRunLogs, Rt as manualInputDescriptorSchema, T as loadConfig, Y as runSummarySchema, bt as buildEvalKey, en as evalChartsConfigSchema, k as createFsCacheStore, p as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C4o5TcIu.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -14,6 +14,7 @@ const evalMetaSchema = z.object({
|
|
|
14
14
|
sourceFingerprint: z.string().nullable(),
|
|
15
15
|
columnDefs: z.array(columnDefSchema),
|
|
16
16
|
caseCount: z.number().nullable(),
|
|
17
|
+
caseIds: z.array(z.string()).optional(),
|
|
17
18
|
stats: evalStatsConfigSchema.optional(),
|
|
18
19
|
charts: evalChartsConfigSchema.optional(),
|
|
19
20
|
manualInputDescriptor: manualInputDescriptorSchema.optional(),
|
|
@@ -27,6 +28,7 @@ const runChildContextSchema = z.object({
|
|
|
27
28
|
summary: runSummarySchema,
|
|
28
29
|
evals: z.array(evalMetaSchema).optional()
|
|
29
30
|
});
|
|
31
|
+
let activeContext;
|
|
30
32
|
function sendMessage(message) {
|
|
31
33
|
if (process.send === void 0) return;
|
|
32
34
|
process.send(message);
|
|
@@ -93,6 +95,7 @@ async function main() {
|
|
|
93
95
|
process.exit(1);
|
|
94
96
|
});
|
|
95
97
|
const context = await readContext(process.argv[2]);
|
|
98
|
+
activeContext = context;
|
|
96
99
|
process.chdir(context.workspaceRoot);
|
|
97
100
|
const config = await loadConfig();
|
|
98
101
|
configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false });
|
|
@@ -153,7 +156,46 @@ async function main() {
|
|
|
153
156
|
evals: [...evals.values()]
|
|
154
157
|
});
|
|
155
158
|
}
|
|
156
|
-
|
|
159
|
+
/**
 * Last-resort handler for an error that escaped `main()`.
 *
 * Always marks the process as failed (`process.exitCode = 1`) and prints the
 * formatted error to stderr. When the run context has already been read, it
 * additionally persists the run with status `"error"` and emits a `run.error`
 * event through `sendMessage`.
 *
 * Persisting is best-effort: if `persistRunState` rejects, that failure is
 * logged and the `run.error` event is still sent, so this handler does not
 * reject because the run directory could not be written.
 *
 * @param {unknown} error - The value `main()` rejected with.
 * @returns {Promise<void>}
 */
async function handleFatalRunChildError(error) {
	const message = formatUnknownErrorDetails(error);
	process.exitCode = 1;
	console.error(message);
	// Without a context there is no run directory or run id to report against.
	if (activeContext === void 0) return;
	const { runDir, manifest, summary } = activeContext;
	const endedAt = new Date().toISOString();
	try {
		await persistRunState({
			runDir,
			manifest: {
				...manifest,
				status: "error",
				endedAt
			},
			summary: {
				...summary,
				status: "error",
				errorMessage: message
			},
			cases: [],
			caseDetails: new Map(),
			listeners: new Set()
		});
	} catch (persistError) {
		// Do not let a failed write hide the original error from the listener.
		console.error(formatUnknownErrorDetails(persistError));
	}
	sendMessage({
		type: "event",
		event: {
			type: "run.error",
			runId: manifest.id,
			timestamp: endedAt,
			payload: { message }
		}
	});
}
|
|
191
|
+
/**
 * Turns any thrown value into printable text without ever throwing itself.
 *
 * - `Error` instances yield their stack, or their message when no stack exists.
 * - Strings are returned unchanged.
 * - Other values go through `String()`. When that throws (for example an
 *   object created with `Object.create(null)`) or only produces the
 *   uninformative `"[object Object]"`, a JSON rendering is used if available.
 *
 * @param {unknown} error - The thrown value.
 * @returns {string} A human-readable description of `error`.
 */
function formatUnknownErrorDetails(error) {
	if (error instanceof Error) return error.stack ?? error.message;
	if (typeof error === "string") return error;
	let text;
	try {
		text = String(error);
	} catch {
		// Null-prototype objects and throwing toString/valueOf end up here.
		text = void 0;
	}
	if (text !== void 0 && text !== "[object Object]") return text;
	try {
		const json = JSON.stringify(error);
		if (typeof json === "string") return json;
	} catch {
		// Circular structures, BigInt members or a throwing toJSON: fall through.
	}
	return text ?? Object.prototype.toString.call(error);
}
|
|
196
|
+
// Entry point: run the child, route any error that escapes `main()` through
// the fatal handler, and release the IPC channel afterwards in every case.
try {
	await main();
} catch (error) {
	await handleFatalRunChildError(error);
} finally {
	// `process.disconnect` is undefined when the process was not spawned with an
	// IPC channel. `sendMessage` already tolerates that case, so guard it here too.
	if (process.connected) process.disconnect();
}
|
|
158
200
|
//#endregion
|
|
159
201
|
export {};
|
|
@@ -2980,6 +2980,8 @@ const evalSummarySchema = z.object({
|
|
|
2980
2980
|
currentCommitSha: z.string().nullable(),
|
|
2981
2981
|
columnDefs: z.array(columnDefSchema),
|
|
2982
2982
|
caseCount: z.number().nullable(),
|
|
2983
|
+
/** Authored case ids discovered for this eval, when case generation has run. */
|
|
2984
|
+
caseIds: z.array(z.string()).optional(),
|
|
2983
2985
|
lastRunStatus: z.enum([
|
|
2984
2986
|
"pass",
|
|
2985
2987
|
"fail",
|
|
@@ -3626,6 +3628,11 @@ const runManifestSchema = z.object({
|
|
|
3626
3628
|
"cancelled",
|
|
3627
3629
|
"error"
|
|
3628
3630
|
]),
|
|
3631
|
+
/**
|
|
3632
|
+
* Temporary runs are persisted like normal runs, but are deleted before the
|
|
3633
|
+
* next run starts. Older persisted runs default to durable history.
|
|
3634
|
+
*/
|
|
3635
|
+
temporary: z.boolean().optional().default(false),
|
|
3629
3636
|
startedAt: z.string(),
|
|
3630
3637
|
endedAt: z.string().nullable(),
|
|
3631
3638
|
/**
|
|
@@ -4521,6 +4528,11 @@ const createRunRequestSchema = z.object({
|
|
|
4521
4528
|
}),
|
|
4522
4529
|
trials: z.number().min(1),
|
|
4523
4530
|
/**
|
|
4531
|
+
* Persist this run as temporary history. Temporary runs are visible while
|
|
4532
|
+
* present, then deleted before the next run of any kind starts.
|
|
4533
|
+
*/
|
|
4534
|
+
temporary: z.boolean().optional(),
|
|
4535
|
+
/**
|
|
4524
4536
|
* Optional cache controls for the run. When omitted, the cache is used in
|
|
4525
4537
|
* its default read-through / write-on-miss mode.
|
|
4526
4538
|
*/
|
|
@@ -6188,6 +6200,27 @@ function runTouchesEval(params) {
|
|
|
6188
6200
|
if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? params.target.evalIds?.includes(params.evalId ?? params.evalKey) ?? false;
|
|
6189
6201
|
return false;
|
|
6190
6202
|
}
|
|
6203
|
+
/**
 * Drops every run whose manifest is flagged `temporary` from `params.runs`
 * and removes its run directory from disk.
 *
 * A temporary run that is still `"running"` is first marked as cancelled in
 * place (manifest status and end time, summary status and total duration) and
 * passed to `params.cancelRunningRun`.
 *
 * @param {object} params
 * @param {Map<string, object>} params.runs - Runs keyed by run id; mutated.
 * @param {(run: object) => void} params.cancelRunningRun - Called for each
 *   temporary run that was still running.
 * @returns {Promise<number>} How many runs were deleted.
 */
async function deleteTemporaryRuns(params) {
	let removedCount = 0;
	// Work on a snapshot because entries are removed while looping.
	const snapshot = Array.from(params.runs);
	for (const [runId, run] of snapshot) {
		if (run.manifest.temporary === true) {
			if (run.manifest.status === "running") {
				const now = new Date();
				Object.assign(run.manifest, {
					status: "cancelled",
					endedAt: now.toISOString()
				});
				run.summary.status = "cancelled";
				const startedMs = new Date(run.manifest.startedAt).getTime();
				run.summary.totalDurationMs = now.getTime() - startedMs;
				params.cancelRunningRun(run);
			}
			params.runs.delete(runId);
			await rm(run.runDir, {
				force: true,
				recursive: true
			});
			removedCount += 1;
		}
	}
	return removedCount;
}
|
|
6191
6224
|
async function recomputeEvalStatusesInRuns(params) {
|
|
6192
6225
|
let updatedRuns = 0;
|
|
6193
6226
|
for (const run of params.runs) {
|
|
@@ -6384,6 +6417,12 @@ function encodeCaseDetailFileName(caseId) {
|
|
|
6384
6417
|
return encodeURIComponent(caseId);
|
|
6385
6418
|
}
|
|
6386
6419
|
//#endregion
|
|
6420
|
+
//#region ../runner/src/stackFormatting.ts
|
|
6421
|
+
/**
 * Matches a literal `[`, optional `;`-separated groups of one to three digits,
 * and a closing `m` (for example `[m`, `[0m`, `[1;31m`) with no ESC prefix.
 */
const orphanedAnsiSgrPattern = /\[(?:\d{1,3}(?:;\d{1,3})*)?m/g;
/**
 * Removes terminal control codes from `value`.
 *
 * Runs `stripVTControlCharacters` first, then deletes every remaining match of
 * `orphanedAnsiSgrPattern`.
 *
 * @param {string} value - Text that may contain terminal escape sequences.
 * @returns {string} The text with those sequences removed.
 */
function stripTerminalControlCodes(value) {
	const withoutVtCodes = stripVTControlCharacters(value);
	return withoutVtCodes.replace(orphanedAnsiSgrPattern, "");
}
|
|
6425
|
+
//#endregion
|
|
6387
6426
|
//#region ../runner/src/moduleIsolation.ts
|
|
6388
6427
|
const isolationParam = "agent-evals-isolate";
|
|
6389
6428
|
const pathSegmentSeparatorPattern = /[\\/]+/;
|
|
@@ -6474,12 +6513,6 @@ async function runWithModuleIsolation(context, fn) {
|
|
|
6474
6513
|
return await isolationStorage.run(context, fn);
|
|
6475
6514
|
}
|
|
6476
6515
|
//#endregion
|
|
6477
|
-
//#region ../runner/src/stackFormatting.ts
|
|
6478
|
-
const orphanedAnsiSgrPattern = /\[(?:\d{1,3}(?:;\d{1,3})*)?m/g;
|
|
6479
|
-
function stripTerminalControlCodes(value) {
|
|
6480
|
-
return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
|
|
6481
|
-
}
|
|
6482
|
-
//#endregion
|
|
6483
6516
|
//#region ../runner/src/runExecution.ts
|
|
6484
6517
|
function filterEvalCases(cases, caseIds) {
|
|
6485
6518
|
if (!caseIds || caseIds.length === 0) return cases;
|
|
@@ -7061,6 +7094,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7061
7094
|
const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
|
|
7062
7095
|
if (duplicateCaseIds.length > 0) throw new Error(`Duplicate case id${duplicateCaseIds.length === 1 ? "" : "s"} in ${evalMeta.filePath}#${evalMeta.id}: ${duplicateCaseIds.join(", ")}`);
|
|
7063
7096
|
const cases = filterEvalCases(runnableCases, request.target.caseIds);
|
|
7097
|
+
evalMeta.caseCount = runnableCases.length;
|
|
7098
|
+
evalMeta.caseIds = runnableCases.map((evalCase) => evalCase.id);
|
|
7064
7099
|
runState.summary.totalCases += cases.length;
|
|
7065
7100
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
7066
7101
|
evalDef,
|
|
@@ -7269,4 +7304,4 @@ function toLastRunStatus(status) {
|
|
|
7269
7304
|
return status === "pending" ? null : status;
|
|
7270
7305
|
}
|
|
7271
7306
|
//#endregion
|
|
7272
|
-
export {
|
|
7307
|
+
export { apiCallMetricFormatSchema as $, evalAssert as $n, evalChartTypeSchema as $t, configReloadStateSchema as A, jsonCellSchema as An, evalStatsConfigSchema as At, simulateTokenAllocation as B, hashCacheKeySync as Bn, manualInputJsonFieldSchema as Bt, parseEvalDiscovery as C, traceSpanSchema as Cn, assertionFailureSchema as Ct, normalizeScoreDef as D, columnFormatSchema as Dn, evalFreshnessStatusSchema as Dt, buildDeclaredColumnDefs as E, columnDefSchema as En, discoveryIssueSchema as Et, extractCacheEntries as F, buildTraceTree as Fn, runLogPhaseSchema as Ft, deriveScopedSummaryFromCases as G, repoFile as Gn, manualInputTextFieldSchema as Gt, getNestedAttribute as H, deserializeCacheValue as Hn, manualInputNumberFieldSchema as Ht, extractCacheHits as I, captureEvalSpanError as In, scoreTraceSchema as It, runManifestSchema as J, evalExpect as Jn, evalChartBuiltinMetricSchema as Jt, deriveStatusFromCaseRows as K, manualInputFileValueSchema as Kn, evalChartAggregateSchema as Kt, extractApiCalls as L, evalSpan as Ln, manualInputBooleanFieldSchema as Lt, createRunRequestSchema as M, repoFileRefSchema as Mn, runLogEntrySchema as Mt, updateManualScoreRequestSchema as N, runArtifactRefSchema as Nn, runLogLevelSchema as Nt, validateCharts as O, columnKindSchema as On, evalStatAggregateSchema as Ot, sseEnvelopeSchema as P, z$1 as Pn, runLogLocationSchema as Pt, agentEvalsConfigSchema as Q, configureEvalRunLogs as Qn, evalChartTooltipExtraSchema as Qt, extractLlmCalls as R, evalTracer as Rn, manualInputDescriptorSchema as Rt, loadEvalModule as S, traceSpanKindSchema as Sn, getCaseRowEvalKey as St, loadConfig as T, cellValueSchema as Tn, caseRowSchema as Tt, getEvalTitle as U, serializeCacheRecording as Un, manualInputSelectFieldSchema as Ut, applyDerivedCallAttributes as V, deserializeCacheRecording as Vn, manualInputMultilineFieldSchema as Vt, getEvalDisplayStatus as W, serializeCacheValue as Wn, manualInputSelectOptionSchema as Wt, DEFAULT_API_CALLS_CONFIG as X, 
advanceEvalTime as Xn, evalChartConfigSchema as Xt, runSummarySchema as Y, EvalAssertionError as Yn, evalChartColorSchema as Yt, DEFAULT_LLM_CALLS_CONFIG as Z, appendToEvalOutput as Zn, evalChartMetricSchema as Zt, resolveTracePresentation as _, traceAttributeDisplayPlacementSchema as _n, runLogsConfigSchema as _t, generateRunId as a, cacheFileSchema as an, isInEvalScope as ar, evalColumnsSchema as at, parseManualInputValues as b, traceDisplayInputConfigSchema as bn, buildEvalKey as bt, loadPersistedRunSnapshot as c, cacheOperationTypeSchema as cn, runInEvalRuntimeScope as cr, llmCallMetricFormatSchema as ct, persistCaseDetail as d, cacheStatusSchema as dn, setEvalOutput as dr, llmCallPricingRateSchema as dt, evalChartsConfigSchema as en, evalLog as er, apiCallMetricPlacementSchema as et, deleteTemporaryRuns as f, serializedCacheSpanSchema as fn, setScopeCacheContext as fr, llmCallPricingSchema as ft, runTouchesEval as g, traceAttributeDisplayInputSchema as gn, resolveLlmCallsConfig as gt, recomputePersistedCaseStatus as h, traceAttributeDisplayFormatSchema as hn, getEvalRegistry as hr, resolveApiCallsConfig as ht, stripTerminalControlCodes as i, cacheEntryWithDebugKeySchema as in, incrementEvalOutput as ir, evalColumnOverrideSchema as it, configReloadStatusSchema as j, numberDisplayOptionsSchema as jn, evalSummarySchema as jt, createFsCacheStore as k, fileRefSchema as kn, evalStatItemSchema as kt, loadPersistedRunSnapshots as l, cacheRecordingOpSchema as ln, runInEvalScope as lr, llmCallMetricPlacementSchema as lt, recomputeEvalStatusesInRuns as m, traceCacheRefSchema as mn, defineEval as mr, removeDefaultConfigSchema as mt, getTargetEvalKeys as n, cacheDebugKeyFileSchema as nn, getEvalCaseInput as nr, apiCallsConfigSchema as nt, getLastRunStatuses as o, cacheListItemSchema as on, mergeEvalOutput as or, evalDeriveConfigSchema as ot, persistRunState as p, spanCacheOptionsSchema as pn, startEvalBackgroundJob as pr, llmCallsConfigSchema as pt, 
deriveStatusFromChildStatuses as q, readManualInputFile as qn, evalChartAxisSchema as qt, getTargetEvals as r, cacheEntrySchema as rn, getEvalStartTime as rr, defaultConfigKeySchema as rt, getLatestRunInfos as s, cacheModeSchema as sn, nextEvalId as sr, llmCallCostCurrencySchema as st, executeRun as t, cacheDebugKeyEntrySchema as tn, getCurrentScope as tr, apiCallMetricSchema as tt, nextShortIdFromSnapshots as u, cacheRecordingSchema as un, runInExistingEvalScope as ur, llmCallMetricSchema as ut, resolveArtifactPath as v, traceAttributeDisplaySchema as vn, trialSelectionModeSchema as vt, resolveEvalDefaultConfig as w, traceSpanWarningSchema as wn, caseDetailSchema as wt, deriveEvalFreshness as x, traceSpanErrorSchema as xn, getCaseRowCaseKey as xt, buildManualInputDescriptor as y, traceDisplayConfigSchema as yn, buildCaseKey as yt, simulateLlmCallCost as z, hashCacheKey as zn, manualInputFieldDescriptorSchema as zt };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-Dg3abrOv.mjs";
|
|
2
|
-
import "./src
|
|
1
|
+
import { n as createRunner } from "./cli-CwGcJYWe.mjs";
|
|
2
|
+
import "./src--13_4uDG.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-BCs5rzej.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-BTH8m_Er.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.37.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -32,8 +32,8 @@
|
|
|
32
32
|
"@types/node": "^24.7.2",
|
|
33
33
|
"typescript": "^5.9.2",
|
|
34
34
|
"@agent-evals/runner": "0.0.1",
|
|
35
|
-
"@agent-evals/
|
|
36
|
-
"@agent-evals/
|
|
35
|
+
"@agent-evals/sdk": "0.0.1",
|
|
36
|
+
"@agent-evals/shared": "0.0.1"
|
|
37
37
|
},
|
|
38
38
|
"scripts": {
|
|
39
39
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -27,7 +27,13 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
27
27
|
- Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
|
|
28
28
|
for targeted CLI runs. Set `allowCliRunAll: true` in
|
|
29
29
|
`agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
|
|
30
|
-
still run grouped evals and confirms before starting more than five.
|
|
30
|
+
still run grouped evals and confirms before starting more than five. On a
|
|
31
|
+
single eval page, the Run chevron can open a picker to run specific authored
|
|
32
|
+
case ids; those case-picked runs are temporary by default and can be made
|
|
33
|
+
durable in the modal.
|
|
34
|
+
- `agent-evals run --temporary` persists a run like normal history, but deletes
|
|
35
|
+
it before the next run starts. Temporary runs appear in `show-runs` and the UI
|
|
36
|
+
while present; normal runs are never deleted by temporary-run cleanup.
|
|
31
37
|
- `agent-evals app` watches `agent-evals.config.ts` and reloads config in
|
|
32
38
|
place when the runner is idle. If config changes during an active run, the UI
|
|
33
39
|
shows a pending reload banner and blocks new runs until the current run
|
|
@@ -507,6 +513,8 @@ Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
|
|
|
507
513
|
`.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
|
|
508
514
|
metadata, a run summary, per-case results, and per-case trace JSON. Inspect
|
|
509
515
|
these when debugging persisted output, costs, columns, traces, or failures.
|
|
516
|
+
Temporary runs use the same directory layout, but are removed before the next
|
|
517
|
+
run of any kind starts.
|
|
510
518
|
|
|
511
519
|
Use `agent-evals show-runs` when you need stable file
|
|
512
520
|
paths before reading saved output:
|