@ls-stack/agent-eval 0.42.0 → 0.42.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-1vE5Ryry.mjs → app-DPamBr5R.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-5CB9eJZy.js → index-DH0qYD76.js} +2 -2
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Bk5g-bat.mjs → cli-BeJCJMQo.mjs} +49 -23
- package/dist/index.d.mts +8 -8
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-DhTiT4V0.mjs → runOrchestration-OVUFw1fL.mjs} +18 -30
- package/dist/{runner-BG0L4yId.mjs → runner-B1KygirW.mjs} +2 -2
- package/dist/{runner-B1Cyevvr.mjs → runner-BJQq7cpd.mjs} +1 -1
- package/dist/{src-t6OVp1li.mjs → src-D7_xKo7h.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +16 -9
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-5CB9eJZy.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-DH0qYD76.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-eFM9VIsz.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig,
|
|
1
|
+
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-OVUFw1fL.mjs";
|
|
2
2
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
4
4
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -476,6 +476,11 @@ function validateManualInputsForRequest(params) {
|
|
|
476
476
|
};
|
|
477
477
|
}
|
|
478
478
|
//#endregion
|
|
479
|
+
//#region ../runner/src/objectUtils.ts
|
|
480
|
+
function isRecord(value) {
|
|
481
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
482
|
+
}
|
|
483
|
+
//#endregion
|
|
479
484
|
//#region ../runner/src/recalculateDerivedAttributes.ts
|
|
480
485
|
function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
|
|
481
486
|
const caseKey = getCaseRowCaseKey(caseRow);
|
|
@@ -505,7 +510,7 @@ async function recalculateDerivedAttributesForCase(params) {
|
|
|
505
510
|
});
|
|
506
511
|
let nextTrace = spansWithDerivedAttributes;
|
|
507
512
|
let nextTraceDisplay = caseDetail.traceDisplay;
|
|
508
|
-
const evalMeta = params.evals.get(
|
|
513
|
+
const evalMeta = caseRow.evalKey === void 0 ? void 0 : params.evals.get(caseRow.evalKey);
|
|
509
514
|
const entry = evalMeta === void 0 ? void 0 : getEvalRegistry().get(evalMeta.id);
|
|
510
515
|
if (entry !== void 0) entry.use((evalDef) => {
|
|
511
516
|
const resolved = resolveTracePresentation(spansWithDerivedAttributes, params.traceDisplayConfig, evalDef.traceDisplay);
|
|
@@ -787,6 +792,36 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
|
|
|
787
792
|
managerContext.emitDiscoveryEvent();
|
|
788
793
|
}
|
|
789
794
|
//#endregion
|
|
795
|
+
//#region ../runner/src/runnerStateHydration.ts
|
|
796
|
+
/** Rehydrate a persisted run while preserving live listeners/process handles. */
|
|
797
|
+
function toRunnerRunState(snapshot, existing) {
|
|
798
|
+
return {
|
|
799
|
+
...snapshot,
|
|
800
|
+
listeners: existing?.listeners ?? /* @__PURE__ */ new Set(),
|
|
801
|
+
childProcess: existing?.childProcess,
|
|
802
|
+
childTerminalReceived: existing?.childTerminalReceived ?? false
|
|
803
|
+
};
|
|
804
|
+
}
|
|
805
|
+
//#endregion
|
|
806
|
+
//#region ../runner/src/runTargetPersistence.ts
|
|
807
|
+
/** Build the exact-key run target persisted in run history. */
|
|
808
|
+
function buildPersistedRunTarget(params) {
|
|
809
|
+
const { target, evalKeys } = params;
|
|
810
|
+
if (target.mode === "all") return { mode: "all" };
|
|
811
|
+
const persistEvalKeys = (target.evalKeys?.length ?? 0) > 0 || (target.evalIds?.length ?? 0) > 0 || (target.files?.length ?? 0) > 0;
|
|
812
|
+
const keyedTarget = {
|
|
813
|
+
mode: target.mode,
|
|
814
|
+
evalKeys: persistEvalKeys && evalKeys.length > 0 ? evalKeys : void 0,
|
|
815
|
+
files: target.files,
|
|
816
|
+
tagsFilter: target.tagsFilter
|
|
817
|
+
};
|
|
818
|
+
if (target.mode === "caseIds") return {
|
|
819
|
+
...keyedTarget,
|
|
820
|
+
caseIds: target.caseIds
|
|
821
|
+
};
|
|
822
|
+
return keyedTarget;
|
|
823
|
+
}
|
|
824
|
+
//#endregion
|
|
790
825
|
//#region ../runner/src/watchRoots.ts
|
|
791
826
|
const globMagicCharacters = new Set([
|
|
792
827
|
"*",
|
|
@@ -823,9 +858,6 @@ function getWatchRootsForIncludePatterns(params) {
|
|
|
823
858
|
}
|
|
824
859
|
//#endregion
|
|
825
860
|
//#region ../runner/src/runner.ts
|
|
826
|
-
function isRecord(value) {
|
|
827
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
828
|
-
}
|
|
829
861
|
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
830
862
|
function createRunner({ watchForChanges = true } = {}) {
|
|
831
863
|
let config;
|
|
@@ -898,7 +930,6 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
898
930
|
const updatedRuns = await recomputeEvalStatusesInRuns({
|
|
899
931
|
runs: runs.values(),
|
|
900
932
|
evalKey: evalMeta.key,
|
|
901
|
-
evalId: evalMeta.id,
|
|
902
933
|
evalExists: evals.has(evalMeta.key),
|
|
903
934
|
scoreThresholds,
|
|
904
935
|
persistCaseDetail
|
|
@@ -930,7 +961,6 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
930
961
|
target: run.manifest.target,
|
|
931
962
|
caseRows: run.cases,
|
|
932
963
|
evalKey: evalMeta?.key ?? evalKey,
|
|
933
|
-
evalId: evalMeta?.id,
|
|
934
964
|
evalExists: evalMeta !== void 0
|
|
935
965
|
})) continue;
|
|
936
966
|
if (run.manifest.status === "running") continue;
|
|
@@ -959,7 +989,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
959
989
|
updated: false,
|
|
960
990
|
reason: "Case not found"
|
|
961
991
|
};
|
|
962
|
-
const evalMeta = evals.get(
|
|
992
|
+
const evalMeta = caseRow.evalKey === void 0 ? void 0 : evals.get(caseRow.evalKey);
|
|
963
993
|
if (!evalMeta) return {
|
|
964
994
|
updated: false,
|
|
965
995
|
reason: "Eval not found"
|
|
@@ -1170,6 +1200,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1170
1200
|
const cacheMode = request.cache?.mode ?? "use";
|
|
1171
1201
|
const runDir = join(localStateDir, "runs", runId);
|
|
1172
1202
|
const gitState = readGitWorktreeState(workspaceRoot);
|
|
1203
|
+
const targetEvalKeys = getTargetEvalKeys({
|
|
1204
|
+
request,
|
|
1205
|
+
sortedEvals: getSortedEvalMetas()
|
|
1206
|
+
});
|
|
1173
1207
|
const manifest = {
|
|
1174
1208
|
id: runId,
|
|
1175
1209
|
shortId,
|
|
@@ -1179,7 +1213,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1179
1213
|
endedAt: null,
|
|
1180
1214
|
commitSha: gitState.commitSha,
|
|
1181
1215
|
evalSourceFingerprints: {},
|
|
1182
|
-
target:
|
|
1216
|
+
target: buildPersistedRunTarget({
|
|
1217
|
+
target: request.target,
|
|
1218
|
+
evalKeys: targetEvalKeys
|
|
1219
|
+
}),
|
|
1183
1220
|
trials: request.trials,
|
|
1184
1221
|
trialSelection: config.trialSelection ?? "lowestScore",
|
|
1185
1222
|
cacheMode
|
|
@@ -1224,10 +1261,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1224
1261
|
runs.set(runId, runState);
|
|
1225
1262
|
setLatestRunInfoMap({
|
|
1226
1263
|
latestRunInfoMap,
|
|
1227
|
-
evalIds:
|
|
1228
|
-
request: materializedRequest,
|
|
1229
|
-
sortedEvals: getSortedEvalMetas()
|
|
1230
|
-
}),
|
|
1264
|
+
evalIds: targetEvalKeys,
|
|
1231
1265
|
info: {
|
|
1232
1266
|
status: "running",
|
|
1233
1267
|
startedAt: now,
|
|
@@ -1478,14 +1512,6 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1478
1512
|
nextShortIdNum = Math.max(nextShortIdNum, nextShortIdFromSnapshots(persistedRuns));
|
|
1479
1513
|
if (changed) emitDiscoveryEvent();
|
|
1480
1514
|
}
|
|
1481
|
-
function toRunnerRunState(snapshot, existing) {
|
|
1482
|
-
return {
|
|
1483
|
-
...snapshot,
|
|
1484
|
-
listeners: existing?.listeners ?? /* @__PURE__ */ new Set(),
|
|
1485
|
-
childProcess: existing?.childProcess,
|
|
1486
|
-
childTerminalReceived: existing?.childTerminalReceived ?? false
|
|
1487
|
-
};
|
|
1488
|
-
}
|
|
1489
1515
|
return runner;
|
|
1490
1516
|
}
|
|
1491
1517
|
//#endregion
|
|
@@ -2046,8 +2072,8 @@ async function commandApp(args) {
|
|
|
2046
2072
|
const { serve } = await import("@hono/node-server");
|
|
2047
2073
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2048
2074
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2049
|
-
const appModule = await import("./app-1vE5Ryry.mjs");
|
|
2050
|
-
const runnerModule = await import("./runner-B1Cyevvr.mjs");
|
|
2075
|
+
const appModule = await import("./app-DPamBr5R.mjs");
|
|
2076
|
+
const runnerModule = await import("./runner-BJQq7cpd.mjs");
|
|
2051
2077
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2052
2078
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2053
2079
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -3849,7 +3849,7 @@ type LlmCallEntry = {
|
|
|
3849
3849
|
cacheCreationInputTokens: number | null;
|
|
3850
3850
|
reasoningTokens: number | null;
|
|
3851
3851
|
totalTokens: number | null; /** Time to first token for the LLM call in milliseconds, when reported by the span. */
|
|
3852
|
-
latencyMs: number | null;
|
|
3852
|
+
latencyMs: number | null; /** Output-token throughput over the full elapsed LLM call duration. */
|
|
3853
3853
|
tokensPerSecond: number | null;
|
|
3854
3854
|
costUsd: number | null;
|
|
3855
3855
|
inputCostUsd: number | null;
|
|
@@ -3956,7 +3956,8 @@ declare function simulateTokenAllocation({
|
|
|
3956
3956
|
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
3957
3957
|
* `string | null` / `number | null`. `latencyMs` is an explicit
|
|
3958
3958
|
* time-to-first-token attribute; full span elapsed time is reported separately
|
|
3959
|
-
* as `durationMs`.
|
|
3959
|
+
* as `durationMs`. `tokensPerSecond` is output tokens divided by that full
|
|
3960
|
+
* elapsed duration. Built-in USD costs are derived only from configured model
|
|
3960
3961
|
* pricing and token counts. `totalTokens` is always derived from input +
|
|
3961
3962
|
* output tokens. Cached input and cache creation tokens are reported
|
|
3962
3963
|
* separately because they are subsets of input/output usage. The main cache
|
|
@@ -5885,7 +5886,7 @@ type EvalRunner = {
|
|
|
5885
5886
|
clearCache(filter?: CacheClearFilter): Promise<void>;
|
|
5886
5887
|
/**
|
|
5887
5888
|
* Recompute persisted case and run statuses for terminal runs touching one
|
|
5888
|
-
* eval. Accepts the exact eval key
|
|
5889
|
+
* eval. Accepts the exact eval key.
|
|
5889
5890
|
*/
|
|
5890
5891
|
recomputeStatusesForEval(evalKey: string): Promise<{
|
|
5891
5892
|
updatedRuns: number;
|
|
@@ -5896,7 +5897,7 @@ type EvalRunner = {
|
|
|
5896
5897
|
}): Promise<RecalculateDerivedAttributesResult>;
|
|
5897
5898
|
/**
|
|
5898
5899
|
* Delete terminal persisted runs that touch one eval from memory and disk.
|
|
5899
|
-
* Accepts the exact eval key
|
|
5900
|
+
* Accepts the exact eval key.
|
|
5900
5901
|
*/
|
|
5901
5902
|
cleanRunsForEval(evalKey: string): Promise<{
|
|
5902
5903
|
deletedRuns: number;
|
|
@@ -5936,13 +5937,12 @@ type EvalRunner = {
|
|
|
5936
5937
|
validateManualInputs(request: CreateRunRequest$1): ManualInputValidationResult;
|
|
5937
5938
|
}; //#endregion
|
|
5938
5939
|
//#region src/runner.d.ts
|
|
5939
|
-
type CreateRunnerOptions = {
|
|
5940
|
-
watchForChanges?: boolean;
|
|
5941
|
-
};
|
|
5942
5940
|
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
5943
5941
|
declare function createRunner({
|
|
5944
5942
|
watchForChanges
|
|
5945
|
-
}?:
|
|
5943
|
+
}?: {
|
|
5944
|
+
watchForChanges?: boolean;
|
|
5945
|
+
}): EvalRunner; //#endregion
|
|
5946
5946
|
//#region src/manualInput/files.d.ts
|
|
5947
5947
|
type StageManualInputFileParams = {
|
|
5948
5948
|
workspaceRoot: string;
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync,
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Bk5g-bat.mjs";
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-t6OVp1li.mjs";
|
|
1
|
+
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-OVUFw1fL.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BeJCJMQo.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-D7_xKo7h.mjs";
|
|
4
4
|
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as
|
|
1
|
+
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-OVUFw1fL.mjs";
|
|
2
2
|
import { z } from "zod/v4";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -866,10 +866,6 @@ function buildCaseKey(params) {
|
|
|
866
866
|
encodeURIComponent(params.caseId)
|
|
867
867
|
].join("#");
|
|
868
868
|
}
|
|
869
|
-
/** Return the collision-safe eval key stored on a row, falling back for legacy data. */
|
|
870
|
-
function getCaseRowEvalKey(row) {
|
|
871
|
-
return row.evalKey ?? row.evalId;
|
|
872
|
-
}
|
|
873
869
|
/** Return the collision-safe case key stored on a row, falling back for legacy data. */
|
|
874
870
|
function getCaseRowCaseKey(row) {
|
|
875
871
|
return row.caseKey ?? row.caseId;
|
|
@@ -2056,12 +2052,11 @@ function computeTotalTokens({ input, output }) {
|
|
|
2056
2052
|
if (input === null && output === null) return null;
|
|
2057
2053
|
return (input ?? 0) + (output ?? 0);
|
|
2058
2054
|
}
|
|
2059
|
-
function computeTokensPerSecond({ outputTokens, durationMs
|
|
2055
|
+
function computeTokensPerSecond({ outputTokens, durationMs }) {
|
|
2060
2056
|
if (outputTokens === null || durationMs === null) return null;
|
|
2061
2057
|
if (outputTokens === 0) return 0;
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
return outputTokens / (generationMs / 1e3);
|
|
2058
|
+
if (durationMs <= 0) return null;
|
|
2059
|
+
return outputTokens / (durationMs / 1e3);
|
|
2065
2060
|
}
|
|
2066
2061
|
function readSteps(attributes, path) {
|
|
2067
2062
|
const raw = getNestedAttribute(attributes, path);
|
|
@@ -2094,7 +2089,8 @@ function pickError$1(span) {
|
|
|
2094
2089
|
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
2095
2090
|
* `string | null` / `number | null`. `latencyMs` is an explicit
|
|
2096
2091
|
* time-to-first-token attribute; full span elapsed time is reported separately
|
|
2097
|
-
* as `durationMs`.
|
|
2092
|
+
* as `durationMs`. `tokensPerSecond` is output tokens divided by that full
|
|
2093
|
+
* elapsed duration. Built-in USD costs are derived only from configured model
|
|
2098
2094
|
* pricing and token counts. `totalTokens` is always derived from input +
|
|
2099
2095
|
* output tokens. Cached input and cache creation tokens are reported
|
|
2100
2096
|
* separately because they are subsets of input/output usage. The main cache
|
|
@@ -2189,8 +2185,7 @@ function extractLlmCalls(spans, config) {
|
|
|
2189
2185
|
latencyMs,
|
|
2190
2186
|
tokensPerSecond: computeTokensPerSecond({
|
|
2191
2187
|
outputTokens,
|
|
2192
|
-
durationMs
|
|
2193
|
-
latencyMs
|
|
2188
|
+
durationMs
|
|
2194
2189
|
}),
|
|
2195
2190
|
costUsd,
|
|
2196
2191
|
inputCostUsd,
|
|
@@ -6587,9 +6582,9 @@ function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
|
|
|
6587
6582
|
return caseRow.status === "error" ? "error" : "pass";
|
|
6588
6583
|
}
|
|
6589
6584
|
function runTouchesEval(params) {
|
|
6590
|
-
if (params.caseRows.some((caseRow) =>
|
|
6585
|
+
if (params.caseRows.some((caseRow) => caseRow.evalKey === params.evalKey)) return true;
|
|
6591
6586
|
if (params.target.mode === "all") return params.evalExists;
|
|
6592
|
-
if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ??
|
|
6587
|
+
if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? false;
|
|
6593
6588
|
return false;
|
|
6594
6589
|
}
|
|
6595
6590
|
async function deleteTemporaryRuns(params) {
|
|
@@ -6620,13 +6615,12 @@ async function recomputeEvalStatusesInRuns(params) {
|
|
|
6620
6615
|
target: run.manifest.target,
|
|
6621
6616
|
caseRows: run.cases,
|
|
6622
6617
|
evalKey: params.evalKey,
|
|
6623
|
-
evalId: params.evalId,
|
|
6624
6618
|
evalExists: params.evalExists
|
|
6625
6619
|
})) continue;
|
|
6626
6620
|
if (run.manifest.status === "running") continue;
|
|
6627
6621
|
let changed = false;
|
|
6628
6622
|
for (const caseRow of run.cases) {
|
|
6629
|
-
if (
|
|
6623
|
+
if (caseRow.evalKey !== params.evalKey) continue;
|
|
6630
6624
|
const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
|
|
6631
6625
|
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
|
|
6632
6626
|
if (caseRow.status === nextStatus) continue;
|
|
@@ -6709,15 +6703,14 @@ function getLastRunStatuses(params) {
|
|
|
6709
6703
|
function getLatestRunInfos(params) {
|
|
6710
6704
|
const { runs, knownEvals } = params;
|
|
6711
6705
|
const knownEvalMetas = [...knownEvals];
|
|
6712
|
-
const evalIdByKey = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.id]));
|
|
6713
6706
|
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
6714
6707
|
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
6715
6708
|
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
6716
6709
|
for (const run of orderedRuns) for (const evalKey of getRunEvalKeys(run, knownEvalMetas)) latestRunInfos.set(evalKey, {
|
|
6717
|
-
status: getEvalStatusForRun(run, evalKey,
|
|
6710
|
+
status: getEvalStatusForRun(run, evalKey, manualScoreKeysByEval.get(evalKey) ?? []),
|
|
6718
6711
|
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
6719
6712
|
commitSha: run.manifest.commitSha ?? null,
|
|
6720
|
-
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ??
|
|
6713
|
+
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? null
|
|
6721
6714
|
});
|
|
6722
6715
|
return latestRunInfos;
|
|
6723
6716
|
}
|
|
@@ -6776,19 +6769,14 @@ async function readCaseDetails(runDir) {
|
|
|
6776
6769
|
}
|
|
6777
6770
|
function getRunEvalKeys(run, knownEvals) {
|
|
6778
6771
|
const knownEvalMetas = [...knownEvals];
|
|
6779
|
-
const evalKeys = new Set(
|
|
6780
|
-
for (const caseRow of run.cases)
|
|
6781
|
-
|
|
6782
|
-
|
|
6783
|
-
}
|
|
6784
|
-
if (run.manifest.target.mode === "evalIds") {
|
|
6785
|
-
for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
|
|
6786
|
-
for (const evalId of run.manifest.target.evalIds ?? []) for (const evalMeta of knownEvalMetas) if (evalMeta.id === evalId) evalKeys.add(evalMeta.key);
|
|
6787
|
-
} else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
|
|
6772
|
+
const evalKeys = /* @__PURE__ */ new Set();
|
|
6773
|
+
for (const caseRow of run.cases) if (caseRow.evalKey !== void 0) evalKeys.add(caseRow.evalKey);
|
|
6774
|
+
if (run.manifest.target.mode === "evalIds") for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
|
|
6775
|
+
else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
|
|
6788
6776
|
return [...evalKeys];
|
|
6789
6777
|
}
|
|
6790
|
-
function getEvalStatusForRun(run, evalKey,
|
|
6791
|
-
const evalCases = run.cases.filter((caseRow) =>
|
|
6778
|
+
function getEvalStatusForRun(run, evalKey, manualScoreKeys) {
|
|
6779
|
+
const evalCases = run.cases.filter((caseRow) => caseRow.evalKey === evalKey);
|
|
6792
6780
|
if (evalCases.length > 0) {
|
|
6793
6781
|
if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
|
|
6794
6782
|
return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
|
|
@@ -7810,4 +7798,4 @@ function toLastRunStatus(status) {
|
|
|
7810
7798
|
return status === "pending" ? null : status;
|
|
7811
7799
|
}
|
|
7812
7800
|
//#endregion
|
|
7813
|
-
export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F,
|
|
7801
|
+
export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, columnDefSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, defineEval as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, getEvalRegistry as Lt, z$1 as M, evalStatsConfigSchema as Mt, buildTraceTree as N, manualInputDescriptorSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, evalChartsConfigSchema as Pt, evalTime as Q, hashCacheKeySync as R, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, 
loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-Bk5g-bat.mjs";
|
|
2
|
-
import "./src-t6OVp1li.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-BeJCJMQo.mjs";
|
|
2
|
+
import "./src-D7_xKo7h.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-BG0L4yId.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B1KygirW.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import "./cli-Bk5g-bat.mjs";
|
|
1
|
+
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-OVUFw1fL.mjs";
|
|
2
|
+
import "./cli-BeJCJMQo.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -371,7 +371,7 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
371
371
|
summarized for review. Defaults to `kind: 'llm'` spans with `model`,
|
|
372
372
|
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
373
373
|
attribute paths. `latencyMs` is time to first token; duration, total tokens,
|
|
374
|
-
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
374
|
+
output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
375
375
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
376
376
|
model-keyed `pricing` to derive USD costs from token counts, with nested
|
|
377
377
|
`providers` entries for provider-specific rates, add `costCurrencies` to show
|
|
@@ -490,10 +490,10 @@ Mental model:
|
|
|
490
490
|
- Cache identity is the namespace plus the authored key. Source-file
|
|
491
491
|
fingerprints are tracked for run freshness separately, but do not participate
|
|
492
492
|
in cache-key hashing.
|
|
493
|
-
- Cached spans require an explicit `cache.namespace
|
|
494
|
-
|
|
495
|
-
namespaces share entries across operations/evals
|
|
496
|
-
key.
|
|
493
|
+
- Cached spans require an explicit `cache.namespace`. Value caches can also set
|
|
494
|
+
an explicit `namespace`; prefer doing that when the cache is part of a
|
|
495
|
+
documented workflow. Matching namespaces share entries across operations/evals
|
|
496
|
+
that use the same authored key.
|
|
497
497
|
- Per eval, `cache: { read?: boolean; store?: boolean }` controls whether
|
|
498
498
|
authored cached operations may read or persist entries. Both default to
|
|
499
499
|
`true`. Use `read: false` to always execute instead of replaying hits, and
|
|
@@ -536,10 +536,17 @@ Mental model:
|
|
|
536
536
|
|
|
537
537
|
## Artifacts
|
|
538
538
|
|
|
539
|
-
Run output lives under `.agent-evals/runs/<run-id
|
|
540
|
-
`.agent-evals/cache
|
|
541
|
-
|
|
542
|
-
|
|
539
|
+
Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
|
|
540
|
+
`.agent-evals/cache/`, grouped into runner-managed owner files. Do not rely on a
|
|
541
|
+
specific cache filename when authoring evals; configure cache namespaces
|
|
542
|
+
manually in eval code, then use `agent-evals cache list` or the UI Cache tab to
|
|
543
|
+
inspect the persisted namespace/key entries. Files in a run directory include
|
|
544
|
+
run metadata, a run summary, per-case results, and per-case trace JSON. Inspect
|
|
545
|
+
run files when debugging persisted output, costs, columns, traces, or failures;
|
|
546
|
+
inspect cache entries when debugging replayed span/value-cache results.
|
|
547
|
+
Targeted evals in `run.json` are recorded by exact `evalKeys`
|
|
548
|
+
(`filePath + evalId`) rather than authored eval ids, so duplicate eval ids stay
|
|
549
|
+
unambiguous in saved history.
|
|
543
550
|
Temporary runs use the same directory layout, but are removed before the next
|
|
544
551
|
run of any kind starts.
|
|
545
552
|
|