@ls-stack/agent-eval 0.58.2 → 0.58.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BxD6aHbp.mjs → app-sGeXC4AT.mjs} +63 -11
- package/dist/apps/web/dist/assets/index-BXFsxHVc.js +377 -0
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +4 -3
- package/dist/{cli-HBwXIJsg.mjs → cli-Bf5RzM8O.mjs} +7 -6
- package/dist/index.d.mts +129 -123
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +5 -4
- package/dist/{runExecution-pHJ0_TzH.mjs → runExecution-CLkC-4Z1.mjs} +40 -19
- package/dist/{runOrchestration-ngVXShH4.mjs → runOrchestration-BS-WxTee.mjs} +1 -1
- package/dist/{runner-BnZMGBla.mjs → runner-Bz5ZPqmm.mjs} +1 -1
- package/dist/{runner-D_pz2NON.mjs → runner-DW-11txl.mjs} +2 -2
- package/dist/{src-AeXGBJ26.mjs → src-BjMMDm_O.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +7 -6
- package/dist/apps/web/dist/assets/index-BMWBZw_u.js +0 -377
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-HBwXIJsg.mjs";
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-AeXGBJ26.mjs";
|
|
1
|
+
import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-CLkC-4Z1.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Bf5RzM8O.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-BjMMDm_O.mjs";
|
|
4
4
|
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-ngVXShH4.mjs";
|
|
1
|
+
import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-CLkC-4Z1.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BS-WxTee.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -137,11 +137,12 @@ async function main() {
|
|
|
137
137
|
registerAgentEvalsPackageResolutionHooks();
|
|
138
138
|
const config = await loadConfig();
|
|
139
139
|
configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false });
|
|
140
|
+
const cacheRetentionOptions = getCacheRetentionOptions(config.cache);
|
|
140
141
|
const cacheStore = createFsCacheStore({
|
|
141
142
|
workspaceRoot: context.workspaceRoot,
|
|
142
143
|
dir: config.cache?.dir,
|
|
143
|
-
maxEntriesPerNamespace:
|
|
144
|
-
maxEntriesByNamespace:
|
|
144
|
+
maxEntriesPerNamespace: cacheRetentionOptions.maxEntriesPerNamespace,
|
|
145
|
+
maxEntriesByNamespace: cacheRetentionOptions.maxEntriesByNamespace,
|
|
145
146
|
lastAccessedAtUpdateIntervalMs: config.cache?.lastAccessedAtUpdateIntervalMs
|
|
146
147
|
});
|
|
147
148
|
const evalMetas = await discoverRunEvals({
|
|
@@ -1009,9 +1009,8 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
|
|
|
1009
1009
|
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
1010
1010
|
/** Schema for keyed or object-returning trace-derived output config. */
|
|
1011
1011
|
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
const evalTracingAssertionsConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" }), z.record(z.string().min(1), evalTracingAssertionsFnSchema)]);
|
|
1012
|
+
/** Schema for trace-derived assertion config. */
|
|
1013
|
+
const evalTracingAssertionsConfigSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" });
|
|
1015
1014
|
/** Schema for UI overrides on derived or scored columns. */
|
|
1016
1015
|
const evalColumnOverrideSchema = z.object({
|
|
1017
1016
|
label: z.string().optional(),
|
|
@@ -1402,6 +1401,10 @@ function resolveApiCallsConfig(input) {
|
|
|
1402
1401
|
metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
|
|
1403
1402
|
};
|
|
1404
1403
|
}
|
|
1404
|
+
const cacheMaxEntriesSchema = z.union([z.number(), z.object({
|
|
1405
|
+
default: z.number().optional(),
|
|
1406
|
+
namespaces: z.record(z.string(), z.number()).optional()
|
|
1407
|
+
})]).optional();
|
|
1405
1408
|
/** Zod schema for validating `agent-evals.config.ts` input. */
|
|
1406
1409
|
const agentEvalsConfigSchema = z.object({
|
|
1407
1410
|
workspaceRoot: z.string().optional(),
|
|
@@ -1425,11 +1428,26 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1425
1428
|
cache: z.object({
|
|
1426
1429
|
enabled: z.boolean().optional(),
|
|
1427
1430
|
dir: z.string().optional(),
|
|
1428
|
-
|
|
1431
|
+
maxEntries: cacheMaxEntriesSchema,
|
|
1432
|
+
maxEntriesPerNamespace: z.number().optional(),
|
|
1429
1433
|
maxEntriesByNamespace: z.record(z.string(), z.number()).optional(),
|
|
1430
1434
|
pruneIdleDelayMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1431
1435
|
lastAccessedAtUpdateIntervalMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1432
|
-
maxEntriesPerEval: z.
|
|
1436
|
+
maxEntriesPerEval: z.number().optional()
|
|
1437
|
+
}).transform(({ maxEntries, maxEntriesByNamespace, maxEntriesPerEval, maxEntriesPerNamespace, ...cache }) => {
|
|
1438
|
+
const defaultMaxEntries = maxEntriesPerNamespace ?? maxEntriesPerEval;
|
|
1439
|
+
if (maxEntries !== void 0) return {
|
|
1440
|
+
...cache,
|
|
1441
|
+
maxEntries
|
|
1442
|
+
};
|
|
1443
|
+
if (defaultMaxEntries !== void 0 || maxEntriesByNamespace !== void 0) return {
|
|
1444
|
+
...cache,
|
|
1445
|
+
maxEntries: {
|
|
1446
|
+
default: defaultMaxEntries,
|
|
1447
|
+
namespaces: maxEntriesByNamespace
|
|
1448
|
+
}
|
|
1449
|
+
};
|
|
1450
|
+
return cache;
|
|
1433
1451
|
}).optional()
|
|
1434
1452
|
});
|
|
1435
1453
|
//#endregion
|
|
@@ -5079,6 +5097,19 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5079
5097
|
};
|
|
5080
5098
|
}
|
|
5081
5099
|
//#endregion
|
|
5100
|
+
//#region ../runner/src/cacheConfig.ts
|
|
5101
|
+
function getCacheRetentionOptions(cacheConfig) {
|
|
5102
|
+
const maxEntries = cacheConfig?.maxEntries;
|
|
5103
|
+
if (typeof maxEntries === "number") return {
|
|
5104
|
+
maxEntriesPerNamespace: maxEntries,
|
|
5105
|
+
maxEntriesByNamespace: void 0
|
|
5106
|
+
};
|
|
5107
|
+
return {
|
|
5108
|
+
maxEntriesPerNamespace: maxEntries?.default,
|
|
5109
|
+
maxEntriesByNamespace: maxEntries?.namespaces
|
|
5110
|
+
};
|
|
5111
|
+
}
|
|
5112
|
+
//#endregion
|
|
5082
5113
|
//#region ../runner/src/cacheAccessTime.ts
|
|
5083
5114
|
const defaultLastAccessedAtUpdateIntervalMs = 14400 * 1e3;
|
|
5084
5115
|
function normalizeLastAccessedAtUpdateIntervalMs(value) {
|
|
@@ -6754,19 +6785,9 @@ async function runOneTracingAssertion(params) {
|
|
|
6754
6785
|
}
|
|
6755
6786
|
async function runTracingAssertionsConfig(params) {
|
|
6756
6787
|
if (params.tracingAssertions === void 0) return;
|
|
6757
|
-
|
|
6758
|
-
|
|
6759
|
-
|
|
6760
|
-
tracingAssertion: params.tracingAssertions,
|
|
6761
|
-
scope: params.scope,
|
|
6762
|
-
traceTree: params.traceTree,
|
|
6763
|
-
evalCase: params.evalCase
|
|
6764
|
-
});
|
|
6765
|
-
return;
|
|
6766
|
-
}
|
|
6767
|
-
for (const [key, tracingAssertion] of Object.entries(params.tracingAssertions)) await runOneTracingAssertion({
|
|
6768
|
-
label: `tracingAssertions "${key}"`,
|
|
6769
|
-
tracingAssertion,
|
|
6788
|
+
await runOneTracingAssertion({
|
|
6789
|
+
label: "tracingAssertions",
|
|
6790
|
+
tracingAssertion: params.tracingAssertions,
|
|
6770
6791
|
scope: params.scope,
|
|
6771
6792
|
traceTree: params.traceTree,
|
|
6772
6793
|
evalCase: params.evalCase
|
|
@@ -7037,4 +7058,4 @@ function recordAssertionFailure(scope, failure) {
|
|
|
7037
7058
|
});
|
|
7038
7059
|
}
|
|
7039
7060
|
//#endregion
|
|
7040
|
-
export {
|
|
7061
|
+
export { setEvalOutput as $, serializeCacheValue as A, evalStatAggregateSchema as At, evalLog as B, evalSpan as C, resolveApiCallsConfig as Ct, deserializeCacheRecording as D, getCaseRowCaseKey as Dt, hashCacheKeySync as E, buildEvalKey as Et, EvalAssertionError as F, defineEval as Ft, isInEvalScope as G, getCurrentScope as H, EvalRuntimeUsageError as I, getEvalRegistry as It, nextEvalId as J, matchesEvalTags as K, appendToEvalOutput as L, runWithEvalRegistry as Lt, manualInputFileValueSchema as M, manualInputDescriptorSchema as Mt, readManualInputFile as N, evalChartsConfigSchema as Nt, deserializeCacheValue as O, caseDetailSchema as Ot, evalExpect as P, columnDefSchema as Pt, runWithEvalClock as Q, configureEvalRunLogs as R, captureEvalSpanError as S, runSummarySchema as St, hashCacheKey as T, buildCaseKey as Tt, getEvalCaseInput as U, evalTime as V, incrementEvalOutput as W, runInEvalScope as X, runInEvalRuntimeScope as Y, runInExistingEvalScope as Z, createBufferedCacheStore as _, dedupeEvalTags as _t, isCaseChildParentMessage as a, extractCacheHits as at, z$1 as b, validateTagsFilterExpression as bt, resolveArtifactPath as c, simulateLlmCallCost as ct, loadEvalModule as d, getNestedAttribute as dt, setScopeCacheContext as et, resolveEvalDefaultConfig as f, getEvalTitle as ft, commitPendingCacheWrites as g, deriveStatusFromChildStatuses as gt, normalizeScoreDef as h, deriveStatusFromCaseRows as ht, isCaseChildMessage as i, extractCacheEntries as it, repoFile as j, evalStatsConfigSchema as jt, serializeCacheRecording as k, caseRowSchema as kt, registerAgentEvalsPackageResolutionHooks as l, simulateTokenAllocation as lt, buildDeclaredColumnDefs as m, deriveScopedSummaryFromCases as mt, resolveRunnableEvalCases as n, createRunRequestSchema as nt, stripTerminalControlCodes as o, extractApiCalls as ot, loadConfig as p, getEvalDisplayStatus as pt, mergeEvalOutput as q, runCase as r, updateManualScoreRequestSchema as rt, resolveTracePresentation as s, extractLlmCalls 
as st, filterEvalCases as t, startEvalBackgroundJob as tt, runWithModuleIsolation as u, applyDerivedCallAttributes as ut, createFsCacheStore as v, matchesTagsFilter as vt, evalTracer as w, resolveLlmCallsConfig as wt, buildTraceTree as x, runManifestSchema as xt, getCacheRetentionOptions as y, validateEvalTagName as yt, evalAssert as z };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Dt as
|
|
1
|
+
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-CLkC-4Z1.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-D_pz2NON.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-DW-11txl.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-HBwXIJsg.mjs";
|
|
2
|
-
import "./src-AeXGBJ26.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-Bf5RzM8O.mjs";
|
|
2
|
+
import "./src-BjMMDm_O.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import "./cli-HBwXIJsg.mjs";
|
|
1
|
+
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-CLkC-4Z1.mjs";
|
|
2
|
+
import "./cli-Bf5RzM8O.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -366,10 +366,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
366
366
|
form, return `undefined` to omit one output for that case. Do not call
|
|
367
367
|
`evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
|
|
368
368
|
`tracingAssertions` for trace-derived pass/fail checks.
|
|
369
|
-
- `tracingAssertions` can be authored globally or
|
|
370
|
-
finished-trace invariant should pass or fail the
|
|
371
|
-
score column. It receives the same
|
|
372
|
-
`
|
|
369
|
+
- `tracingAssertions` is a single function that can be authored globally or
|
|
370
|
+
locally on one eval when a finished-trace invariant should pass or fail the
|
|
371
|
+
case without creating a fake score column. It receives the same
|
|
372
|
+
`{ trace, input, case }` context as `deriveFromTracing`; call
|
|
373
|
+
`evalAssert(...)` or `evalExpect(...)` inside it.
|
|
373
374
|
Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
|
|
374
375
|
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
375
376
|
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|
|
@@ -545,8 +546,8 @@ Mental model:
|
|
|
545
546
|
JSON blob refs. Each namespace is capped at 100 entries by default. The runner
|
|
546
547
|
prunes least recently accessed indexed entries after a run finishes and the
|
|
547
548
|
runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
|
|
548
|
-
`cache.
|
|
549
|
-
`
|
|
549
|
+
`cache.maxEntries` as a number for the default cap, or as
|
|
550
|
+
`{ default, namespaces }` for exact namespace-specific caps.
|
|
550
551
|
- Unindexed legacy cache files are ignored by normal lookup/listing. Use
|
|
551
552
|
`agent-evals cache repair` to remove unindexed cache files, stale index rows,
|
|
552
553
|
debug sidecars, and unreferenced blob files.
|