@ls-stack/agent-eval 0.60.1 → 0.60.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CxKEVlng.mjs → app-l3ynaNsb.mjs} +4 -4
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-CVBSlTD8.mjs → cli-BSVUCUxr.mjs} +34 -8
- package/dist/index.d.mts +110 -110
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-CjWJUUZ5.mjs → runExecution-Bq0Y3y_1.mjs} +2 -2
- package/dist/{runOrchestration-DE2TFAS6.mjs → runOrchestration-C7qQISz2.mjs} +41 -8
- package/dist/{runner-Cu1CQPTB.mjs → runner-C9xNJHt3.mjs} +1 -1
- package/dist/{runner-DzDRasWV.mjs → runner-DmkSq-QG.mjs} +2 -2
- package/dist/{src-DjOTPnDz.mjs → src-D5vGo2iv.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +2 -0
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, z as evalLog } from "./runExecution-CjWJUUZ5.mjs";
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-CVBSlTD8.mjs";
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-DjOTPnDz.mjs";
|
|
1
|
+
import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, z as evalLog } from "./runExecution-Bq0Y3y_1.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BSVUCUxr.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-D5vGo2iv.mjs";
|
|
4
4
|
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
|
|
2
|
-
import {
|
|
1
|
+
import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
|
|
2
|
+
import { C as parseEvalDiscovery, h as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C7qQISz2.mjs";
|
|
3
3
|
import { z } from "zod";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -289,7 +289,7 @@ z.object({
|
|
|
289
289
|
key: z.string(),
|
|
290
290
|
namespace: z.string(),
|
|
291
291
|
storedAt: z.string(),
|
|
292
|
-
/** Last successful cache
|
|
292
|
+
/** Last successful cache read or write time. Legacy entries may be `null`. */
|
|
293
293
|
lastAccessedAt: z.string().nullable()
|
|
294
294
|
});
|
|
295
295
|
z.object({
|
|
@@ -5395,7 +5395,7 @@ function createFsCacheStore(options) {
|
|
|
5395
5395
|
const index = await readNamespaceIndex(cacheDir, entry.namespace);
|
|
5396
5396
|
index.entries[entry.key] = {
|
|
5397
5397
|
storedAt: entry.storedAt,
|
|
5398
|
-
lastAccessedAt:
|
|
5398
|
+
lastAccessedAt: entry.storedAt,
|
|
5399
5399
|
blobRefs: await collectExternalJsonBlobRefs(entry, blobDirs)
|
|
5400
5400
|
};
|
|
5401
5401
|
await writeNamespaceIndex(cacheDir, index);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-CjWJUUZ5.mjs";
|
|
1
|
+
import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-Bq0Y3y_1.mjs";
|
|
2
2
|
import { Result, resultify } from "t-result";
|
|
3
3
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join } from "node:path";
|
|
5
|
-
import { existsSync } from "node:fs";
|
|
5
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { spawn } from "node:child_process";
|
|
8
8
|
//#region ../runner/src/chartValidation.ts
|
|
@@ -670,7 +670,7 @@ async function recomputeEvalStatusesInRuns(params) {
|
|
|
670
670
|
let changed = false;
|
|
671
671
|
for (const caseRow of run.cases) {
|
|
672
672
|
if (caseRow.evalKey !== params.evalKey) continue;
|
|
673
|
-
const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
|
|
673
|
+
const caseDetail = params.getCaseDetail?.(run, caseRow) ?? run.caseDetails.get(getCaseRowCaseKey(caseRow));
|
|
674
674
|
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
|
|
675
675
|
if (caseRow.status === nextStatus) continue;
|
|
676
676
|
caseRow.status = nextStatus;
|
|
@@ -725,14 +725,22 @@ function nextShortIdFromSnapshots(snapshots) {
|
|
|
725
725
|
}
|
|
726
726
|
return maxNum + 1;
|
|
727
727
|
}
|
|
728
|
-
|
|
728
|
+
/**
|
|
729
|
+
* Load persisted run metadata from the local state directory.
|
|
730
|
+
*
|
|
731
|
+
* Case details are skipped by default so long-running app processes can keep
|
|
732
|
+
* run history in memory without retaining every trace payload. Pass
|
|
733
|
+
* `includeCaseDetails` only for narrow maintenance flows that need full
|
|
734
|
+
* details for every case.
|
|
735
|
+
*/
|
|
736
|
+
async function loadPersistedRunSnapshots(localStateDir, options = {}) {
|
|
729
737
|
const runsDir = join(localStateDir, "runs");
|
|
730
738
|
const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
|
|
731
739
|
if (entriesResult.error) return [];
|
|
732
740
|
const snapshots = [];
|
|
733
741
|
const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
|
|
734
742
|
for (const runDir of runDirs) {
|
|
735
|
-
const snapshot = await loadPersistedRunSnapshot(runDir);
|
|
743
|
+
const snapshot = await loadPersistedRunSnapshot(runDir, options);
|
|
736
744
|
if (!snapshot) continue;
|
|
737
745
|
snapshots.push(snapshot);
|
|
738
746
|
}
|
|
@@ -766,7 +774,14 @@ function getLatestRunInfos(params) {
|
|
|
766
774
|
function toLastRunStatus$1(status) {
|
|
767
775
|
return status === "pending" ? null : status;
|
|
768
776
|
}
|
|
769
|
-
|
|
777
|
+
/**
|
|
778
|
+
* Load one persisted run snapshot from disk.
|
|
779
|
+
*
|
|
780
|
+
* The returned snapshot includes manifest, summary, and case rows. Case
|
|
781
|
+
* details are loaded only when `includeCaseDetails` is true; otherwise callers
|
|
782
|
+
* should use `loadPersistedCaseDetail` for the specific case being inspected.
|
|
783
|
+
*/
|
|
784
|
+
async function loadPersistedRunSnapshot(runDir, options = {}) {
|
|
770
785
|
const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
|
|
771
786
|
if (!manifest) return null;
|
|
772
787
|
const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
|
|
@@ -776,9 +791,18 @@ async function loadPersistedRunSnapshot(runDir) {
|
|
|
776
791
|
manifest,
|
|
777
792
|
summary,
|
|
778
793
|
cases: await readCaseRows(runDir),
|
|
779
|
-
caseDetails: await readCaseDetails(runDir)
|
|
794
|
+
caseDetails: options.includeCaseDetails === true ? await readCaseDetails(runDir) : /* @__PURE__ */ new Map()
|
|
780
795
|
};
|
|
781
796
|
}
|
|
797
|
+
/**
|
|
798
|
+
* Load one persisted case detail by its artifact file id.
|
|
799
|
+
*
|
|
800
|
+
* Returns `null` when the file is missing, invalid JSON, or no longer matches
|
|
801
|
+
* the current case-detail schema.
|
|
802
|
+
*/
|
|
803
|
+
function loadPersistedCaseDetail(runDir, fileId) {
|
|
804
|
+
return readParsedJsonFileSync(join(runDir, "case-details", `${encodeCaseDetailFileName(fileId)}.json`), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
|
|
805
|
+
}
|
|
782
806
|
async function readParsedJsonFile(filePath, schema) {
|
|
783
807
|
const fileResult = await resultify(() => readFile(filePath, "utf-8"));
|
|
784
808
|
if (fileResult.error) return null;
|
|
@@ -788,6 +812,15 @@ async function readParsedJsonFile(filePath, schema) {
|
|
|
788
812
|
if (!parsed.success) return null;
|
|
789
813
|
return parsed.data;
|
|
790
814
|
}
|
|
815
|
+
function readParsedJsonFileSync(filePath, schema) {
|
|
816
|
+
const fileResult = resultify(() => readFileSync(filePath, "utf-8"));
|
|
817
|
+
if (fileResult.error) return null;
|
|
818
|
+
const jsonResult = resultify(() => JSON.parse(fileResult.value));
|
|
819
|
+
if (jsonResult.error) return null;
|
|
820
|
+
const parsed = schema.safeParse(jsonResult.value);
|
|
821
|
+
if (!parsed.success) return null;
|
|
822
|
+
return parsed.data;
|
|
823
|
+
}
|
|
791
824
|
async function readCaseRows(runDir) {
|
|
792
825
|
const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
|
|
793
826
|
if (fileResult.error) return [];
|
|
@@ -1660,4 +1693,4 @@ function toLastRunStatus(status) {
|
|
|
1660
1693
|
return status === "pending" ? null : status;
|
|
1661
1694
|
}
|
|
1662
1695
|
//#endregion
|
|
1663
|
-
export {
|
|
1696
|
+
export { parseEvalDiscovery as C, loadIsolatedEvalRegistry as S, recomputePersistedCaseStatus as _, validateTagsFilters as a, parseManualInputValues as b, getLatestRunInfos as c, loadPersistedRunSnapshots as d, nextShortIdFromSnapshots as f, recomputeEvalStatusesInRuns as g, persistRunState as h, resolveEvalTags as i, loadPersistedCaseDetail as l, deleteTemporaryRuns as m, getTargetEvalKeys as n, generateRunId as o, persistCaseDetail as p, getTargetEvals as r, getLastRunStatuses as s, executeRun as t, loadPersistedRunSnapshot as u, runTouchesEval as v, validateCharts as w, deriveEvalFreshness as x, buildManualInputDescriptor as y };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-DzDRasWV.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-DmkSq-QG.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-CVBSlTD8.mjs";
|
|
2
|
-
import "./src-DjOTPnDz.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-BSVUCUxr.mjs";
|
|
2
|
+
import "./src-D5vGo2iv.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance({ loadEnv = true } = {}) {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-CjWJUUZ5.mjs";
|
|
2
|
-
import "./cli-CVBSlTD8.mjs";
|
|
1
|
+
import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-Bq0Y3y_1.mjs";
|
|
2
|
+
import "./cli-BSVUCUxr.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.60.1",
|
|
3
|
+
"version": "0.60.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -32,9 +32,9 @@
|
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
|
-
"@agent-evals/
|
|
35
|
+
"@agent-evals/runner": "0.0.1",
|
|
36
36
|
"@agent-evals/shared": "0.0.1",
|
|
37
|
-
"@agent-evals/
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -568,6 +568,8 @@ Mental model:
|
|
|
568
568
|
runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
|
|
569
569
|
`cache.maxEntries` as a number for the default cap, or as
|
|
570
570
|
`{ default, namespaces }` for exact namespace-specific caps.
|
|
571
|
+
Writes initialize the row's last access time to the stored time; later cache
|
|
572
|
+
hits refresh that timestamp at the configured access-time update interval.
|
|
571
573
|
- Unindexed legacy cache files are ignored by normal lookup/listing. Use
|
|
572
574
|
`agent-evals cache repair` to remove unindexed cache files, stale index rows,
|
|
573
575
|
debug sidecars, and unreferenced blob files.
|