@ls-stack/agent-eval 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1887,7 +1887,6 @@ type EvalTraceTree = {
1887
1887
  /** Context passed to an eval's `execute` function for a single case run. */
1888
1888
  type EvalExecuteContext<TInput> = {
1889
1889
  input: TInput;
1890
- signal: AbortSignal;
1891
1890
  };
1892
1891
  /** Context passed to `deriveFromTracing` after execution has completed. */
1893
1892
  type EvalDeriveContext<TInput> = {
@@ -2383,8 +2382,8 @@ type EvalRunner = {
2383
2382
  manifest: RunManifest;
2384
2383
  summary: RunSummary;
2385
2384
  cases: CaseRow[];
2386
- } | undefined; /** Request cancellation for an in-flight run. */
2387
- cancelRun(id: string): void; /** Return full details for a single case in a run, when available. */
2385
+ } | undefined; /** Request cancellation for an in-flight run and persist its cancelled state. */
2386
+ cancelRun(id: string): Promise<void>; /** Return full details for a single case in a run, when available. */
2388
2387
  getCaseDetail(runId: string, caseId: string): CaseDetail | undefined; /** Subscribe to streamed events for a specific run. */
2389
2388
  subscribe(runId: string, listener: (event: SseEnvelope) => void): () => void; /** Subscribe to discovery updates triggered by file changes or manual refresh. */
2390
2389
  subscribeDiscovery(listener: (event: SseEnvelope) => void): () => void; /** Stop background filesystem watchers owned by this runner instance. */
package/dist/index.mjs CHANGED
@@ -1,3 +1,4 @@
1
- import { $ as columnDefSchema, A as evalStatsConfigSchema, B as evalChartTypeSchema, C as spanCacheOptionsSchema, Ct as setEvalOutput, D as evalFreshnessStatusSchema, Dt as getEvalRegistry, E as caseRowSchema, Et as defineEval, F as evalChartBuiltinMetricSchema, G as traceAttributeDisplaySchema, H as traceAttributeDisplayFormatSchema, I as evalChartColorSchema, J as traceSpanErrorSchema, K as traceDisplayConfigSchema, L as evalChartConfigSchema, M as scoreTraceSchema, N as evalChartAggregateSchema, O as evalStatAggregateSchema, P as evalChartAxisSchema, Q as cellValueSchema, R as evalChartMetricSchema, S as serializedCacheSpanSchema, St as runInEvalScope, T as caseDetailSchema, Tt as repoFile, U as traceAttributeDisplayInputSchema, V as evalChartsConfigSchema, W as traceAttributeDisplayPlacementSchema, X as traceSpanSchema, Y as traceSpanKindSchema, Z as traceSpanWarningSchema, _ as cacheListItemSchema, _t as getCurrentScope, a as sseEnvelopeSchema, at as repoFileRefSchema, b as cacheRecordingOpSchema, bt as isInEvalScope, c as deriveScopedSummaryFromCases, ct as buildTraceTree, d as runManifestSchema, dt as evalTracer, et as columnFormatSchema, f as runSummarySchema, ft as hashCacheKey, g as cacheFileSchema, gt as evalAssert, h as cacheEntrySchema, ht as appendToEvalOutput, i as updateManualScoreRequestSchema, it as numberDisplayOptionsSchema, j as evalSummarySchema, k as evalStatItemSchema, l as deriveStatusFromCaseRows, lt as captureEvalSpanError, m as trialSelectionModeSchema, mt as EvalAssertionError, n as createRunner, nt as fileRefSchema, o as getEvalTitle, ot as runArtifactRefSchema, p as agentEvalsConfigSchema, pt as hashCacheKeySync, q as traceDisplayInputConfigSchema, r as createRunRequestSchema, rt as jsonCellSchema, s as getEvalDisplayStatus, st as z, t as runCli, tt as columnKindSchema, u as deriveStatusFromChildStatuses, ut as evalSpan, v as cacheModeSchema, vt as getEvalCaseInput, w as assertionFailureSchema, wt as setScopeCacheContext, x as cacheRecordingSchema, xt as mergeEvalOutput, y as cacheOperationTypeSchema, yt as incrementEvalOutput, z as evalChartTooltipExtraSchema } from "./cli-C-n-Fd4o.mjs";
2
- import "./src-Dy31CPXH.mjs";
1
+ import { $ as evalChartAxisSchema, A as runManifestSchema, At as evalTracer, B as cacheRecordingSchema, Bt as mergeEvalOutput, C as updateManualScoreRequestSchema, Ct as numberDisplayOptionsSchema, D as deriveScopedSummaryFromCases, Dt as buildTraceTree, E as getEvalDisplayStatus, Et as z, F as cacheFileSchema, Ft as evalAssert, G as caseRowSchema, Gt as defineEval, H as spanCacheOptionsSchema, Ht as setEvalOutput, I as cacheListItemSchema, It as getCurrentScope, J as evalStatItemSchema, K as evalFreshnessStatusSchema, Kt as getEvalRegistry, L as cacheModeSchema, Lt as getEvalCaseInput, M as agentEvalsConfigSchema, Mt as hashCacheKeySync, N as trialSelectionModeSchema, Nt as EvalAssertionError, O as deriveStatusFromCaseRows, Ot as captureEvalSpanError, P as cacheEntrySchema, Pt as appendToEvalOutput, Q as evalChartAggregateSchema, R as cacheOperationTypeSchema, Rt as incrementEvalOutput, S as createRunRequestSchema, St as jsonCellSchema, T as getEvalTitle, Tt as runArtifactRefSchema, U as assertionFailureSchema, Ut as setScopeCacheContext, V as serializedCacheSpanSchema, Vt as runInEvalScope, W as caseDetailSchema, Wt as repoFile, X as evalSummarySchema, Y as evalStatsConfigSchema, Z as scoreTraceSchema, _t as cellValueSchema, at as evalChartTypeSchema, bt as columnKindSchema, ct as traceAttributeDisplayInputSchema, dt as traceDisplayConfigSchema, et as evalChartBuiltinMetricSchema, ft as traceDisplayInputConfigSchema, gt as traceSpanWarningSchema, ht as traceSpanSchema, it as evalChartTooltipExtraSchema, j as runSummarySchema, jt as hashCacheKey, k as deriveStatusFromChildStatuses, kt as evalSpan, lt as traceAttributeDisplayPlacementSchema, mt as traceSpanKindSchema, nt as evalChartConfigSchema, ot as evalChartsConfigSchema, pt as traceSpanErrorSchema, q as evalStatAggregateSchema, rt as evalChartMetricSchema, st as traceAttributeDisplayFormatSchema, tt as evalChartColorSchema, ut as traceAttributeDisplaySchema, vt as columnDefSchema, w as sseEnvelopeSchema, wt as repoFileRefSchema, xt as fileRefSchema, yt as columnFormatSchema, z as cacheRecordingOpSchema, zt as isInEvalScope } from "./runOrchestration-HaMahl6b.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-BTtgQLjB.mjs";
3
+ import "./src-Bt5Fz9HS.mjs";
3
4
  export { EvalAssertionError, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
@@ -0,0 +1 @@
1
+ export { };
@@ -0,0 +1,107 @@
1
+ import { A as runManifestSchema, S as createRunRequestSchema, Y as evalStatsConfigSchema, j as runSummarySchema, ot as evalChartsConfigSchema, t as executeRun, v as loadConfig, vt as columnDefSchema, x as createFsCacheStore } from "./runOrchestration-HaMahl6b.mjs";
2
+ import { createHash } from "node:crypto";
3
+ import { readFile } from "node:fs/promises";
4
+ import { z } from "zod/v4";
5
+ //#region ../runner/src/runChild.ts
6
+ const evalMetaSchema = z.object({
7
+ id: z.string(),
8
+ title: z.string().optional(),
9
+ filePath: z.string(),
10
+ sourceFilePath: z.string(),
11
+ sourceFingerprint: z.string().nullable(),
12
+ columnDefs: z.array(columnDefSchema),
13
+ caseCount: z.number().nullable(),
14
+ stats: evalStatsConfigSchema.optional(),
15
+ charts: evalChartsConfigSchema.optional()
16
+ });
17
+ const runChildContextSchema = z.object({
18
+ request: createRunRequestSchema,
19
+ workspaceRoot: z.string(),
20
+ runDir: z.string(),
21
+ manifest: runManifestSchema,
22
+ summary: runSummarySchema,
23
+ evals: z.array(evalMetaSchema)
24
+ });
25
+ function sendMessage(message) {
26
+ if (process.send === void 0) return;
27
+ process.send(message);
28
+ }
29
+ function getSourceFingerprint(source) {
30
+ return createHash("sha256").update(source).digest("hex");
31
+ }
32
+ function getConfiguredConcurrency(configConcurrency) {
33
+ if (typeof configConcurrency !== "number" || !Number.isFinite(configConcurrency)) return 1;
34
+ return Math.max(1, Math.floor(configConcurrency));
35
+ }
36
+ function getTargetEvals(params) {
37
+ if (params.request.target.evalIds && params.request.target.evalIds.length > 0) return params.request.target.evalIds.map((id) => params.evals.get(id)).filter((entry) => entry !== void 0);
38
+ return [...params.evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
39
+ }
40
+ async function readContext(contextPath) {
41
+ if (contextPath === void 0) throw new Error("Missing run child context path");
42
+ return runChildContextSchema.parse(JSON.parse(await readFile(contextPath, "utf-8")));
43
+ }
44
+ async function main() {
45
+ process.on("disconnect", () => {
46
+ process.exit(1);
47
+ });
48
+ const context = await readContext(process.argv[2]);
49
+ process.chdir(context.workspaceRoot);
50
+ const config = await loadConfig();
51
+ const cacheStore = createFsCacheStore({
52
+ workspaceRoot: context.workspaceRoot,
53
+ dir: config.cache?.dir,
54
+ maxEntriesPerEval: config.cache?.maxEntriesPerEval
55
+ });
56
+ const evals = new Map(context.evals.map((evalMeta) => [evalMeta.id, evalMeta]));
57
+ const lastRunStatusMap = /* @__PURE__ */ new Map();
58
+ const latestRunInfoMap = /* @__PURE__ */ new Map();
59
+ await executeRun({
60
+ runState: {
61
+ runDir: context.runDir,
62
+ manifest: context.manifest,
63
+ summary: context.summary,
64
+ cases: [],
65
+ caseDetails: /* @__PURE__ */ new Map(),
66
+ listeners: /* @__PURE__ */ new Set()
67
+ },
68
+ request: context.request,
69
+ runDir: context.runDir,
70
+ config,
71
+ evals,
72
+ cacheStore,
73
+ lastRunStatusMap,
74
+ latestRunInfoMap,
75
+ emitEvent(_runState, event) {
76
+ if (event.type === "case.finished") return;
77
+ sendMessage({
78
+ type: "event",
79
+ event
80
+ });
81
+ },
82
+ emitDiscoveryEvent() {},
83
+ workspaceRoot: context.workspaceRoot,
84
+ getSourceFingerprint,
85
+ getConfiguredConcurrency: () => getConfiguredConcurrency(config.concurrency),
86
+ getSortedEvalMetas: () => [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath)),
87
+ getTargetEvals: (request) => getTargetEvals({
88
+ evals,
89
+ request
90
+ }),
91
+ onCaseFinished(caseDetail, caseRow) {
92
+ sendMessage({
93
+ type: "case.finished",
94
+ caseDetail,
95
+ caseRow
96
+ });
97
+ }
98
+ });
99
+ sendMessage({
100
+ type: "done",
101
+ evals: [...evals.values()]
102
+ });
103
+ }
104
+ await main();
105
+ process.disconnect();
106
+ //#endregion
107
+ export {};