@ls-stack/agent-eval 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,3 +1,4 @@
1
- import { $ as columnKindSchema, A as evalSummarySchema, B as evalChartsConfigSchema, C as assertionFailureSchema, D as evalStatAggregateSchema, E as evalFreshnessStatusSchema, F as evalChartColorSchema, G as traceDisplayConfigSchema, H as traceAttributeDisplayInputSchema, I as evalChartConfigSchema, J as traceSpanKindSchema, K as traceDisplayInputConfigSchema, L as evalChartMetricSchema, M as evalChartAggregateSchema, N as evalChartAxisSchema, O as evalStatItemSchema, P as evalChartBuiltinMetricSchema, Q as columnFormatSchema, R as evalChartTooltipExtraSchema, S as spanCacheOptionsSchema, T as caseRowSchema, U as traceAttributeDisplayPlacementSchema, V as traceAttributeDisplayFormatSchema, W as traceAttributeDisplaySchema, X as cellValueSchema, Y as traceSpanSchema, Z as columnDefSchema, _ as cacheListItemSchema, _t as setEvalOutput, a as sseEnvelopeSchema, at as buildTraceTree, b as cacheRecordingSchema, bt as defineEval, c as deriveScopedSummaryFromCases, ct as evalTracer, d as runManifestSchema, dt as EvalAssertionError, et as fileRefSchema, f as runSummarySchema, ft as evalAssert, g as cacheFileSchema, gt as runInEvalScope, h as cacheEntrySchema, ht as isInEvalScope, i as updateManualScoreRequestSchema, it as runArtifactRefSchema, j as scoreTraceSchema, k as evalStatsConfigSchema, l as deriveStatusFromCaseRows, lt as hashCacheKey, m as trialSelectionModeSchema, mt as incrementEvalOutput, n as createRunner, nt as numberDisplayOptionsSchema, o as getEvalTitle, ot as captureEvalSpanError, p as agentEvalsConfigSchema, pt as getCurrentScope, q as traceSpanErrorSchema, r as createRunRequestSchema, rt as repoFileRefSchema, s as getEvalDisplayStatus, st as evalSpan, t as runCli, tt as jsonCellSchema, u as deriveStatusFromChildStatuses, ut as hashCacheKeySync, v as cacheModeSchema, vt as setScopeCacheContext, w as caseDetailSchema, x as serializedCacheSpanSchema, xt as getEvalRegistry, y as cacheRecordingOpSchema, yt as repoFile, z as evalChartTypeSchema } from 
"./cli-C5FL7C4G.mjs";
2
- import "./src-gqm1z1Nu.mjs";
3
- export { EvalAssertionError, agentEvalsConfigSchema, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheRecordingOpSchema, cacheRecordingSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, fileRefSchema, getCurrentScope, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, trialSelectionModeSchema, updateManualScoreRequestSchema };
1
+ import { $ as evalChartAxisSchema, A as runManifestSchema, At as evalTracer, B as cacheRecordingSchema, Bt as mergeEvalOutput, C as updateManualScoreRequestSchema, Ct as numberDisplayOptionsSchema, D as deriveScopedSummaryFromCases, Dt as buildTraceTree, E as getEvalDisplayStatus, Et as z, F as cacheFileSchema, Ft as evalAssert, G as caseRowSchema, Gt as defineEval, H as spanCacheOptionsSchema, Ht as setEvalOutput, I as cacheListItemSchema, It as getCurrentScope, J as evalStatItemSchema, K as evalFreshnessStatusSchema, Kt as getEvalRegistry, L as cacheModeSchema, Lt as getEvalCaseInput, M as agentEvalsConfigSchema, Mt as hashCacheKeySync, N as trialSelectionModeSchema, Nt as EvalAssertionError, O as deriveStatusFromCaseRows, Ot as captureEvalSpanError, P as cacheEntrySchema, Pt as appendToEvalOutput, Q as evalChartAggregateSchema, R as cacheOperationTypeSchema, Rt as incrementEvalOutput, S as createRunRequestSchema, St as jsonCellSchema, T as getEvalTitle, Tt as runArtifactRefSchema, U as assertionFailureSchema, Ut as setScopeCacheContext, V as serializedCacheSpanSchema, Vt as runInEvalScope, W as caseDetailSchema, Wt as repoFile, X as evalSummarySchema, Y as evalStatsConfigSchema, Z as scoreTraceSchema, _t as cellValueSchema, at as evalChartTypeSchema, bt as columnKindSchema, ct as traceAttributeDisplayInputSchema, dt as traceDisplayConfigSchema, et as evalChartBuiltinMetricSchema, ft as traceDisplayInputConfigSchema, gt as traceSpanWarningSchema, ht as traceSpanSchema, it as evalChartTooltipExtraSchema, j as runSummarySchema, jt as hashCacheKey, k as deriveStatusFromChildStatuses, kt as evalSpan, lt as traceAttributeDisplayPlacementSchema, mt as traceSpanKindSchema, nt as evalChartConfigSchema, ot as evalChartsConfigSchema, pt as traceSpanErrorSchema, q as evalStatAggregateSchema, rt as evalChartMetricSchema, st as traceAttributeDisplayFormatSchema, tt as evalChartColorSchema, ut as traceAttributeDisplaySchema, vt as columnDefSchema, w as sseEnvelopeSchema, wt 
as repoFileRefSchema, xt as fileRefSchema, yt as columnFormatSchema, z as cacheRecordingOpSchema, zt as isInEvalScope } from "./runOrchestration-HaMahl6b.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-BTtgQLjB.mjs";
3
+ import "./src-Bt5Fz9HS.mjs";
4
+ export { EvalAssertionError, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
@@ -0,0 +1 @@
1
+ export { };
@@ -0,0 +1,107 @@
1
+ import { A as runManifestSchema, S as createRunRequestSchema, Y as evalStatsConfigSchema, j as runSummarySchema, ot as evalChartsConfigSchema, t as executeRun, v as loadConfig, vt as columnDefSchema, x as createFsCacheStore } from "./runOrchestration-HaMahl6b.mjs";
2
+ import { createHash } from "node:crypto";
3
+ import { readFile } from "node:fs/promises";
4
+ import { z } from "zod/v4";
5
+ //#region ../runner/src/runChild.ts
6
/**
 * Field-by-field shape of one eval's metadata as carried in the run child
 * context file. `title`, `stats` and `charts` may be omitted;
 * `sourceFingerprint` and `caseCount` must be present but may be null.
 */
const evalMetaShape = {
  id: z.string(),
  title: z.string().optional(),
  filePath: z.string(),
  sourceFilePath: z.string(),
  sourceFingerprint: z.string().nullable(),
  columnDefs: z.array(columnDefSchema),
  caseCount: z.number().nullable(),
  stats: evalStatsConfigSchema.optional(),
  charts: evalChartsConfigSchema.optional(),
};

/** Validator for a single eval metadata entry. */
const evalMetaSchema = z.object(evalMetaShape);
17
/**
 * Shape of the JSON context file whose path is passed to this child process:
 * the run request, the workspace and run directories, the manifest and
 * summary, and the list of eval metadata entries.
 */
const runChildContextShape = {
  request: createRunRequestSchema,
  workspaceRoot: z.string(),
  runDir: z.string(),
  manifest: runManifestSchema,
  summary: runSummarySchema,
  evals: z.array(evalMetaSchema),
};

/** Validator applied to the parsed context file. */
const runChildContextSchema = z.object(runChildContextShape);
25
/**
 * Forwards a message through `process.send`.
 *
 * Does nothing when `process.send` is undefined, so the call is always safe.
 *
 * @param {unknown} message - Payload handed to `process.send` unchanged.
 * @returns {void}
 */
function sendMessage(message) {
  const canSend = process.send !== undefined;
  if (canSend) {
    process.send(message);
  }
}
29
/**
 * Computes the SHA-256 digest of the given source text.
 *
 * @param {string | Buffer} source - Content to hash.
 * @returns {string} The digest as a hexadecimal string.
 */
function getSourceFingerprint(source) {
  const hasher = createHash("sha256");
  hasher.update(source);
  return hasher.digest("hex");
}
32
/**
 * Normalizes a configured concurrency value to a whole number of at least 1.
 *
 * @param {unknown} configConcurrency - Raw value from the configuration.
 * @returns {number} 1 when the value is not a finite number; otherwise the
 *   value rounded down, never lower than 1.
 */
function getConfiguredConcurrency(configConcurrency) {
  const isFiniteNumber =
    typeof configConcurrency === "number" && Number.isFinite(configConcurrency);
  if (!isFiniteNumber) {
    return 1;
  }
  const wholeWorkers = Math.floor(configConcurrency);
  return wholeWorkers < 1 ? 1 : wholeWorkers;
}
36
/**
 * Picks the evals a run request targets.
 *
 * @param {{ evals: Map<string, { filePath: string }>, request: { target: { evalIds?: string[] } } }} params
 *   The registry of known evals keyed by id, and the run request.
 * @returns {Array<{ filePath: string }>} When the request lists at least one
 *   eval id: the matching entries in request order, with unknown ids dropped.
 *   Otherwise: every known eval ordered by `filePath`.
 */
function getTargetEvals(params) {
  const requestedIds = params.request.target.evalIds;
  if (requestedIds && requestedIds.length > 0) {
    const selected = [];
    for (const id of requestedIds) {
      const entry = params.evals.get(id);
      if (entry !== undefined) {
        selected.push(entry);
      }
    }
    return selected;
  }
  // Sort a fresh copy so the registry's own iteration order is left untouched.
  const everyEval = Array.from(params.evals.values());
  everyEval.sort((left, right) => left.filePath.localeCompare(right.filePath));
  return everyEval;
}
40
/**
 * Loads and validates the run child context file.
 *
 * @param {string | undefined} contextPath - Path of the JSON context file.
 * @returns {Promise<object>} The context after validation by
 *   `runChildContextSchema`.
 * @throws {Error} When `contextPath` is undefined. Read, JSON and schema
 *   failures propagate from the underlying calls.
 */
async function readContext(contextPath) {
  if (contextPath === undefined) {
    throw new Error("Missing run child context path");
  }
  const rawJson = await readFile(contextPath, "utf-8");
  const parsed = JSON.parse(rawJson);
  return runChildContextSchema.parse(parsed);
}
44
/**
 * Entry point of the run child process.
 *
 * Reads the context file named by the first CLI argument, switches to the
 * workspace root, loads the configuration, builds the cache store and the
 * eval registry, then hands everything to `executeRun`. Events and finished
 * cases are relayed through `sendMessage`, followed by a final "done"
 * message carrying the eval list.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Exit with a failure status as soon as the 'disconnect' event fires.
  process.on("disconnect", () => {
    process.exit(1);
  });

  const contextPath = process.argv[2];
  const context = await readContext(contextPath);
  const { workspaceRoot, runDir } = context;
  process.chdir(workspaceRoot);

  const config = await loadConfig();
  const cacheStore = createFsCacheStore({
    workspaceRoot,
    dir: config.cache?.dir,
    maxEntriesPerEval: config.cache?.maxEntriesPerEval,
  });

  // Registry of evals keyed by id, in the order given by the context file.
  const evals = new Map();
  for (const evalMeta of context.evals) {
    evals.set(evalMeta.id, evalMeta);
  }

  const byFilePath = (left, right) => left.filePath.localeCompare(right.filePath);

  const runState = {
    runDir,
    manifest: context.manifest,
    summary: context.summary,
    cases: [],
    caseDetails: new Map(),
    listeners: new Set(),
  };

  await executeRun({
    runState,
    request: context.request,
    runDir,
    config,
    evals,
    cacheStore,
    lastRunStatusMap: new Map(),
    latestRunInfoMap: new Map(),
    emitEvent(_runState, event) {
      // "case.finished" events are not forwarded here; onCaseFinished below
      // sends its own "case.finished" message with the detail and row.
      if (event.type !== "case.finished") {
        sendMessage({ type: "event", event });
      }
    },
    emitDiscoveryEvent() {},
    workspaceRoot,
    getSourceFingerprint,
    getConfiguredConcurrency: () => getConfiguredConcurrency(config.concurrency),
    getSortedEvalMetas: () => Array.from(evals.values()).sort(byFilePath),
    getTargetEvals: (request) => getTargetEvals({ evals, request }),
    onCaseFinished(caseDetail, caseRow) {
      sendMessage({ type: "case.finished", caseDetail, caseRow });
    },
  });

  sendMessage({ type: "done", evals: Array.from(evals.values()) });
}
104
// Run the child to completion; a rejection from main() propagates out of the
// module's top-level await.
await main();
// process.disconnect is undefined when the process has no IPC channel, the
// same situation in which process.send is undefined. Guard the call so that
// running this module without a channel does not end in a TypeError after
// the run has already finished.
// NOTE(review): main() registers a 'disconnect' listener that calls
// process.exit(1); confirm the exit status seen after this voluntary
// disconnect is the intended one.
if (typeof process.disconnect === "function") {
  process.disconnect();
}
106
+ //#endregion
107
+ export {};