@ls-stack/agent-eval 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-Drw0IpOd.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-Cx2CvM6a.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-BVnLr79e.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,10 +1,11 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-lOZdhO2D.mjs";
2
+ import { t as runCli } from "./cli-B-sCTyz8.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
6
6
  const inspectFlagPrefix = "--inspect";
7
7
  const inspectBrkFlagPrefix = "--inspect-brk";
8
+ const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
8
9
  function needsModuleMocksFlag() {
9
10
  return !process.execArgv.includes(moduleMocksFlag);
10
11
  }
@@ -32,10 +33,13 @@ function isInspectArg(arg) {
32
33
  }
33
34
  function buildExecArgv(inspectArg) {
34
35
  const nextExecArgv = [moduleMocksFlag, ...process.execArgv.filter((arg) => arg !== moduleMocksFlag && !isInspectArg(arg))];
35
- if (inspectArg !== void 0) nextExecArgv.push(inspectArg);
36
- else nextExecArgv.push(...process.execArgv.filter(isInspectArg));
36
+ if (inspectArg === void 0) nextExecArgv.push(...process.execArgv.filter(isInspectArg));
37
37
  return nextExecArgv;
38
38
  }
39
+ function setRunChildInspectArg(inspectArg) {
40
+ if (inspectArg === void 0) return;
41
+ process.env[runChildInspectArgEnv] = inspectArg;
42
+ }
39
43
  function execArgvMatches(nextExecArgv) {
40
44
  return process.execArgv.length === nextExecArgv.length && process.execArgv.every((arg, index) => arg === nextExecArgv[index]);
41
45
  }
@@ -67,6 +71,7 @@ async function reexecWithNodeArgs(argv, execArgv) {
67
71
  });
68
72
  }
69
73
  const { argv, inspectArg } = parseDebugFlags(process.argv.slice(2));
74
+ setRunChildInspectArg(inspectArg);
70
75
  const execArgv = buildExecArgv(inspectArg);
71
76
  if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv);
72
77
  else await runCli(argv);
@@ -1,4 +1,4 @@
1
- import { A as getEvalDisplayStatus, F as runSummarySchema, J as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, fn as getEvalRegistry, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as deriveScopedSummaryFromCases, k as getEvalTitle, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, q as resolveApiCallsConfig, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-H0pSUl3I.mjs";
1
+ import { A as getEvalTitle, I as runSummarySchema, J as resolveApiCallsConfig, M as deriveScopedSummaryFromCases, S as createFsCacheStore, Y as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as buildDeclaredColumnDefs, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, hn as getEvalRegistry, i as getLatestRunInfos, j as getEvalDisplayStatus, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, x as normalizeScoreDef, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -82,98 +82,6 @@ function validateCharts(params) {
82
82
  };
83
83
  }
84
84
  //#endregion
85
- //#region ../runner/src/discovery.ts
86
- const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
87
- const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
88
- function parseEvalMetas(filePath, content) {
89
- const metas = [];
90
- let searchIndex = 0;
91
- while (searchIndex < content.length) {
92
- const defineEvalIndex = content.indexOf("defineEval", searchIndex);
93
- if (defineEvalIndex === -1) break;
94
- const extracted = extractDefineEvalObject(content, defineEvalIndex);
95
- if (!extracted) {
96
- searchIndex = defineEvalIndex + 10;
97
- continue;
98
- }
99
- const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
100
- if (id !== void 0) {
101
- const result = {
102
- filePath,
103
- id
104
- };
105
- const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
106
- if (title !== void 0) result.title = title;
107
- metas.push(result);
108
- }
109
- searchIndex = extracted.nextIndex;
110
- }
111
- return metas;
112
- }
113
- function extractDefineEvalObject(content, defineEvalIndex) {
114
- const openParenIndex = content.indexOf("(", defineEvalIndex);
115
- if (openParenIndex === -1) return void 0;
116
- const objectStartIndex = content.indexOf("{", openParenIndex);
117
- if (objectStartIndex === -1) return void 0;
118
- let depth = 0;
119
- let quote;
120
- let inBlockComment = false;
121
- let inLineComment = false;
122
- let isEscaped = false;
123
- for (let index = objectStartIndex; index < content.length; index++) {
124
- const currentChar = content[index];
125
- const nextChar = content[index + 1];
126
- if (inLineComment) {
127
- if (currentChar === "\n") inLineComment = false;
128
- continue;
129
- }
130
- if (inBlockComment) {
131
- if (currentChar === "*" && nextChar === "/") {
132
- inBlockComment = false;
133
- index++;
134
- }
135
- continue;
136
- }
137
- if (quote) {
138
- if (isEscaped) {
139
- isEscaped = false;
140
- continue;
141
- }
142
- if (currentChar === "\\") {
143
- isEscaped = true;
144
- continue;
145
- }
146
- if (currentChar === quote) quote = void 0;
147
- continue;
148
- }
149
- if (currentChar === "/" && nextChar === "/") {
150
- inLineComment = true;
151
- index++;
152
- continue;
153
- }
154
- if (currentChar === "/" && nextChar === "*") {
155
- inBlockComment = true;
156
- index++;
157
- continue;
158
- }
159
- if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
160
- quote = currentChar;
161
- continue;
162
- }
163
- if (currentChar === "{") {
164
- depth++;
165
- continue;
166
- }
167
- if (currentChar === "}") {
168
- depth--;
169
- if (depth === 0) return {
170
- nextIndex: index + 1,
171
- objectText: content.slice(objectStartIndex, index + 1)
172
- };
173
- }
174
- }
175
- }
176
- //#endregion
177
85
  //#region ../runner/src/gitState.ts
178
86
  function runGitCommand(workspaceRoot, args) {
179
87
  const result = spawnSync("git", args, {
@@ -208,6 +116,9 @@ function isRunChildMessage(value) {
208
116
  }
209
117
  //#endregion
210
118
  //#region ../runner/src/runChildManager.ts
119
+ const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
120
+ const inspectFlagPrefix = "--inspect";
121
+ const inspectBrkFlagPrefix = "--inspect-brk";
211
122
  function startRunChild(params) {
212
123
  const child = spawn(process.execPath, [
213
124
  ...getRunChildExecArgv(),
@@ -256,10 +167,16 @@ function getRunChildExecArgv() {
256
167
  if (arg === "--input-type") skipNext = true;
257
168
  continue;
258
169
  }
170
+ if (isInspectArg(arg)) continue;
259
171
  execArgv.push(arg);
260
172
  }
173
+ const inspectArg = process.env[runChildInspectArgEnv];
174
+ if (inspectArg !== void 0 && isInspectArg(inspectArg)) execArgv.push(inspectArg);
261
175
  return execArgv;
262
176
  }
177
+ function isInspectArg(arg) {
178
+ return arg === inspectFlagPrefix || arg.startsWith(`${inspectFlagPrefix}=`) || arg === inspectBrkFlagPrefix || arg.startsWith(`${inspectBrkFlagPrefix}=`);
179
+ }
263
180
  function killRunChild(runState) {
264
181
  const child = runState.childProcess;
265
182
  runState.childProcess = void 0;
@@ -307,7 +224,10 @@ function upsertFinishedCase(runState, caseDetail, caseRow) {
307
224
  function applyChildEvalMetas(evals, childMetas) {
308
225
  for (const childMeta of childMetas) {
309
226
  const evalMeta = evals.get(childMeta.id);
310
- if (evalMeta === void 0) continue;
227
+ if (evalMeta === void 0) {
228
+ evals.set(childMeta.id, childMeta);
229
+ continue;
230
+ }
311
231
  evalMeta.columnDefs = childMeta.columnDefs;
312
232
  evalMeta.caseCount = childMeta.caseCount;
313
233
  evalMeta.stats = childMeta.stats;
@@ -719,8 +639,7 @@ function createRunner({ watchForChanges = true } = {}) {
719
639
  workspaceRoot,
720
640
  runDir,
721
641
  manifest,
722
- summary,
723
- evals: getSortedEvalMetas()
642
+ summary
724
643
  };
725
644
  await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));
726
645
  startRunChild({
@@ -1050,8 +969,8 @@ async function commandApp(args) {
1050
969
  const { serve } = await import("@hono/node-server");
1051
970
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1052
971
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1053
- const appModule = await import("./app-Dg3qYVku.mjs");
1054
- const runnerModule = await import("./runner-BK1KX2SA.mjs");
972
+ const appModule = await import("./app-B7FUWsVm.mjs");
973
+ const runnerModule = await import("./runner-Dt-Ynv6s.mjs");
1055
974
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1056
975
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1057
976
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -2720,6 +2720,14 @@ type EvalCaseScope = {
2720
2720
  caseCacheRefs: TraceCacheRef[]; /** Background promises that should settle before the case scope finalizes. */
2721
2721
  pendingBackgroundJobs: Set<Promise<unknown>>;
2722
2722
  };
2723
+ /**
2724
+ * Runtime phase currently owned by the eval runner.
2725
+ *
2726
+ * `null` means the current async execution is outside an eval run. `env`
2727
+ * covers run-time module/environment loading, including top-level code in
2728
+ * modules imported while a run is being prepared.
2729
+ */
2730
+ type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
2723
2731
  /** Error thrown when an eval assertion fails during case execution. */
2724
2732
  declare class EvalAssertionError extends Error {
2725
2733
  constructor(message: string);
@@ -2727,12 +2735,14 @@ declare class EvalAssertionError extends Error {
2727
2735
  /** Return the current eval scope for the active async context, if any. */
2728
2736
  declare function getCurrentScope(): EvalCaseScope | undefined;
2729
2737
  /**
2730
- * Return whether the current async execution is inside an active eval case.
2738
+ * Return the current eval runner phase for this async execution.
2731
2739
  *
2732
- * This is useful for shared workflow code that wants to branch on eval-only
2733
- * behavior without importing or inspecting the full eval scope.
2740
+ * Returns `null` outside eval-owned work, `env` while the runner is loading
2741
+ * eval modules for a run, `cases` while generating cases, `eval` while running
2742
+ * case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
2743
+ * while validating outputs, and `scorer` while computing scores.
2734
2744
  */
2735
- declare function isInEvalScope(): boolean;
2745
+ declare function isInEvalScope(): EvalRuntimeScope | null;
2736
2746
  /**
2737
2747
  * Register background work that should settle before eval finalization.
2738
2748
  *
@@ -2762,8 +2772,18 @@ type RunInEvalScopeOptions = {
2762
2772
  /** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
2763
2773
  idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
2764
2774
  cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
2765
- waitForBackgroundJobs?: boolean;
2775
+ waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
2776
+ runtimeScope?: EvalRuntimeScope;
2766
2777
  };
2778
+ /** Execute a callback while `isInEvalScope()` reports a runner phase. */
2779
+ declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
2780
+ /**
2781
+ * Execute a callback with an existing case scope and a specific runner phase.
2782
+ *
2783
+ * Runner-internal helper for post-execute phases that still need access to the
2784
+ * completed case scope through output, trace, assertion, and input helpers.
2785
+ */
2786
+ declare function runInExistingEvalScope<T>(scope: EvalCaseScope, runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
2767
2787
  /**
2768
2788
  * Execute a callback inside a fresh eval case scope and capture its outputs,
2769
2789
  * trace data, and terminal error state.
@@ -3135,4 +3155,4 @@ declare function createRunner({
3135
3155
  */
3136
3156
  declare function runCli(argv: string[]): Promise<void>;
3137
3157
  //#endregion
3138
- export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, 
type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, 
runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
3158
+ export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type 
ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, 
runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as evalFreshnessStatusSchema, $t as evalAssert, A as getEvalDisplayStatus, At as traceDisplayInputConfigSchema, B as apiCallMetricPlacementSchema, Bt as jsonCellSchema, C as updateManualScoreRequestSchema, Ct as spanCacheOptionsSchema, D as extractLlmCalls, Dt as traceAttributeDisplayPlacementSchema, E as extractApiCalls, Et as traceAttributeDisplayInputSchema, F as runSummarySchema, Ft as cellValueSchema, G as llmCallMetricSchema, Gt as buildTraceTree, H as apiCallsConfigSchema, Ht as repoFileRefSchema, I as DEFAULT_API_CALLS_CONFIG, It as columnDefSchema, J as resolveLlmCallsConfig, Jt as evalTracer, K as llmCallsConfigSchema, Kt as captureEvalSpanError, L as DEFAULT_LLM_CALLS_CONFIG, Lt as columnFormatSchema, M as deriveStatusFromCaseRows, Mt as traceSpanKindSchema, N as deriveStatusFromChildStatuses, Nt as traceSpanSchema, O as getNestedAttribute, Ot as traceAttributeDisplaySchema, P as runManifestSchema, Pt as traceSpanWarningSchema, Q as caseRowSchema, Qt as appendToEvalOutput, R as agentEvalsConfigSchema, Rt as columnKindSchema, S as createRunRequestSchema, St as serializedCacheSpanSchema, T as extractCacheHits, Tt as traceAttributeDisplayFormatSchema, U as llmCallMetricFormatSchema, Ut as runArtifactRefSchema, V as apiCallMetricSchema, Vt as numberDisplayOptionsSchema, W as llmCallMetricPlacementSchema, Wt as z, X as assertionFailureSchema, Xt as hashCacheKeySync, Y as trialSelectionModeSchema, Yt as hashCacheKey, Z as caseDetailSchema, Zt as EvalAssertionError, _t as cacheModeSchema, an as nextEvalId, at as evalChartAggregateSchema, bt as cacheRecordingSchema, cn as setScopeCacheContext, ct as evalChartColorSchema, dn as defineEval, dt as evalChartTooltipExtraSchema, en as getCurrentScope, et as evalStatAggregateSchema, fn as getEvalRegistry, ft as evalChartTypeSchema, gt as cacheListItemSchema, ht as cacheFileSchema, in as mergeEvalOutput, it as scoreTraceSchema, j as deriveScopedSummaryFromCases, jt as traceSpanErrorSchema, k as 
getEvalTitle, kt as traceDisplayConfigSchema, ln as startEvalBackgroundJob, lt as evalChartConfigSchema, mt as cacheEntrySchema, nn as incrementEvalOutput, nt as evalStatsConfigSchema, on as runInEvalScope, ot as evalChartAxisSchema, pt as evalChartsConfigSchema, q as resolveApiCallsConfig, qt as evalSpan, rn as isInEvalScope, rt as evalSummarySchema, sn as setEvalOutput, st as evalChartBuiltinMetricSchema, tn as getEvalCaseInput, tt as evalStatItemSchema, un as repoFile, ut as evalChartMetricSchema, vt as cacheOperationTypeSchema, w as sseEnvelopeSchema, wt as traceCacheRefSchema, xt as cacheStatusSchema, yt as cacheRecordingOpSchema, z as apiCallMetricFormatSchema, zt as fileRefSchema } from "./runOrchestration-H0pSUl3I.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-lOZdhO2D.mjs";
3
- import "./src-Btb9RCYD.mjs";
4
- export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, 
traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
1
+ import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-B3fYtpKo.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-B-sCTyz8.mjs";
3
+ import "./src-jaOlXwb5.mjs";
4
+ export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, 
traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,7 +1,9 @@
1
- import { F as runSummarySchema, It as columnDefSchema, P as runManifestSchema, S as createRunRequestSchema, nt as evalStatsConfigSchema, pt as evalChartsConfigSchema, t as executeRun, v as loadConfig, x as createFsCacheStore } from "./runOrchestration-H0pSUl3I.mjs";
1
+ import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
+ import { relative } from "node:path";
4
5
  import { z } from "zod/v4";
6
+ import { glob } from "glob";
5
7
  //#region ../runner/src/runChild.ts
6
8
  const evalMetaSchema = z.object({
7
9
  id: z.string(),
@@ -20,7 +22,7 @@ const runChildContextSchema = z.object({
20
22
  runDir: z.string(),
21
23
  manifest: runManifestSchema,
22
24
  summary: runSummarySchema,
23
- evals: z.array(evalMetaSchema)
25
+ evals: z.array(evalMetaSchema).optional()
24
26
  });
25
27
  function sendMessage(message) {
26
28
  if (process.send === void 0) return;
@@ -37,6 +39,38 @@ function getTargetEvals(params) {
37
39
  if (params.request.target.evalIds && params.request.target.evalIds.length > 0) return params.request.target.evalIds.map((id) => params.evals.get(id)).filter((entry) => entry !== void 0);
38
40
  return [...params.evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
39
41
  }
42
/**
 * Expresses a file path relative to the workspace root, always using "/"
 * as the separator in the result.
 *
 * @param {{ workspaceRoot: string, filePath: string }} params
 *   `workspaceRoot` is the base directory; `filePath` is the path to relativize.
 * @returns {string} `filePath` relative to `workspaceRoot`, with every
 *   backslash replaced by a forward slash.
 */
function toWorkspaceRelativePath(params) {
  const { workspaceRoot, filePath } = params;
  const relativePath = relative(workspaceRoot, filePath);
  // Swap every backslash for "/" so the output has the same shape on every platform.
  return relativePath.split("\\").join("/");
}
45
/**
 * Discovers the evals declared in the files matched by the configured
 * include globs.
 *
 * Each matched file is read once, fingerprinted with `getSourceFingerprint`,
 * and scanned with `parseEvalMetas`. Every meta found becomes one entry keyed
 * by its id; if the same id is produced more than once, the entry from the
 * file processed last replaces the earlier one.
 *
 * @param {{ config: { include: Iterable<string> }, workspaceRoot: string }} params
 *   `config.include` holds the glob patterns; `workspaceRoot` is used as the
 *   glob `cwd` and as the base for each entry's workspace-relative `filePath`.
 * @returns {Promise<Array<object>>} Eval entries sorted by workspace-relative
 *   `filePath`. `columnDefs` is always `[]` and `caseCount` is always `null`.
 */
async function discoverRunEvals(params) {
  const { config, workspaceRoot } = params;
  // The patterns are independent of each other, so glob them concurrently.
  // Promise.all keeps the results in pattern order.
  const matchesPerPattern = await Promise.all(
    Array.from(config.include, (pattern) =>
      glob(pattern, {
        cwd: workspaceRoot,
        absolute: true
      })
    )
  );
  // A Set drops files matched by more than one pattern so each file is read
  // and parsed only once; insertion order (first match) is preserved.
  // Flattening also avoids spreading a potentially huge array into push().
  const discovered = new Set(matchesPerPattern.flat());
  const evals = /* @__PURE__ */ new Map();
  // Files are read one at a time to keep the number of open handles bounded.
  for (const filePath of discovered) {
    const source = await readFile(filePath, "utf-8");
    const sourceFingerprint = getSourceFingerprint(source);
    const metas = parseEvalMetas(filePath, source);
    for (const meta of metas) {
      evals.set(meta.id, {
        id: meta.id,
        title: meta.title,
        filePath: toWorkspaceRelativePath({
          filePath: meta.filePath,
          workspaceRoot
        }),
        sourceFilePath: meta.filePath,
        sourceFingerprint,
        columnDefs: [],
        caseCount: null
      });
    }
  }
  return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
}
40
74
  async function readContext(contextPath) {
41
75
  if (contextPath === void 0) throw new Error("Missing run child context path");
42
76
  return runChildContextSchema.parse(JSON.parse(await readFile(contextPath, "utf-8")));
@@ -54,7 +88,11 @@ async function main() {
54
88
  maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
55
89
  maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
56
90
  });
57
- const evals = new Map(context.evals.map((evalMeta) => [evalMeta.id, evalMeta]));
91
+ const evalMetas = await discoverRunEvals({
92
+ config,
93
+ workspaceRoot: context.workspaceRoot
94
+ });
95
+ const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.id, evalMeta]));
58
96
  const lastRunStatusMap = /* @__PURE__ */ new Map();
59
97
  const latestRunInfoMap = /* @__PURE__ */ new Map();
60
98
  await executeRun({