@ls-stack/agent-eval 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Dg3qYVku.mjs → app-B7FUWsVm.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +117 -0
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +8 -3
- package/dist/{cli-lOZdhO2D.mjs → cli-B-sCTyz8.mjs} +17 -98
- package/dist/index.d.mts +26 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +41 -3
- package/dist/{runOrchestration-H0pSUl3I.mjs → runOrchestration-B3fYtpKo.mjs} +269 -124
- package/dist/{runner-BK1KX2SA.mjs → runner-Dt-Ynv6s.mjs} +1 -1
- package/dist/{runner-CmbmfBG2.mjs → runner-vunKoSBu.mjs} +2 -2
- package/dist/src-jaOlXwb5.mjs +3 -0
- package/package.json +3 -2
- package/skills/agent-eval/SKILL.md +400 -0
- package/dist/apps/web/dist/assets/index-Drw0IpOd.js +0 -117
- package/dist/src-Btb9RCYD.mjs +0 -3
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-Cx2CvM6a.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-BVnLr79e.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { t as runCli } from "./cli-
|
|
2
|
+
import { t as runCli } from "./cli-B-sCTyz8.mjs";
|
|
3
3
|
import { spawn } from "node:child_process";
|
|
4
4
|
//#region src/bin.ts
|
|
5
5
|
const moduleMocksFlag = "--experimental-test-module-mocks";
|
|
6
6
|
const inspectFlagPrefix = "--inspect";
|
|
7
7
|
const inspectBrkFlagPrefix = "--inspect-brk";
|
|
8
|
+
const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
|
|
8
9
|
function needsModuleMocksFlag() {
|
|
9
10
|
return !process.execArgv.includes(moduleMocksFlag);
|
|
10
11
|
}
|
|
@@ -32,10 +33,13 @@ function isInspectArg(arg) {
|
|
|
32
33
|
}
|
|
33
34
|
function buildExecArgv(inspectArg) {
|
|
34
35
|
const nextExecArgv = [moduleMocksFlag, ...process.execArgv.filter((arg) => arg !== moduleMocksFlag && !isInspectArg(arg))];
|
|
35
|
-
if (inspectArg
|
|
36
|
-
else nextExecArgv.push(...process.execArgv.filter(isInspectArg));
|
|
36
|
+
if (inspectArg === void 0) nextExecArgv.push(...process.execArgv.filter(isInspectArg));
|
|
37
37
|
return nextExecArgv;
|
|
38
38
|
}
|
|
39
|
+
function setRunChildInspectArg(inspectArg) {
|
|
40
|
+
if (inspectArg === void 0) return;
|
|
41
|
+
process.env[runChildInspectArgEnv] = inspectArg;
|
|
42
|
+
}
|
|
39
43
|
function execArgvMatches(nextExecArgv) {
|
|
40
44
|
return process.execArgv.length === nextExecArgv.length && process.execArgv.every((arg, index) => arg === nextExecArgv[index]);
|
|
41
45
|
}
|
|
@@ -67,6 +71,7 @@ async function reexecWithNodeArgs(argv, execArgv) {
|
|
|
67
71
|
});
|
|
68
72
|
}
|
|
69
73
|
const { argv, inspectArg } = parseDebugFlags(process.argv.slice(2));
|
|
74
|
+
setRunChildInspectArg(inspectArg);
|
|
70
75
|
const execArgv = buildExecArgv(inspectArg);
|
|
71
76
|
if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv);
|
|
72
77
|
else await runCli(argv);
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as
|
|
1
|
+
import { A as getEvalTitle, I as runSummarySchema, J as resolveApiCallsConfig, M as deriveScopedSummaryFromCases, S as createFsCacheStore, Y as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as buildDeclaredColumnDefs, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, hn as getEvalRegistry, i as getLatestRunInfos, j as getEvalDisplayStatus, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, x as normalizeScoreDef, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -82,98 +82,6 @@ function validateCharts(params) {
|
|
|
82
82
|
};
|
|
83
83
|
}
|
|
84
84
|
//#endregion
|
|
85
|
-
//#region ../runner/src/discovery.ts
|
|
86
|
-
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
87
|
-
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
88
|
-
function parseEvalMetas(filePath, content) {
|
|
89
|
-
const metas = [];
|
|
90
|
-
let searchIndex = 0;
|
|
91
|
-
while (searchIndex < content.length) {
|
|
92
|
-
const defineEvalIndex = content.indexOf("defineEval", searchIndex);
|
|
93
|
-
if (defineEvalIndex === -1) break;
|
|
94
|
-
const extracted = extractDefineEvalObject(content, defineEvalIndex);
|
|
95
|
-
if (!extracted) {
|
|
96
|
-
searchIndex = defineEvalIndex + 10;
|
|
97
|
-
continue;
|
|
98
|
-
}
|
|
99
|
-
const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
|
|
100
|
-
if (id !== void 0) {
|
|
101
|
-
const result = {
|
|
102
|
-
filePath,
|
|
103
|
-
id
|
|
104
|
-
};
|
|
105
|
-
const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
|
|
106
|
-
if (title !== void 0) result.title = title;
|
|
107
|
-
metas.push(result);
|
|
108
|
-
}
|
|
109
|
-
searchIndex = extracted.nextIndex;
|
|
110
|
-
}
|
|
111
|
-
return metas;
|
|
112
|
-
}
|
|
113
|
-
function extractDefineEvalObject(content, defineEvalIndex) {
|
|
114
|
-
const openParenIndex = content.indexOf("(", defineEvalIndex);
|
|
115
|
-
if (openParenIndex === -1) return void 0;
|
|
116
|
-
const objectStartIndex = content.indexOf("{", openParenIndex);
|
|
117
|
-
if (objectStartIndex === -1) return void 0;
|
|
118
|
-
let depth = 0;
|
|
119
|
-
let quote;
|
|
120
|
-
let inBlockComment = false;
|
|
121
|
-
let inLineComment = false;
|
|
122
|
-
let isEscaped = false;
|
|
123
|
-
for (let index = objectStartIndex; index < content.length; index++) {
|
|
124
|
-
const currentChar = content[index];
|
|
125
|
-
const nextChar = content[index + 1];
|
|
126
|
-
if (inLineComment) {
|
|
127
|
-
if (currentChar === "\n") inLineComment = false;
|
|
128
|
-
continue;
|
|
129
|
-
}
|
|
130
|
-
if (inBlockComment) {
|
|
131
|
-
if (currentChar === "*" && nextChar === "/") {
|
|
132
|
-
inBlockComment = false;
|
|
133
|
-
index++;
|
|
134
|
-
}
|
|
135
|
-
continue;
|
|
136
|
-
}
|
|
137
|
-
if (quote) {
|
|
138
|
-
if (isEscaped) {
|
|
139
|
-
isEscaped = false;
|
|
140
|
-
continue;
|
|
141
|
-
}
|
|
142
|
-
if (currentChar === "\\") {
|
|
143
|
-
isEscaped = true;
|
|
144
|
-
continue;
|
|
145
|
-
}
|
|
146
|
-
if (currentChar === quote) quote = void 0;
|
|
147
|
-
continue;
|
|
148
|
-
}
|
|
149
|
-
if (currentChar === "/" && nextChar === "/") {
|
|
150
|
-
inLineComment = true;
|
|
151
|
-
index++;
|
|
152
|
-
continue;
|
|
153
|
-
}
|
|
154
|
-
if (currentChar === "/" && nextChar === "*") {
|
|
155
|
-
inBlockComment = true;
|
|
156
|
-
index++;
|
|
157
|
-
continue;
|
|
158
|
-
}
|
|
159
|
-
if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
|
|
160
|
-
quote = currentChar;
|
|
161
|
-
continue;
|
|
162
|
-
}
|
|
163
|
-
if (currentChar === "{") {
|
|
164
|
-
depth++;
|
|
165
|
-
continue;
|
|
166
|
-
}
|
|
167
|
-
if (currentChar === "}") {
|
|
168
|
-
depth--;
|
|
169
|
-
if (depth === 0) return {
|
|
170
|
-
nextIndex: index + 1,
|
|
171
|
-
objectText: content.slice(objectStartIndex, index + 1)
|
|
172
|
-
};
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
//#endregion
|
|
177
85
|
//#region ../runner/src/gitState.ts
|
|
178
86
|
function runGitCommand(workspaceRoot, args) {
|
|
179
87
|
const result = spawnSync("git", args, {
|
|
@@ -208,6 +116,9 @@ function isRunChildMessage(value) {
|
|
|
208
116
|
}
|
|
209
117
|
//#endregion
|
|
210
118
|
//#region ../runner/src/runChildManager.ts
|
|
119
|
+
const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
|
|
120
|
+
const inspectFlagPrefix = "--inspect";
|
|
121
|
+
const inspectBrkFlagPrefix = "--inspect-brk";
|
|
211
122
|
function startRunChild(params) {
|
|
212
123
|
const child = spawn(process.execPath, [
|
|
213
124
|
...getRunChildExecArgv(),
|
|
@@ -256,10 +167,16 @@ function getRunChildExecArgv() {
|
|
|
256
167
|
if (arg === "--input-type") skipNext = true;
|
|
257
168
|
continue;
|
|
258
169
|
}
|
|
170
|
+
if (isInspectArg(arg)) continue;
|
|
259
171
|
execArgv.push(arg);
|
|
260
172
|
}
|
|
173
|
+
const inspectArg = process.env[runChildInspectArgEnv];
|
|
174
|
+
if (inspectArg !== void 0 && isInspectArg(inspectArg)) execArgv.push(inspectArg);
|
|
261
175
|
return execArgv;
|
|
262
176
|
}
|
|
177
|
+
function isInspectArg(arg) {
|
|
178
|
+
return arg === inspectFlagPrefix || arg.startsWith(`${inspectFlagPrefix}=`) || arg === inspectBrkFlagPrefix || arg.startsWith(`${inspectBrkFlagPrefix}=`);
|
|
179
|
+
}
|
|
263
180
|
function killRunChild(runState) {
|
|
264
181
|
const child = runState.childProcess;
|
|
265
182
|
runState.childProcess = void 0;
|
|
@@ -307,7 +224,10 @@ function upsertFinishedCase(runState, caseDetail, caseRow) {
|
|
|
307
224
|
function applyChildEvalMetas(evals, childMetas) {
|
|
308
225
|
for (const childMeta of childMetas) {
|
|
309
226
|
const evalMeta = evals.get(childMeta.id);
|
|
310
|
-
if (evalMeta === void 0)
|
|
227
|
+
if (evalMeta === void 0) {
|
|
228
|
+
evals.set(childMeta.id, childMeta);
|
|
229
|
+
continue;
|
|
230
|
+
}
|
|
311
231
|
evalMeta.columnDefs = childMeta.columnDefs;
|
|
312
232
|
evalMeta.caseCount = childMeta.caseCount;
|
|
313
233
|
evalMeta.stats = childMeta.stats;
|
|
@@ -719,8 +639,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
719
639
|
workspaceRoot,
|
|
720
640
|
runDir,
|
|
721
641
|
manifest,
|
|
722
|
-
summary
|
|
723
|
-
evals: getSortedEvalMetas()
|
|
642
|
+
summary
|
|
724
643
|
};
|
|
725
644
|
await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));
|
|
726
645
|
startRunChild({
|
|
@@ -1050,8 +969,8 @@ async function commandApp(args) {
|
|
|
1050
969
|
const { serve } = await import("@hono/node-server");
|
|
1051
970
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1052
971
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1053
|
-
const appModule = await import("./app-
|
|
1054
|
-
const runnerModule = await import("./runner-
|
|
972
|
+
const appModule = await import("./app-B7FUWsVm.mjs");
|
|
973
|
+
const runnerModule = await import("./runner-Dt-Ynv6s.mjs");
|
|
1055
974
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1056
975
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1057
976
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2720,6 +2720,14 @@ type EvalCaseScope = {
|
|
|
2720
2720
|
caseCacheRefs: TraceCacheRef[]; /** Background promises that should settle before the case scope finalizes. */
|
|
2721
2721
|
pendingBackgroundJobs: Set<Promise<unknown>>;
|
|
2722
2722
|
};
|
|
2723
|
+
/**
|
|
2724
|
+
* Runtime phase currently owned by the eval runner.
|
|
2725
|
+
*
|
|
2726
|
+
* `null` means the current async execution is outside an eval run. `env`
|
|
2727
|
+
* covers run-time module/environment loading, including top-level code in
|
|
2728
|
+
* modules imported while a run is being prepared.
|
|
2729
|
+
*/
|
|
2730
|
+
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
2723
2731
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
2724
2732
|
declare class EvalAssertionError extends Error {
|
|
2725
2733
|
constructor(message: string);
|
|
@@ -2727,12 +2735,14 @@ declare class EvalAssertionError extends Error {
|
|
|
2727
2735
|
/** Return the current eval scope for the active async context, if any. */
|
|
2728
2736
|
declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
2729
2737
|
/**
|
|
2730
|
-
* Return
|
|
2738
|
+
* Return the current eval runner phase for this async execution.
|
|
2731
2739
|
*
|
|
2732
|
-
*
|
|
2733
|
-
*
|
|
2740
|
+
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
2741
|
+
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
2742
|
+
* case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
|
|
2743
|
+
* while validating outputs, and `scorer` while computing scores.
|
|
2734
2744
|
*/
|
|
2735
|
-
declare function isInEvalScope():
|
|
2745
|
+
declare function isInEvalScope(): EvalRuntimeScope | null;
|
|
2736
2746
|
/**
|
|
2737
2747
|
* Register background work that should settle before eval finalization.
|
|
2738
2748
|
*
|
|
@@ -2762,8 +2772,18 @@ type RunInEvalScopeOptions = {
|
|
|
2762
2772
|
/** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
|
|
2763
2773
|
idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
|
|
2764
2774
|
cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
|
|
2765
|
-
waitForBackgroundJobs?: boolean;
|
|
2775
|
+
waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
|
|
2776
|
+
runtimeScope?: EvalRuntimeScope;
|
|
2766
2777
|
};
|
|
2778
|
+
/** Execute a callback while `isInEvalScope()` reports a runner phase. */
|
|
2779
|
+
declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
2780
|
+
/**
|
|
2781
|
+
* Execute a callback with an existing case scope and a specific runner phase.
|
|
2782
|
+
*
|
|
2783
|
+
* Runner-internal helper for post-execute phases that still need access to the
|
|
2784
|
+
* completed case scope through output, trace, assertion, and input helpers.
|
|
2785
|
+
*/
|
|
2786
|
+
declare function runInExistingEvalScope<T>(scope: EvalCaseScope, runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
2767
2787
|
/**
|
|
2768
2788
|
* Execute a callback inside a fresh eval case scope and capture its outputs,
|
|
2769
2789
|
* trace data, and terminal error state.
|
|
@@ -3135,4 +3155,4 @@ declare function createRunner({
|
|
|
3135
3155
|
*/
|
|
3136
3156
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3137
3157
|
//#endregion
|
|
3138
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3158
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-B-sCTyz8.mjs";
|
|
3
|
+
import "./src-jaOlXwb5.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
import { F as
|
|
1
|
+
import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { relative } from "node:path";
|
|
4
5
|
import { z } from "zod/v4";
|
|
6
|
+
import { glob } from "glob";
|
|
5
7
|
//#region ../runner/src/runChild.ts
|
|
6
8
|
const evalMetaSchema = z.object({
|
|
7
9
|
id: z.string(),
|
|
@@ -20,7 +22,7 @@ const runChildContextSchema = z.object({
|
|
|
20
22
|
runDir: z.string(),
|
|
21
23
|
manifest: runManifestSchema,
|
|
22
24
|
summary: runSummarySchema,
|
|
23
|
-
evals: z.array(evalMetaSchema)
|
|
25
|
+
evals: z.array(evalMetaSchema).optional()
|
|
24
26
|
});
|
|
25
27
|
function sendMessage(message) {
|
|
26
28
|
if (process.send === void 0) return;
|
|
@@ -37,6 +39,38 @@ function getTargetEvals(params) {
|
|
|
37
39
|
if (params.request.target.evalIds && params.request.target.evalIds.length > 0) return params.request.target.evalIds.map((id) => params.evals.get(id)).filter((entry) => entry !== void 0);
|
|
38
40
|
return [...params.evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
39
41
|
}
|
|
42
|
+
function toWorkspaceRelativePath(params) {
|
|
43
|
+
return relative(params.workspaceRoot, params.filePath).replaceAll("\\", "/");
|
|
44
|
+
}
|
|
45
|
+
async function discoverRunEvals(params) {
|
|
46
|
+
const discovered = [];
|
|
47
|
+
for (const pattern of params.config.include) {
|
|
48
|
+
const files = await glob(pattern, {
|
|
49
|
+
cwd: params.workspaceRoot,
|
|
50
|
+
absolute: true
|
|
51
|
+
});
|
|
52
|
+
discovered.push(...files);
|
|
53
|
+
}
|
|
54
|
+
const evals = /* @__PURE__ */ new Map();
|
|
55
|
+
for (const filePath of discovered) {
|
|
56
|
+
const source = await readFile(filePath, "utf-8");
|
|
57
|
+
const sourceFingerprint = getSourceFingerprint(source);
|
|
58
|
+
const metas = parseEvalMetas(filePath, source);
|
|
59
|
+
for (const meta of metas) evals.set(meta.id, {
|
|
60
|
+
id: meta.id,
|
|
61
|
+
title: meta.title,
|
|
62
|
+
filePath: toWorkspaceRelativePath({
|
|
63
|
+
filePath: meta.filePath,
|
|
64
|
+
workspaceRoot: params.workspaceRoot
|
|
65
|
+
}),
|
|
66
|
+
sourceFilePath: meta.filePath,
|
|
67
|
+
sourceFingerprint,
|
|
68
|
+
columnDefs: [],
|
|
69
|
+
caseCount: null
|
|
70
|
+
});
|
|
71
|
+
}
|
|
72
|
+
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
73
|
+
}
|
|
40
74
|
async function readContext(contextPath) {
|
|
41
75
|
if (contextPath === void 0) throw new Error("Missing run child context path");
|
|
42
76
|
return runChildContextSchema.parse(JSON.parse(await readFile(contextPath, "utf-8")));
|
|
@@ -54,7 +88,11 @@ async function main() {
|
|
|
54
88
|
maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
|
|
55
89
|
maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
|
|
56
90
|
});
|
|
57
|
-
const
|
|
91
|
+
const evalMetas = await discoverRunEvals({
|
|
92
|
+
config,
|
|
93
|
+
workspaceRoot: context.workspaceRoot
|
|
94
|
+
});
|
|
95
|
+
const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.id, evalMeta]));
|
|
58
96
|
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
59
97
|
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
60
98
|
await executeRun({
|