@ls-stack/agent-eval 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DKWm1oxc.mjs → app-B7FUWsVm.mjs} +3 -3
- package/dist/bin.mjs +1 -1
- package/dist/{cli-CMiCEQ-3.mjs → cli-B-sCTyz8.mjs} +8 -98
- package/dist/index.d.mts +26 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +41 -3
- package/dist/{runOrchestration-H0pSUl3I.mjs → runOrchestration-B3fYtpKo.mjs} +269 -124
- package/dist/{runner-Dx1sMCbh.mjs → runner-Dt-Ynv6s.mjs} +1 -1
- package/dist/{runner-DLnj18MO.mjs → runner-vunKoSBu.mjs} +2 -2
- package/dist/src-jaOlXwb5.mjs +3 -0
- package/package.json +3 -2
- package/skills/agent-eval/SKILL.md +400 -0
- package/dist/src-BgGL7DDp.mjs +0 -3
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { C as
|
|
2
|
-
import "./src-BgGL7DDp.mjs";
|
|
3
|
-
import { t as getRunnerInstance } from "./runner-DLnj18MO.mjs";
|
|
1
|
+
import { C as createRunRequestSchema, w as updateManualScoreRequestSchema } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
|
+
import "./src-jaOlXwb5.mjs";
|
|
3
|
+
import { t as getRunnerInstance } from "./runner-vunKoSBu.mjs";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { dirname, join, relative, resolve, sep } from "node:path";
|
|
6
6
|
import { z } from "zod/v4";
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as
|
|
1
|
+
import { A as getEvalTitle, I as runSummarySchema, J as resolveApiCallsConfig, M as deriveScopedSummaryFromCases, S as createFsCacheStore, Y as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as buildDeclaredColumnDefs, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, hn as getEvalRegistry, i as getLatestRunInfos, j as getEvalDisplayStatus, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, x as normalizeScoreDef, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -82,98 +82,6 @@ function validateCharts(params) {
|
|
|
82
82
|
};
|
|
83
83
|
}
|
|
84
84
|
//#endregion
|
|
85
|
-
//#region ../runner/src/discovery.ts
|
|
86
|
-
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
87
|
-
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
88
|
-
function parseEvalMetas(filePath, content) {
|
|
89
|
-
const metas = [];
|
|
90
|
-
let searchIndex = 0;
|
|
91
|
-
while (searchIndex < content.length) {
|
|
92
|
-
const defineEvalIndex = content.indexOf("defineEval", searchIndex);
|
|
93
|
-
if (defineEvalIndex === -1) break;
|
|
94
|
-
const extracted = extractDefineEvalObject(content, defineEvalIndex);
|
|
95
|
-
if (!extracted) {
|
|
96
|
-
searchIndex = defineEvalIndex + 10;
|
|
97
|
-
continue;
|
|
98
|
-
}
|
|
99
|
-
const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
|
|
100
|
-
if (id !== void 0) {
|
|
101
|
-
const result = {
|
|
102
|
-
filePath,
|
|
103
|
-
id
|
|
104
|
-
};
|
|
105
|
-
const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
|
|
106
|
-
if (title !== void 0) result.title = title;
|
|
107
|
-
metas.push(result);
|
|
108
|
-
}
|
|
109
|
-
searchIndex = extracted.nextIndex;
|
|
110
|
-
}
|
|
111
|
-
return metas;
|
|
112
|
-
}
|
|
113
|
-
function extractDefineEvalObject(content, defineEvalIndex) {
|
|
114
|
-
const openParenIndex = content.indexOf("(", defineEvalIndex);
|
|
115
|
-
if (openParenIndex === -1) return void 0;
|
|
116
|
-
const objectStartIndex = content.indexOf("{", openParenIndex);
|
|
117
|
-
if (objectStartIndex === -1) return void 0;
|
|
118
|
-
let depth = 0;
|
|
119
|
-
let quote;
|
|
120
|
-
let inBlockComment = false;
|
|
121
|
-
let inLineComment = false;
|
|
122
|
-
let isEscaped = false;
|
|
123
|
-
for (let index = objectStartIndex; index < content.length; index++) {
|
|
124
|
-
const currentChar = content[index];
|
|
125
|
-
const nextChar = content[index + 1];
|
|
126
|
-
if (inLineComment) {
|
|
127
|
-
if (currentChar === "\n") inLineComment = false;
|
|
128
|
-
continue;
|
|
129
|
-
}
|
|
130
|
-
if (inBlockComment) {
|
|
131
|
-
if (currentChar === "*" && nextChar === "/") {
|
|
132
|
-
inBlockComment = false;
|
|
133
|
-
index++;
|
|
134
|
-
}
|
|
135
|
-
continue;
|
|
136
|
-
}
|
|
137
|
-
if (quote) {
|
|
138
|
-
if (isEscaped) {
|
|
139
|
-
isEscaped = false;
|
|
140
|
-
continue;
|
|
141
|
-
}
|
|
142
|
-
if (currentChar === "\\") {
|
|
143
|
-
isEscaped = true;
|
|
144
|
-
continue;
|
|
145
|
-
}
|
|
146
|
-
if (currentChar === quote) quote = void 0;
|
|
147
|
-
continue;
|
|
148
|
-
}
|
|
149
|
-
if (currentChar === "/" && nextChar === "/") {
|
|
150
|
-
inLineComment = true;
|
|
151
|
-
index++;
|
|
152
|
-
continue;
|
|
153
|
-
}
|
|
154
|
-
if (currentChar === "/" && nextChar === "*") {
|
|
155
|
-
inBlockComment = true;
|
|
156
|
-
index++;
|
|
157
|
-
continue;
|
|
158
|
-
}
|
|
159
|
-
if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
|
|
160
|
-
quote = currentChar;
|
|
161
|
-
continue;
|
|
162
|
-
}
|
|
163
|
-
if (currentChar === "{") {
|
|
164
|
-
depth++;
|
|
165
|
-
continue;
|
|
166
|
-
}
|
|
167
|
-
if (currentChar === "}") {
|
|
168
|
-
depth--;
|
|
169
|
-
if (depth === 0) return {
|
|
170
|
-
nextIndex: index + 1,
|
|
171
|
-
objectText: content.slice(objectStartIndex, index + 1)
|
|
172
|
-
};
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
//#endregion
|
|
177
85
|
//#region ../runner/src/gitState.ts
|
|
178
86
|
function runGitCommand(workspaceRoot, args) {
|
|
179
87
|
const result = spawnSync("git", args, {
|
|
@@ -316,7 +224,10 @@ function upsertFinishedCase(runState, caseDetail, caseRow) {
|
|
|
316
224
|
function applyChildEvalMetas(evals, childMetas) {
|
|
317
225
|
for (const childMeta of childMetas) {
|
|
318
226
|
const evalMeta = evals.get(childMeta.id);
|
|
319
|
-
if (evalMeta === void 0)
|
|
227
|
+
if (evalMeta === void 0) {
|
|
228
|
+
evals.set(childMeta.id, childMeta);
|
|
229
|
+
continue;
|
|
230
|
+
}
|
|
320
231
|
evalMeta.columnDefs = childMeta.columnDefs;
|
|
321
232
|
evalMeta.caseCount = childMeta.caseCount;
|
|
322
233
|
evalMeta.stats = childMeta.stats;
|
|
@@ -728,8 +639,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
728
639
|
workspaceRoot,
|
|
729
640
|
runDir,
|
|
730
641
|
manifest,
|
|
731
|
-
summary,
|
|
732
|
-
evals: getSortedEvalMetas()
|
|
642
|
+
summary
|
|
733
643
|
};
|
|
734
644
|
await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));
|
|
735
645
|
startRunChild({
|
|
@@ -1059,8 +969,8 @@ async function commandApp(args) {
|
|
|
1059
969
|
const { serve } = await import("@hono/node-server");
|
|
1060
970
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1061
971
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1062
|
-
const appModule = await import("./app-DKWm1oxc.mjs");
|
|
1063
|
-
const runnerModule = await import("./runner-Dx1sMCbh.mjs");
|
|
972
|
+
const appModule = await import("./app-B7FUWsVm.mjs");
|
|
973
|
+
const runnerModule = await import("./runner-Dt-Ynv6s.mjs");
|
|
1064
974
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1065
975
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1066
976
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2720,6 +2720,14 @@ type EvalCaseScope = {
|
|
|
2720
2720
|
caseCacheRefs: TraceCacheRef[]; /** Background promises that should settle before the case scope finalizes. */
|
|
2721
2721
|
pendingBackgroundJobs: Set<Promise<unknown>>;
|
|
2722
2722
|
};
|
|
2723
|
+
/**
|
|
2724
|
+
* Runtime phase currently owned by the eval runner.
|
|
2725
|
+
*
|
|
2726
|
+
* `null` means the current async execution is outside an eval run. `env`
|
|
2727
|
+
* covers run-time module/environment loading, including top-level code in
|
|
2728
|
+
* modules imported while a run is being prepared.
|
|
2729
|
+
*/
|
|
2730
|
+
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
2723
2731
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
2724
2732
|
declare class EvalAssertionError extends Error {
|
|
2725
2733
|
constructor(message: string);
|
|
@@ -2727,12 +2735,14 @@ declare class EvalAssertionError extends Error {
|
|
|
2727
2735
|
/** Return the current eval scope for the active async context, if any. */
|
|
2728
2736
|
declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
2729
2737
|
/**
|
|
2730
|
-
* Return
|
|
2738
|
+
* Return the current eval runner phase for this async execution.
|
|
2731
2739
|
*
|
|
2732
|
-
*
|
|
2733
|
-
*
|
|
2740
|
+
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
2741
|
+
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
2742
|
+
* case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
|
|
2743
|
+
* while validating outputs, and `scorer` while computing scores.
|
|
2734
2744
|
*/
|
|
2735
|
-
declare function isInEvalScope():
|
|
2745
|
+
declare function isInEvalScope(): EvalRuntimeScope | null;
|
|
2736
2746
|
/**
|
|
2737
2747
|
* Register background work that should settle before eval finalization.
|
|
2738
2748
|
*
|
|
@@ -2762,8 +2772,18 @@ type RunInEvalScopeOptions = {
|
|
|
2762
2772
|
/** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
|
|
2763
2773
|
idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
|
|
2764
2774
|
cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
|
|
2765
|
-
waitForBackgroundJobs?: boolean;
|
|
2775
|
+
waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
|
|
2776
|
+
runtimeScope?: EvalRuntimeScope;
|
|
2766
2777
|
};
|
|
2778
|
+
/** Execute a callback while `isInEvalScope()` reports a runner phase. */
|
|
2779
|
+
declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
2780
|
+
/**
|
|
2781
|
+
* Execute a callback with an existing case scope and a specific runner phase.
|
|
2782
|
+
*
|
|
2783
|
+
* Runner-internal helper for post-execute phases that still need access to the
|
|
2784
|
+
* completed case scope through output, trace, assertion, and input helpers.
|
|
2785
|
+
*/
|
|
2786
|
+
declare function runInExistingEvalScope<T>(scope: EvalCaseScope, runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
2767
2787
|
/**
|
|
2768
2788
|
* Execute a callback inside a fresh eval case scope and capture its outputs,
|
|
2769
2789
|
* trace data, and terminal error state.
|
|
@@ -3135,4 +3155,4 @@ declare function createRunner({
|
|
|
3135
3155
|
*/
|
|
3136
3156
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3137
3157
|
//#endregion
|
|
3138
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, 
type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, 
runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3158
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type 
ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, 
runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-CMiCEQ-3.mjs";
|
|
3
|
-
import "./src-BgGL7DDp.mjs";
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, 
traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-B-sCTyz8.mjs";
|
|
3
|
+
import "./src-jaOlXwb5.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, 
traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
import { F as
|
|
1
|
+
import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { relative } from "node:path";
|
|
4
5
|
import { z } from "zod/v4";
|
|
6
|
+
import { glob } from "glob";
|
|
5
7
|
//#region ../runner/src/runChild.ts
|
|
6
8
|
const evalMetaSchema = z.object({
|
|
7
9
|
id: z.string(),
|
|
@@ -20,7 +22,7 @@ const runChildContextSchema = z.object({
|
|
|
20
22
|
runDir: z.string(),
|
|
21
23
|
manifest: runManifestSchema,
|
|
22
24
|
summary: runSummarySchema,
|
|
23
|
-
evals: z.array(evalMetaSchema)
|
|
25
|
+
evals: z.array(evalMetaSchema).optional()
|
|
24
26
|
});
|
|
25
27
|
function sendMessage(message) {
|
|
26
28
|
if (process.send === void 0) return;
|
|
@@ -37,6 +39,38 @@ function getTargetEvals(params) {
|
|
|
37
39
|
if (params.request.target.evalIds && params.request.target.evalIds.length > 0) return params.request.target.evalIds.map((id) => params.evals.get(id)).filter((entry) => entry !== void 0);
|
|
38
40
|
return [...params.evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
39
41
|
}
|
|
42
|
+
function toWorkspaceRelativePath(params) {
|
|
43
|
+
return relative(params.workspaceRoot, params.filePath).replaceAll("\\", "/");
|
|
44
|
+
}
|
|
45
|
+
async function discoverRunEvals(params) {
|
|
46
|
+
const discovered = [];
|
|
47
|
+
for (const pattern of params.config.include) {
|
|
48
|
+
const files = await glob(pattern, {
|
|
49
|
+
cwd: params.workspaceRoot,
|
|
50
|
+
absolute: true
|
|
51
|
+
});
|
|
52
|
+
discovered.push(...files);
|
|
53
|
+
}
|
|
54
|
+
const evals = /* @__PURE__ */ new Map();
|
|
55
|
+
for (const filePath of discovered) {
|
|
56
|
+
const source = await readFile(filePath, "utf-8");
|
|
57
|
+
const sourceFingerprint = getSourceFingerprint(source);
|
|
58
|
+
const metas = parseEvalMetas(filePath, source);
|
|
59
|
+
for (const meta of metas) evals.set(meta.id, {
|
|
60
|
+
id: meta.id,
|
|
61
|
+
title: meta.title,
|
|
62
|
+
filePath: toWorkspaceRelativePath({
|
|
63
|
+
filePath: meta.filePath,
|
|
64
|
+
workspaceRoot: params.workspaceRoot
|
|
65
|
+
}),
|
|
66
|
+
sourceFilePath: meta.filePath,
|
|
67
|
+
sourceFingerprint,
|
|
68
|
+
columnDefs: [],
|
|
69
|
+
caseCount: null
|
|
70
|
+
});
|
|
71
|
+
}
|
|
72
|
+
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
73
|
+
}
|
|
40
74
|
async function readContext(contextPath) {
|
|
41
75
|
if (contextPath === void 0) throw new Error("Missing run child context path");
|
|
42
76
|
return runChildContextSchema.parse(JSON.parse(await readFile(contextPath, "utf-8")));
|
|
@@ -54,7 +88,11 @@ async function main() {
|
|
|
54
88
|
maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
|
|
55
89
|
maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
|
|
56
90
|
});
|
|
57
|
-
const
|
|
91
|
+
const evalMetas = await discoverRunEvals({
|
|
92
|
+
config,
|
|
93
|
+
workspaceRoot: context.workspaceRoot
|
|
94
|
+
});
|
|
95
|
+
const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.id, evalMeta]));
|
|
58
96
|
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
59
97
|
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
60
98
|
await executeRun({
|
|
@@ -46,7 +46,9 @@ function repoFile(path, mimeType) {
|
|
|
46
46
|
//#endregion
|
|
47
47
|
//#region ../sdk/src/runtime.ts
|
|
48
48
|
const scopeStorage = new AsyncLocalStorage();
|
|
49
|
+
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
49
50
|
let activeEvalScopeCount = 0;
|
|
51
|
+
let activeEvalRuntimeScopeCount = 0;
|
|
50
52
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
51
53
|
var EvalAssertionError = class extends Error {
|
|
52
54
|
constructor(message) {
|
|
@@ -60,13 +62,16 @@ function getCurrentScope() {
|
|
|
60
62
|
return scopeStorage.getStore();
|
|
61
63
|
}
|
|
62
64
|
/**
 * Report which eval runner phase, if any, owns the current async execution.
 *
 * The result is `null` when no runtime scope is active in this process, or
 * when the current async context carries no runtime scope value. Otherwise it
 * is the phase label the runner stored for this context: `env` while eval
 * modules are being loaded for a run, `cases` while cases are generated,
 * `eval` while a case `execute` runs, `derive` while outputs are derived from
 * traces, `outputsSchema` while outputs are validated, and `scorer` while
 * scores are computed.
 */
function isInEvalScope() {
	// Skip the async-storage lookup entirely while no runtime scope is open.
	const hasActiveRuntimeScope = activeEvalRuntimeScopeCount !== 0;
	return hasActiveRuntimeScope ? runtimeScopeStorage.getStore() ?? null : null;
}
|
|
71
76
|
function registerBackgroundJobInScope(scope, promise) {
|
|
72
77
|
const trackedPromise = promise.then(() => {
|
|
@@ -122,6 +127,31 @@ function getEvalCaseInput(path = void 0) {
|
|
|
122
127
|
/**
 * Attach a cache context to a case scope.
 *
 * @param scope Scope object that receives the `cacheContext` property.
 * @param context Value stored on the scope as-is.
 */
function setScopeCacheContext(scope, context) {
	scope.cacheContext = context;
}
|
|
130
|
+
/**
 * Run `fn` so that `isInEvalScope()` reports `runtimeScope` inside it.
 *
 * @param runtimeScope Phase label stored for the duration of the callback.
 * @param fn Callback executed inside the runtime scope storage.
 * @returns Whatever `fn` resolves to; rejections propagate to the caller.
 */
async function runInEvalRuntimeScope(runtimeScope, fn) {
	activeEvalRuntimeScopeCount += 1;
	try {
		const outcome = await runtimeScopeStorage.run(runtimeScope, fn);
		return outcome;
	} finally {
		// Release the counter on success and failure alike, so the fast path in
		// `isInEvalScope()` applies again once no runtime scope is open.
		activeEvalRuntimeScopeCount -= 1;
	}
}
|
|
139
|
+
/**
 * Run `fn` with an already-created case scope and a given runner phase.
 *
 * Runner-internal helper for the phases that follow `execute`: they still need
 * the completed case scope to be reachable through the output, trace,
 * assertion, and input helpers, but must report their own phase.
 *
 * @param scope Existing case scope to expose to the callback.
 * @param runtimeScope Phase label reported while the callback runs.
 * @param fn Callback to execute.
 * @returns Whatever `fn` resolves to; rejections propagate to the caller.
 */
async function runInExistingEvalScope(scope, runtimeScope, fn) {
	// The phase is entered inside the case scope, so both stores are set for `fn`.
	const runWithPhase = async () => {
		return await runInEvalRuntimeScope(runtimeScope, fn);
	};
	activeEvalScopeCount += 1;
	try {
		return await scopeStorage.run(scope, runWithPhase);
	} finally {
		activeEvalScopeCount -= 1;
	}
}
|
|
125
155
|
/**
|
|
126
156
|
* Execute a callback inside a fresh eval case scope and capture its outputs,
|
|
127
157
|
* trace data, and terminal error state.
|
|
@@ -144,29 +174,24 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
144
174
|
caseCacheRefs: [],
|
|
145
175
|
pendingBackgroundJobs: /* @__PURE__ */ new Set()
|
|
146
176
|
};
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
}
|
|
166
|
-
});
|
|
167
|
-
} finally {
|
|
168
|
-
activeEvalScopeCount--;
|
|
169
|
-
}
|
|
177
|
+
return await runInExistingEvalScope(scope, options.runtimeScope ?? "eval", async () => {
|
|
178
|
+
try {
|
|
179
|
+
const result = await fn();
|
|
180
|
+
if (options.waitForBackgroundJobs !== false) await drainBackgroundJobs(scope);
|
|
181
|
+
return {
|
|
182
|
+
result,
|
|
183
|
+
scope,
|
|
184
|
+
error: void 0
|
|
185
|
+
};
|
|
186
|
+
} catch (error) {
|
|
187
|
+
if (options.waitForBackgroundJobs !== false) await drainBackgroundJobs(scope);
|
|
188
|
+
return {
|
|
189
|
+
result: void 0,
|
|
190
|
+
scope,
|
|
191
|
+
error: error instanceof Error ? error : new Error(String(error))
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
});
|
|
170
195
|
}
|
|
171
196
|
/**
|
|
172
197
|
* Return the next deterministic ID for the active eval case execution.
|
|
@@ -4886,6 +4911,98 @@ async function loadConfig() {
|
|
|
4886
4911
|
}
|
|
4887
4912
|
}
|
|
4888
4913
|
//#endregion
|
|
4914
|
+
//#region ../runner/src/discovery.ts
|
|
4915
|
+
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
/** Marker searched for in eval source files. */
const defineEvalMarker = "defineEval";
/**
 * Yield the index of every character of `content`, from `startIndex` on, that
 * is code rather than part of a comment or a quoted string.
 *
 * This is a lightweight lexer, not a parser: it tracks line comments, block
 * comments, and text quoted with `"`, `'`, or a backtick (honouring backslash
 * escapes). Comment openers and quote delimiters themselves are not yielded.
 * Regex literals and template-literal interpolations are not understood.
 */
function* iterateCodeIndexes(content, startIndex) {
	let quote;
	let inBlockComment = false;
	let inLineComment = false;
	let isEscaped = false;
	for (let index = startIndex; index < content.length; index++) {
		const currentChar = content[index];
		const nextChar = content[index + 1];
		if (inLineComment) {
			if (currentChar === "\n") inLineComment = false;
			continue;
		}
		if (inBlockComment) {
			if (currentChar === "*" && nextChar === "/") {
				inBlockComment = false;
				index++;
			}
			continue;
		}
		if (quote) {
			if (isEscaped) isEscaped = false;
			else if (currentChar === "\\") isEscaped = true;
			else if (currentChar === quote) quote = void 0;
			continue;
		}
		if (currentChar === "/" && nextChar === "/") {
			inLineComment = true;
			index++;
			continue;
		}
		if (currentChar === "/" && nextChar === "*") {
			inBlockComment = true;
			index++;
			continue;
		}
		if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
			quote = currentChar;
			continue;
		}
		yield index;
	}
}
/**
 * Find the first match of `propertyRegex` that starts at the top level of an
 * object literal, i.e. directly inside its outermost braces and outside any
 * string or comment.
 *
 * @param objectText Source text of the object literal, starting at its `{`.
 * @param propertyRegex Regex whose first capture group is the property value.
 * @returns The captured value, or `undefined` when no top-level match exists.
 */
function findTopLevelStringProperty(objectText, propertyRegex) {
	// A sticky copy lets the pattern be tested at one exact position at a time.
	const stickyRegex = new RegExp(propertyRegex.source, "y");
	let depth = 0;
	for (const index of iterateCodeIndexes(objectText, 0)) {
		const currentChar = objectText[index];
		if (currentChar === "{" || currentChar === "[" || currentChar === "(") {
			depth++;
			continue;
		}
		if (currentChar === "}" || currentChar === "]" || currentChar === ")") {
			depth--;
			continue;
		}
		if (depth !== 1) continue;
		stickyRegex.lastIndex = index;
		const value = stickyRegex.exec(objectText)?.[1];
		if (value !== void 0) return value;
	}
}
/**
 * Read a quoted property value from a `defineEval` object literal.
 *
 * A top-level match wins, so nested or commented-out properties (for example
 * case ids inside `cases`) do not shadow the eval's own property. When there
 * is no top-level match, the first match anywhere in the text is used.
 */
function readEvalStringProperty(objectText, propertyRegex) {
	return findTopLevelStringProperty(objectText, propertyRegex) ?? propertyRegex.exec(objectText)?.[1];
}
/**
 * Statically extract eval metadata from the source text of an eval file.
 *
 * Scans for every `defineEval` occurrence, extracts the object literal that
 * follows it, and records the quoted `id` (and `title`, when present) found
 * in that object. Occurrences without a quoted `id` are ignored.
 *
 * @param filePath Path copied onto each returned entry as `filePath`.
 * @param content Source text to scan.
 * @returns One `{ filePath, id, title? }` entry per recognised eval, in
 * source order.
 */
function parseEvalMetas(filePath, content) {
	const metas = [];
	let searchIndex = 0;
	while (searchIndex < content.length) {
		const defineEvalIndex = content.indexOf(defineEvalMarker, searchIndex);
		if (defineEvalIndex === -1) break;
		const extracted = extractDefineEvalObject(content, defineEvalIndex);
		if (!extracted) {
			// No balanced object follows this occurrence; resume just past the marker.
			searchIndex = defineEvalIndex + defineEvalMarker.length;
			continue;
		}
		const id = readEvalStringProperty(extracted.objectText, evalIdMatchRegex);
		if (id !== void 0) {
			const meta = {
				filePath,
				id
			};
			const title = readEvalStringProperty(extracted.objectText, evalTitleMatchRegex);
			if (title !== void 0) meta.title = title;
			metas.push(meta);
		}
		searchIndex = extracted.nextIndex;
	}
	return metas;
}
/**
 * Extract the first brace-balanced object literal after a `defineEval`
 * occurrence.
 *
 * The object starts at the first `{` that follows the first `(` found at or
 * after `defineEvalIndex`. Braces inside strings and comments are ignored.
 *
 * @param content Source text to scan.
 * @param defineEvalIndex Index of the `defineEval` occurrence.
 * @returns `{ nextIndex, objectText }`, where `nextIndex` is the index just
 * past the closing brace, or `undefined` when no `(`, no `{`, or no matching
 * closing brace is found.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
	const openParenIndex = content.indexOf("(", defineEvalIndex);
	if (openParenIndex === -1) return void 0;
	const objectStartIndex = content.indexOf("{", openParenIndex);
	if (objectStartIndex === -1) return void 0;
	let depth = 0;
	for (const index of iterateCodeIndexes(content, objectStartIndex)) {
		const currentChar = content[index];
		if (currentChar === "{") {
			depth++;
			continue;
		}
		if (currentChar !== "}") continue;
		depth--;
		if (depth === 0) return {
			nextIndex: index + 1,
			objectText: content.slice(objectStartIndex, index + 1)
		};
	}
	return void 0;
}
|
|
5005
|
+
//#endregion
|
|
4889
5006
|
//#region ../runner/src/evalModuleLoader.ts
|
|
4890
5007
|
/**
|
|
4891
5008
|
* Import one eval module with a cache key derived from its current source so
|
|
@@ -5280,6 +5397,7 @@ const isolationParam = "agent-evals-isolate";
|
|
|
5280
5397
|
const pathSegmentSeparatorPattern = /[\\/]+/;
|
|
5281
5398
|
const isolationStorage = new AsyncLocalStorage();
|
|
5282
5399
|
const activeIsolationRoots = /* @__PURE__ */ new Map();
|
|
5400
|
+
const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
|
|
5283
5401
|
let hooksRegistered = false;
|
|
5284
5402
|
const requireFromRunner = createRequire(import.meta.url);
|
|
5285
5403
|
const agentPackageUrlBySpecifier = new Map([
|
|
@@ -5305,7 +5423,10 @@ function getIsolationKeyFromParent(parentURL) {
|
|
|
5305
5423
|
}
|
|
5306
5424
|
/**
 * Decide whether a URL points at a workspace file.
 *
 * Only `file:` URLs qualify; they are converted to a file system path and
 * delegated to `isWorkspaceFilePath`.
 *
 * @param url URL object to classify.
 * @param workspaceRoot Workspace root passed through to the path check.
 * @returns `false` for non-`file:` URLs, otherwise the path check's result.
 */
function isWorkspaceFile(url, workspaceRoot) {
	const isFileUrl = url.protocol === "file:";
	return isFileUrl ? isWorkspaceFilePath(fileURLToPath(url), workspaceRoot) : false;
}
|
|
5428
|
+
function isWorkspaceFilePath(filePath, workspaceRoot) {
|
|
5429
|
+
const relativePath = relative(workspaceRoot, filePath);
|
|
5309
5430
|
if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
|
|
5310
5431
|
const segments = relativePath.split(pathSegmentSeparatorPattern);
|
|
5311
5432
|
return !segments.includes("node_modules") && !segments.includes(".agent-evals");
|
|
@@ -5340,15 +5461,23 @@ function registerModuleIsolationHooks() {
|
|
|
5340
5461
|
};
|
|
5341
5462
|
} });
|
|
5342
5463
|
}
|
|
5464
|
+
/**
 * Drop workspace files from the runner's `require.cache`, at most once per
 * isolation key.
 *
 * @param context Isolation context; `key` identifies the run and
 * `workspaceRoot` selects which cached paths are removed.
 */
function clearWorkspaceRequireCacheOnce(context) {
	const isolationKey = context.key;
	if (clearedRequireCacheKeys.has(isolationKey)) return;
	// Record the key before clearing so later calls for this key are no-ops.
	clearedRequireCacheKeys.add(isolationKey);
	const requireCache = requireFromRunner.cache;
	for (const cachedPath of Object.keys(requireCache)) {
		if (!isWorkspaceFilePath(cachedPath, context.workspaceRoot)) continue;
		delete requireCache[cachedPath];
	}
}
|
|
5343
5469
|
/**
 * Run module loading and eval code against fresh copies of workspace modules.
 *
 * Node offers no API to reset the ESM cache, so the runner adds a run-scoped
 * query parameter to workspace file imports instead. CommonJS modules reached
 * through ESM imports live in `require.cache`, so workspace entries there are
 * cleared once per run. Package imports are deliberately untouched so SDK
 * singletons, such as the eval registry, stay shared.
 *
 * @param context Isolation context providing `key` and `workspaceRoot`.
 * @param fn Callback executed inside the isolation storage.
 * @returns Whatever `fn` resolves to.
 */
async function runWithModuleIsolation(context, fn) {
	registerModuleIsolationHooks();
	clearWorkspaceRequireCacheOnce(context);
	const { key, workspaceRoot } = context;
	activeIsolationRoots.set(key, workspaceRoot);
	const outcome = await isolationStorage.run(context, fn);
	return outcome;
}
|
|
@@ -5476,20 +5605,26 @@ async function runCase(params) {
|
|
|
5476
5605
|
const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
|
|
5477
5606
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5478
5607
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5479
|
-
if (!nonAssertError && evalDef.deriveFromTracing)
|
|
5480
|
-
const
|
|
5481
|
-
|
|
5482
|
-
|
|
5483
|
-
|
|
5484
|
-
|
|
5485
|
-
|
|
5486
|
-
|
|
5487
|
-
|
|
5488
|
-
|
|
5489
|
-
|
|
5608
|
+
if (!nonAssertError && evalDef.deriveFromTracing) {
|
|
5609
|
+
const { deriveFromTracing } = evalDef;
|
|
5610
|
+
try {
|
|
5611
|
+
const derived = await runInExistingEvalScope(scope, "derive", async () => {
|
|
5612
|
+
return await callWithUnknownResult(deriveFromTracing, [{
|
|
5613
|
+
trace: traceTree,
|
|
5614
|
+
input: evalCase.input,
|
|
5615
|
+
case: evalCase
|
|
5616
|
+
}]);
|
|
5617
|
+
});
|
|
5618
|
+
if (!isRecord(derived)) throw new Error("deriveFromTracing must return an object");
|
|
5619
|
+
for (const [key, value] of Object.entries(derived)) if (!(key in scope.outputs)) scope.outputs[key] = value;
|
|
5620
|
+
} catch (e) {
|
|
5621
|
+
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
5622
|
+
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
5623
|
+
}
|
|
5490
5624
|
}
|
|
5491
5625
|
if (!nonAssertError && evalDef.outputsSchema) {
|
|
5492
|
-
const
|
|
5626
|
+
const { outputsSchema } = evalDef;
|
|
5627
|
+
const parsedOutputs = await runInExistingEvalScope(scope, "outputsSchema", () => outputsSchema.safeParse(getOutputsSchemaInput(outputsSchema, scope.outputs)));
|
|
5493
5628
|
if (parsedOutputs.success) scope.outputs = {
|
|
5494
5629
|
...scope.outputs,
|
|
5495
5630
|
...parsedOutputs.data
|
|
@@ -5511,6 +5646,7 @@ async function runCase(params) {
|
|
|
5511
5646
|
}, {
|
|
5512
5647
|
input: evalCase.input,
|
|
5513
5648
|
idPrefix: `${scopedIdPrefix}-score-${toStableIdSegment(key)}`,
|
|
5649
|
+
runtimeScope: "scorer",
|
|
5514
5650
|
cacheContext: cacheAdapter ? {
|
|
5515
5651
|
adapter: cacheAdapter,
|
|
5516
5652
|
mode: cacheMode,
|
|
@@ -5791,12 +5927,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5791
5927
|
} catch {
|
|
5792
5928
|
codeFingerprint = "";
|
|
5793
5929
|
}
|
|
5794
|
-
if (codeFingerprint.length > 0)
|
|
5795
|
-
|
|
5930
|
+
if (codeFingerprint.length > 0) {
|
|
5931
|
+
runState.manifest.evalSourceFingerprints[evalMeta.id] = codeFingerprint;
|
|
5932
|
+
evalMeta.sourceFingerprint = codeFingerprint;
|
|
5933
|
+
} else {
|
|
5934
|
+
delete runState.manifest.evalSourceFingerprints[evalMeta.id];
|
|
5935
|
+
evalMeta.sourceFingerprint = null;
|
|
5936
|
+
}
|
|
5796
5937
|
try {
|
|
5797
5938
|
const registry = getEvalRegistry();
|
|
5798
5939
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5799
|
-
await
|
|
5940
|
+
await runInEvalRuntimeScope("env", async () => {
|
|
5941
|
+
await loadEvalModule(evalFilePath, codeFingerprint);
|
|
5942
|
+
});
|
|
5800
5943
|
});
|
|
5801
5944
|
const entry = registry.get(evalMeta.id);
|
|
5802
5945
|
if (!entry) {
|
|
@@ -5807,87 +5950,89 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5807
5950
|
continue;
|
|
5808
5951
|
}
|
|
5809
5952
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5810
|
-
await
|
|
5811
|
-
|
|
5812
|
-
|
|
5813
|
-
|
|
5814
|
-
|
|
5815
|
-
|
|
5816
|
-
|
|
5817
|
-
|
|
5818
|
-
|
|
5819
|
-
|
|
5820
|
-
|
|
5821
|
-
|
|
5822
|
-
|
|
5823
|
-
|
|
5824
|
-
|
|
5825
|
-
|
|
5826
|
-
|
|
5827
|
-
|
|
5828
|
-
|
|
5829
|
-
|
|
5830
|
-
|
|
5831
|
-
preparedEvals.push(preparedEval);
|
|
5832
|
-
for (const evalCase of cases) {
|
|
5833
|
-
const trialResults = [];
|
|
5834
|
-
const preparedCase = {
|
|
5835
|
-
caseId: evalCase.id,
|
|
5836
|
-
trialResults,
|
|
5837
|
-
finalized: false
|
|
5953
|
+
await runInEvalRuntimeScope("cases", async () => {
|
|
5954
|
+
await entry.use(async (evalDef) => {
|
|
5955
|
+
const cases = filterEvalCases(resolveRunnableEvalCases({
|
|
5956
|
+
cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
|
|
5957
|
+
evalId: evalMeta.id
|
|
5958
|
+
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
5959
|
+
runState.summary.totalCases += cases.length;
|
|
5960
|
+
const accumulatedColumns = /* @__PURE__ */ new Map();
|
|
5961
|
+
const evalCaseRows = [];
|
|
5962
|
+
const preparedCases = [];
|
|
5963
|
+
const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
|
|
5964
|
+
const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
|
|
5965
|
+
const preparedEval = {
|
|
5966
|
+
evalMeta,
|
|
5967
|
+
accumulatedColumns,
|
|
5968
|
+
evalCaseRows,
|
|
5969
|
+
preparedCases,
|
|
5970
|
+
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
5971
|
+
mergeColumns: (columns) => {
|
|
5972
|
+
mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
5973
|
+
}
|
|
5838
5974
|
};
|
|
5839
|
-
|
|
5840
|
-
for (
|
|
5841
|
-
const
|
|
5842
|
-
|
|
5843
|
-
|
|
5844
|
-
|
|
5845
|
-
|
|
5846
|
-
|
|
5847
|
-
|
|
5848
|
-
|
|
5849
|
-
|
|
5850
|
-
|
|
5851
|
-
|
|
5852
|
-
|
|
5853
|
-
|
|
5854
|
-
moduleIsolation,
|
|
5855
|
-
evalFilePath,
|
|
5856
|
-
workspaceRoot,
|
|
5857
|
-
artifactDir: join(runDir, "artifacts"),
|
|
5858
|
-
runId: runState.manifest.id
|
|
5859
|
-
});
|
|
5860
|
-
return {
|
|
5861
|
-
caseDetail,
|
|
5862
|
-
caseRow: {
|
|
5863
|
-
caseId: evalCase.id,
|
|
5975
|
+
preparedEvals.push(preparedEval);
|
|
5976
|
+
for (const evalCase of cases) {
|
|
5977
|
+
const trialResults = [];
|
|
5978
|
+
const preparedCase = {
|
|
5979
|
+
caseId: evalCase.id,
|
|
5980
|
+
trialResults,
|
|
5981
|
+
finalized: false
|
|
5982
|
+
};
|
|
5983
|
+
preparedCases.push(preparedCase);
|
|
5984
|
+
for (let trial = 0; trial < request.trials; trial++) {
|
|
5985
|
+
const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
|
|
5986
|
+
queuedCases.push({
|
|
5987
|
+
execute: async ({ startTime, globalTraceDisplay }) => {
|
|
5988
|
+
const { caseDetail, caseRowUpdate } = await runCase({
|
|
5989
|
+
evalDef,
|
|
5864
5990
|
evalId: evalMeta.id,
|
|
5865
|
-
|
|
5866
|
-
|
|
5867
|
-
|
|
5868
|
-
|
|
5869
|
-
|
|
5870
|
-
|
|
5871
|
-
|
|
5872
|
-
|
|
5873
|
-
|
|
5874
|
-
|
|
5875
|
-
|
|
5876
|
-
|
|
5877
|
-
|
|
5878
|
-
|
|
5879
|
-
|
|
5880
|
-
|
|
5881
|
-
|
|
5882
|
-
|
|
5883
|
-
|
|
5884
|
-
|
|
5885
|
-
|
|
5886
|
-
|
|
5887
|
-
|
|
5888
|
-
|
|
5991
|
+
evalCase,
|
|
5992
|
+
globalTraceDisplay,
|
|
5993
|
+
trial,
|
|
5994
|
+
startTime,
|
|
5995
|
+
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
5996
|
+
cacheMode,
|
|
5997
|
+
codeFingerprint,
|
|
5998
|
+
moduleIsolation,
|
|
5999
|
+
evalFilePath,
|
|
6000
|
+
workspaceRoot,
|
|
6001
|
+
artifactDir: join(runDir, "artifacts"),
|
|
6002
|
+
runId: runState.manifest.id
|
|
6003
|
+
});
|
|
6004
|
+
return {
|
|
6005
|
+
caseDetail,
|
|
6006
|
+
caseRow: {
|
|
6007
|
+
caseId: evalCase.id,
|
|
6008
|
+
evalId: evalMeta.id,
|
|
6009
|
+
status: caseRowUpdate.status ?? "pending",
|
|
6010
|
+
latencyMs: caseRowUpdate.latencyMs ?? null,
|
|
6011
|
+
columns: caseRowUpdate.columns ?? {},
|
|
6012
|
+
trial
|
|
6013
|
+
}
|
|
6014
|
+
};
|
|
6015
|
+
},
|
|
6016
|
+
onComplete: async ({ caseDetail, caseRow }) => {
|
|
6017
|
+
trialResults.push({
|
|
6018
|
+
caseDetail,
|
|
6019
|
+
caseRow,
|
|
6020
|
+
bufferedCacheStore
|
|
6021
|
+
});
|
|
6022
|
+
if (trialResults.length !== request.trials) return;
|
|
6023
|
+
await finalizePreparedCase({
|
|
6024
|
+
runState,
|
|
6025
|
+
runDir,
|
|
6026
|
+
preparedEval,
|
|
6027
|
+
preparedCase,
|
|
6028
|
+
onCaseFinished,
|
|
6029
|
+
emitEvent
|
|
6030
|
+
});
|
|
6031
|
+
}
|
|
6032
|
+
});
|
|
6033
|
+
}
|
|
5889
6034
|
}
|
|
5890
|
-
}
|
|
6035
|
+
});
|
|
5891
6036
|
});
|
|
5892
6037
|
});
|
|
5893
6038
|
} catch (error) {
|
|
@@ -5995,4 +6140,4 @@ function toLastRunStatus(status) {
|
|
|
5995
6140
|
return status === "pending" ? null : status;
|
|
5996
6141
|
}
|
|
5997
6142
|
//#endregion
|
|
5998
|
-
export {
|
|
6143
|
+
export { caseRowSchema as $, appendToEvalOutput as $t, getEvalTitle as A, traceDisplayConfigSchema as At, apiCallMetricFormatSchema as B, fileRefSchema as Bt, createRunRequestSchema as C, serializedCacheSpanSchema as Ct, extractApiCalls as D, traceAttributeDisplayInputSchema as Dt, extractCacheHits as E, traceAttributeDisplayFormatSchema as Et, runManifestSchema as F, traceSpanWarningSchema as Ft, llmCallMetricPlacementSchema as G, z$1 as Gt, apiCallMetricSchema as H, numberDisplayOptionsSchema as Ht, runSummarySchema as I, cellValueSchema as It, resolveApiCallsConfig as J, evalSpan as Jt, llmCallMetricSchema as K, buildTraceTree as Kt, DEFAULT_API_CALLS_CONFIG as L, columnDefSchema as Lt, deriveScopedSummaryFromCases as M, traceSpanErrorSchema as Mt, deriveStatusFromCaseRows as N, traceSpanKindSchema as Nt, extractLlmCalls as O, traceAttributeDisplayPlacementSchema as Ot, deriveStatusFromChildStatuses as P, traceSpanSchema as Pt, caseDetailSchema as Q, EvalAssertionError as Qt, DEFAULT_LLM_CALLS_CONFIG as R, columnFormatSchema as Rt, createFsCacheStore as S, cacheStatusSchema as St, sseEnvelopeSchema as T, traceCacheRefSchema as Tt, apiCallsConfigSchema as U, repoFileRefSchema as Ut, apiCallMetricPlacementSchema as V, jsonCellSchema as Vt, llmCallMetricFormatSchema as W, runArtifactRefSchema as Wt, trialSelectionModeSchema as X, hashCacheKey as Xt, resolveLlmCallsConfig as Y, evalTracer as Yt, assertionFailureSchema as Z, hashCacheKeySync as Zt, loadEvalModule as _, cacheListItemSchema as _t, loadPersistedRunSnapshot as a, mergeEvalOutput as an, scoreTraceSchema as at, buildDeclaredColumnDefs as b, cacheRecordingOpSchema as bt, persistCaseDetail as c, runInEvalScope as cn, evalChartBuiltinMetricSchema as ct, recomputePersistedCaseStatus as d, setScopeCacheContext as dn, evalChartMetricSchema as dt, evalAssert as en, evalFreshnessStatusSchema as et, runTouchesEval as f, startEvalBackgroundJob as fn, evalChartTooltipExtraSchema as ft, setLatestRunInfoMap as g, 
cacheFileSchema as gt, getTargetEvalIds as h, getEvalRegistry as hn, cacheEntrySchema as ht, getLatestRunInfos as i, isInEvalScope as in, evalSummarySchema as it, getEvalDisplayStatus as j, traceDisplayInputConfigSchema as jt, getNestedAttribute as k, traceAttributeDisplaySchema as kt, persistRunState as l, runInExistingEvalScope as ln, evalChartColorSchema as lt, buildEvalSummary as m, defineEval as mn, evalChartsConfigSchema as mt, generateRunId as n, getEvalCaseInput as nn, evalStatItemSchema as nt, loadPersistedRunSnapshots as o, nextEvalId as on, evalChartAggregateSchema as ot, resolveArtifactPath as p, repoFile as pn, evalChartTypeSchema as pt, llmCallsConfigSchema as q, captureEvalSpanError as qt, getLastRunStatuses as r, incrementEvalOutput as rn, evalStatsConfigSchema as rt, nextShortIdFromSnapshots as s, runInEvalRuntimeScope as sn, evalChartAxisSchema as st, executeRun as t, getCurrentScope as tn, evalStatAggregateSchema as tt, recomputeEvalStatusesInRuns as u, setEvalOutput as un, evalChartConfigSchema as ut, parseEvalMetas as v, cacheModeSchema as vt, updateManualScoreRequestSchema as w, spanCacheOptionsSchema as wt, normalizeScoreDef as x, cacheRecordingSchema as xt, loadConfig as y, cacheOperationTypeSchema as yt, agentEvalsConfigSchema as z, columnKindSchema as zt };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-vunKoSBu.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-B-sCTyz8.mjs";
|
|
2
|
+
import "./src-jaOlXwb5.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.15.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
}
|
|
14
14
|
},
|
|
15
15
|
"files": [
|
|
16
|
-
"dist"
|
|
16
|
+
"dist",
|
|
17
|
+
"skills"
|
|
17
18
|
],
|
|
18
19
|
"tsdown": {
|
|
19
20
|
"clean": true,
|
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agent-eval
|
|
3
|
+
description: Create, run, and maintain TypeScript evals with @ls-stack/agent-eval. Use when adding eval coverage for an LLM or agent workflow, updating *.eval.ts files, checking eval results, configuring agent-evals.config.ts, inspecting saved .agent-evals run artifacts, or wiring product source code with evalTracer spans.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Agent Eval
|
|
7
|
+
|
|
8
|
+
Local-first, UI-first eval runner for LLM and agent systems. Evals are strict
|
|
9
|
+
TypeScript modules named `*.eval.ts`, discovered from `agent-evals.config.ts`,
|
|
10
|
+
and executed through the CLI (`agent-evals run`) or the web UI
|
|
11
|
+
(`agent-evals app`). Runs persist to `.agent-evals/` so results, traces, and
|
|
12
|
+
caches survive across processes.
|
|
13
|
+
|
|
14
|
+
This skill covers the mental model and conventions. For exhaustive field lists
|
|
15
|
+
(config options, eval shape, column formats, score/chart/stats options, trace
|
|
16
|
+
display rules), read the TypeScript declarations shipped with the package:
|
|
17
|
+
|
|
18
|
+
- `AgentEvalsConfig`, `EvalDefinition`, `EvalCase`, `EvalOutputs`,
|
|
19
|
+
`EvalColumnOverride`, `EvalScoreDef`, `EvalManualScoreDef`,
|
|
20
|
+
`EvalTraceTree`, `TraceSpanInfo`, and `z` are exported from
|
|
21
|
+
`@ls-stack/agent-eval`.
|
|
22
|
+
- `.d.ts` files land in `node_modules/@ls-stack/agent-eval/dist/`.
|
|
23
|
+
- CLI surface: `agent-evals --help` and `agent-evals <command> --help`.
|
|
24
|
+
Unknown help targets exit non-zero instead of falling back to global help.
|
|
25
|
+
- The CLI automatically loads `.env` from the current workspace. Shell-provided
|
|
26
|
+
environment variables win; pass `--no-env` to disable `.env` loading once.
|
|
27
|
+
|
|
28
|
+
Assume that enumerated tables in this document may lag behind the types —
|
|
29
|
+
treat the types as source of truth when they disagree.
|
|
30
|
+
|
|
31
|
+
## Where tracing lives
|
|
32
|
+
|
|
33
|
+
**Tracing belongs in the product source code, not in the eval file.** The eval
|
|
34
|
+
file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
|
|
35
|
+
inside the workflow, agent, or tool functions that both production and evals
|
|
36
|
+
invoke.
|
|
37
|
+
|
|
38
|
+
`evalTracer`, `evalSpan`, output helpers, and `evalAssert` are ambient no-ops
|
|
39
|
+
when called outside an eval case scope, so leaving them in production paths is
|
|
40
|
+
safe — they only record anything when the product code runs inside an eval's
|
|
41
|
+
`execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
|
|
42
|
+
(e.g. skip a real network side effect): it returns `null` outside eval-owned
|
|
43
|
+
work and returns `'env'`, `'cases'`, `'eval'`, `'derive'`, `'outputsSchema'`, or
|
|
44
|
+
`'scorer'` during runner phases. Top-level modules imported while a run is being
|
|
45
|
+
prepared see `'env'`; code called from `execute` sees `'eval'`. Use
|
|
46
|
+
`getEvalCaseInput()` to read the current case input, or
|
|
47
|
+
`getEvalCaseInput('customer.tier')` for nested dot-path access; outside a case
|
|
48
|
+
scope it returns `undefined`. Use `nextEvalId()` inside eval-scoped code when a
|
|
49
|
+
stable generated id is needed; it includes the eval file, eval id, case id, and
|
|
50
|
+
a per-case sequence number, and throws outside an eval case scope.
|
|
51
|
+
|
|
52
|
+
### Product code (instrumented once, reused everywhere)
|
|
53
|
+
|
|
54
|
+
```ts
|
|
55
|
+
// src/workflows/refundWorkflow.ts
|
|
56
|
+
import {
|
|
57
|
+
appendToEvalOutput,
|
|
58
|
+
captureEvalSpanError,
|
|
59
|
+
evalAssert,
|
|
60
|
+
evalSpan,
|
|
61
|
+
evalTracer,
|
|
62
|
+
getEvalCaseInput,
|
|
63
|
+
incrementEvalOutput,
|
|
64
|
+
mergeEvalOutput,
|
|
65
|
+
nextEvalId,
|
|
66
|
+
setEvalOutput,
|
|
67
|
+
startEvalBackgroundJob,
|
|
68
|
+
} from '@ls-stack/agent-eval';
|
|
69
|
+
|
|
70
|
+
export async function runRefundWorkflow(input: RefundInput) {
|
|
71
|
+
return evalTracer.span(
|
|
72
|
+
{ kind: 'agent', name: 'refund-workflow' },
|
|
73
|
+
async () => {
|
|
74
|
+
evalSpan.setAttribute('input', input);
|
|
75
|
+
|
|
76
|
+
const plan = await evalTracer.span(
|
|
77
|
+
{
|
|
78
|
+
kind: 'llm',
|
|
79
|
+
name: 'plan-refund',
|
|
80
|
+
cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
|
|
81
|
+
},
|
|
82
|
+
async () => {
|
|
83
|
+
let text: string;
|
|
84
|
+
let usage: { inputTokens: number; outputTokens: number };
|
|
85
|
+
let costUsd: number;
|
|
86
|
+
try {
|
|
87
|
+
({ text, usage, costUsd } = await llm.complete(input.message));
|
|
88
|
+
} catch (error) {
|
|
89
|
+
captureEvalSpanError(error);
|
|
90
|
+
({ text, usage, costUsd } = await llm.completeWithFallback(
|
|
91
|
+
input.message,
|
|
92
|
+
));
|
|
93
|
+
}
|
|
94
|
+
evalSpan.setAttributes({ model: 'gpt-4o-mini', usage });
|
|
95
|
+
const expectedLocale = getEvalCaseInput('locale');
|
|
96
|
+
if (typeof expectedLocale === 'string') {
|
|
97
|
+
evalSpan.setAttribute('expectedLocale', expectedLocale);
|
|
98
|
+
}
|
|
99
|
+
evalSpan.incrementAttribute('llmCalls', 1);
|
|
100
|
+
evalSpan.appendToAttribute('models', 'gpt-4o-mini');
|
|
101
|
+
incrementEvalOutput('costUsd', costUsd);
|
|
102
|
+
appendToEvalOutput('modelCalls', { model: 'gpt-4o-mini', costUsd });
|
|
103
|
+
return text;
|
|
104
|
+
},
|
|
105
|
+
);
|
|
106
|
+
|
|
107
|
+
const result = await applyRefund(plan);
|
|
108
|
+
const reviewId = nextEvalId();
|
|
109
|
+
setEvalOutput('response', result.finalText);
|
|
110
|
+
setEvalOutput('reviewId', reviewId);
|
|
111
|
+
mergeEvalOutput('metadata', { approved: result.approved });
|
|
112
|
+
evalAssert(result.approved, 'refund workflow should approve the case');
|
|
113
|
+
evalSpan.setAttribute('output', { result, reviewId });
|
|
114
|
+
return result;
|
|
115
|
+
},
|
|
116
|
+
);
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Span `kind` values are open-ended strings and are color-coded automatically in
|
|
121
|
+
the UI for every kind used during the app session. Use familiar kinds such as
|
|
122
|
+
`agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they
|
|
123
|
+
fit, and preserve external tracer kinds such as `mastra.workflow.step` when they
|
|
124
|
+
are more specific. The UI automatically promotes only the `input` and `output` span
|
|
125
|
+
attributes. Use `traceDisplay` for other span attributes such as `model`,
|
|
126
|
+
`usage`, or `costUsd`.
|
|
127
|
+
|
|
128
|
+
Use `captureEvalSpanError(error)` for recoverable errors on the active
|
|
129
|
+
`evalTracer.span(...)`, such as optional model/tool failures that fall back and
|
|
130
|
+
continue. You can pass one error, multiple error arguments, or an array. The
|
|
131
|
+
span is still marked `error`, and the UI renders captured errors in a dedicated
|
|
132
|
+
span detail block with timing relative to the span. Pass `'warning'` or
|
|
133
|
+
`{ level: 'warning' }` as the final argument for diagnostics that should be
|
|
134
|
+
visible in span detail without changing an otherwise successful span's status.
|
|
135
|
+
|
|
136
|
+
If a span callback throws, the SDK automatically marks that span as `error`,
|
|
137
|
+
stores the thrown error on it, and rethrows so the case errors. Use that for
|
|
138
|
+
terminal failures; use `captureEvalSpanError(...)` for recoverable failures that
|
|
139
|
+
continue through fallback logic.
|
|
140
|
+
|
|
141
|
+
Fire-and-forget spans started during `execute` are awaited before outputs,
|
|
142
|
+
`deriveFromTracing`, scores, and trace data are finalized, so `void
|
|
143
|
+
evalTracer.span(...)` is safe when the span result is not needed. Register
|
|
144
|
+
non-span promises with `startEvalBackgroundJob(promise)`. The runner only waits
|
|
145
|
+
for settlement; promise and span errors keep their normal behavior. Use
|
|
146
|
+
`waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
|
|
147
|
+
eval definition, when background work should not delay finalization.
|
|
148
|
+
|
|
149
|
+
For libraries or observability exporters that already emit span lifecycle
|
|
150
|
+
events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
|
|
151
|
+
`evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
|
|
152
|
+
events into the eval trace tree without wrapping the upstream work in a
|
|
153
|
+
callback. Pass the upstream span id and parent id when available so the UI keeps
|
|
154
|
+
the original hierarchy.
|
|
155
|
+
|
|
156
|
+
### Eval file (thin)
|
|
157
|
+
|
|
158
|
+
```ts
|
|
159
|
+
// evals/refund-workflow.eval.ts
|
|
160
|
+
import { defineEval, z } from '@ls-stack/agent-eval';
|
|
161
|
+
import { runRefundWorkflow } from '../src/workflows/refundWorkflow.ts';
|
|
162
|
+
|
|
163
|
+
const outputsSchema = z.object({
|
|
164
|
+
response: z.string(),
|
|
165
|
+
costUsd: z.number().optional(),
|
|
166
|
+
toolCalls: z.number(),
|
|
167
|
+
llmTurns: z.number(),
|
|
168
|
+
});
|
|
169
|
+
type RefundOutputs = z.infer<typeof outputsSchema>;
|
|
170
|
+
|
|
171
|
+
defineEval<RefundInput, RefundOutputs>({
|
|
172
|
+
id: 'refund-workflow',
|
|
173
|
+
cases: [
|
|
174
|
+
{ id: 'simple-text', input: { message: 'I want a refund for order #123' } },
|
|
175
|
+
],
|
|
176
|
+
outputsSchema,
|
|
177
|
+
execute: async ({ input }) => {
|
|
178
|
+
await runRefundWorkflow(input);
|
|
179
|
+
},
|
|
180
|
+
deriveFromTracing: ({ trace }) => ({
|
|
181
|
+
toolCalls: trace.findSpansByKind('tool').length,
|
|
182
|
+
llmTurns: trace.findSpansByKind('llm').length,
|
|
183
|
+
}),
|
|
184
|
+
scores: {
|
|
185
|
+
mentionsRefund: {
|
|
186
|
+
passThreshold: 1,
|
|
187
|
+
compute: ({ outputs }) => (/refund/i.test(outputs.response) ? 1 : 0),
|
|
188
|
+
},
|
|
189
|
+
},
|
|
190
|
+
});
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
`execute` usually just calls the product code. Push any placeholder
|
|
194
|
+
`evalTracer.span(...)` wrappers out of the eval and into the product module
|
|
195
|
+
they describe so production runs get the same trajectory. Only keep tracing
|
|
196
|
+
inside `execute` when the behavior being measured is eval-specific (e.g. a
|
|
197
|
+
judge-only sub-step with no production analogue).
|
|
198
|
+
|
|
199
|
+
Case `id` values anchor historical runs, caches, and manual scores — keep them
|
|
200
|
+
stable. See `EvalDefinition` / `EvalCase` in the types for every supported
|
|
201
|
+
field.
|
|
202
|
+
|
|
203
|
+
## Scoring
|
|
204
|
+
|
|
205
|
+
Every score returns a normalized `0..1` value. Pass/fail is per-score: a case
|
|
206
|
+
fails if any score with `passThreshold` falls below it, if an assertion fails,
|
|
207
|
+
or if the case errors. Scores without `passThreshold` are informational.
|
|
208
|
+
|
|
209
|
+
Score functions run in their own trace scope, separate from the execution
|
|
210
|
+
trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
|
|
211
|
+
without polluting the agent trajectory. The case detail UI shows execution
|
|
212
|
+
spans on **Trace** and scorer spans on **Scoring**. Outputs set inside a scorer
|
|
213
|
+
stay private to that score.
|
|
214
|
+
|
|
215
|
+
`manualScores` declares score columns that reviewers fill in the web UI after
|
|
216
|
+
a run. Pending values keep the eval in an `unscored` state instead of failing.
|
|
217
|
+
|
|
218
|
+
See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
219
|
+
(format, threshold, column overrides).
|
|
220
|
+
|
|
221
|
+
## Outputs, columns, trace display
|
|
222
|
+
|
|
223
|
+
- `setEvalOutput(key, value)` writes reviewable data for the case. Values are
|
|
224
|
+
plain data (strings, numbers, booleans, JSON-safe objects) plus native
|
|
225
|
+
`Blob`/`File` or `FileRef` variants for media columns. Inside `execute`,
|
|
226
|
+
prefer the context `setOutput(key, value)` helper when writing schema-backed
|
|
227
|
+
outputs; it is typed from the eval's outputs generic. Keep `setEvalOutput`
|
|
228
|
+
for shared workflow code that does not receive the execute context.
|
|
229
|
+
- Use `incrementEvalOutput(key, delta)` for numeric totals,
|
|
230
|
+
`appendToEvalOutput(key, value)` for arrays that preserve existing scalar
|
|
231
|
+
values, and `mergeEvalOutput(key, patch)` for shallow object updates.
|
|
232
|
+
`evalSpan` has matching `incrementAttribute`, `appendToAttribute`, and
|
|
233
|
+
`mergeAttribute` helpers for span attributes.
|
|
234
|
+
- `outputsSchema` validates final outputs after `execute` and
|
|
235
|
+
`deriveFromTracing`, before computed scores. For Zod object schemas, only
|
|
236
|
+
declared keys are passed to the schema; parsed fields merge back into the raw
|
|
237
|
+
output map, so defaults/transforms apply to configured fields and
|
|
238
|
+
unconfigured outputs stay visible as before. Validation failures fail the case
|
|
239
|
+
and skip computed scores. When you pass a narrowed outputs type as the second
|
|
240
|
+
`defineEval` generic, `outputsSchema` is required.
|
|
241
|
+
- `columns` overrides the display for output and score keys (label, format,
|
|
242
|
+
alignment, visibility). The set of supported formats is declared by the
|
|
243
|
+
`ColumnFormat` union and `EvalColumnOverride` in the types.
|
|
244
|
+
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
245
|
+
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
246
|
+
user-defined `transform(...)` for derived views (e.g. currency conversion).
|
|
247
|
+
See the `TraceDisplayInputConfig` type.
|
|
248
|
+
- `llmCalls` (in `agent-evals.config.ts`) configures the LLM calls tab in the
|
|
249
|
+
case-run drawer. Defaults to `kind: 'llm'` spans with `model`, `usage.*`,
|
|
250
|
+
`costUsd`, `input`, `output`, etc. read from conventional attribute paths.
|
|
251
|
+
Override `kinds` to broaden the filter, override `attributes.<field>` for
|
|
252
|
+
non-default span shapes, and add entries to `metrics` to surface arbitrary
|
|
253
|
+
user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
|
|
254
|
+
'boolean'`, `placements: ['header' | 'body']`). The tab auto-hides when no
|
|
255
|
+
matching spans exist.
|
|
256
|
+
- `apiCalls` (in `agent-evals.config.ts`) configures the API calls tab in the
|
|
257
|
+
case-run drawer. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and
|
|
258
|
+
`'fetch'` spans with `method`, `url`, `statusCode`, `request`, `response`,
|
|
259
|
+
`requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read
|
|
260
|
+
from conventional attribute paths. Override `kinds` or
|
|
261
|
+
`attributes.<field>` for external tracers, and add `metrics` with the same
|
|
262
|
+
formats and placements as LLM-call metrics. The tab auto-hides when no
|
|
263
|
+
matching spans exist.
|
|
264
|
+
|
|
265
|
+
Stats rows and history charts on the eval card are opt-in via `stats` /
|
|
266
|
+
`charts` on the eval definition. Their shapes live in the types; no need to
|
|
267
|
+
memorize the option set.
|
|
268
|
+
|
|
269
|
+
## Cached operations
|
|
270
|
+
|
|
271
|
+
Wrap a costly pure span in `cache: { key }` so later runs replay its recorded
|
|
272
|
+
effects without re-executing:
|
|
273
|
+
|
|
274
|
+
```ts
|
|
275
|
+
await evalTracer.span(
|
|
276
|
+
{
|
|
277
|
+
kind: 'llm',
|
|
278
|
+
name: 'plan-refund',
|
|
279
|
+
cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
|
|
280
|
+
},
|
|
281
|
+
async () => {
|
|
282
|
+
const result = await llm.complete(input.message);
|
|
283
|
+
evalSpan.setAttributes({ model: 'gpt-4o-mini', output: result });
|
|
284
|
+
incrementEvalOutput('costUsd', computeCost(result));
|
|
285
|
+
appendToEvalOutput('llmCalls', { model: 'gpt-4o-mini' });
|
|
286
|
+
return result;
|
|
287
|
+
},
|
|
288
|
+
);
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
Use `evalTracer.cache(...)` for pure values that should not create their own
|
|
292
|
+
trace span:
|
|
293
|
+
|
|
294
|
+
```ts
|
|
295
|
+
const context = await evalTracer.cache(
|
|
296
|
+
{ name: 'receipt-audit-context', key: { orderId: input.orderId } },
|
|
297
|
+
async () => {
|
|
298
|
+
const result = await loadReceiptContext(input);
|
|
299
|
+
evalSpan.setAttribute('receiptContext', result);
|
|
300
|
+
evalSpan.mergeAttribute('receiptSummary', { orderId: input.orderId });
|
|
301
|
+
return result;
|
|
302
|
+
},
|
|
303
|
+
);
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
Mental model:
|
|
307
|
+
|
|
308
|
+
- Only SDK-mediated effects replay on a hit: sub-spans, checkpoints,
|
|
309
|
+
output helper calls, span attributes. External side
|
|
310
|
+
effects (HTTP, DB writes, file I/O) **do not** replay — cache only pure
|
|
311
|
+
functions of the key.
|
|
312
|
+
- `evalTracer.cache(...)` does not create a span. When it runs inside an active
|
|
313
|
+
span, that span gets a `cache.refs` entry with the value cache name, key,
|
|
314
|
+
namespace, and hit/miss status. When called directly from the case body
|
|
315
|
+
(no surrounding span), the ref is recorded on the case detail's `cacheRefs`
|
|
316
|
+
array so spanless caches still appear in the UI's **Cache hits** tab, where
|
|
317
|
+
each hit can be expanded for inspection or deleted by namespace/key.
|
|
318
|
+
- The cache key folds in a source-file fingerprint, so editing the eval busts
|
|
319
|
+
the cache automatically.
|
|
320
|
+
- `cache.namespace` on spans or `namespace` on value caches can share entries
|
|
321
|
+
across operations/evals, but the source-file fingerprint still participates
|
|
322
|
+
in the final key. Shared namespaces are reusable across evals in the same
|
|
323
|
+
file; evals in different files miss even with the same namespace and key.
|
|
324
|
+
- Cache keys should be deterministic primitives, arrays, and plain objects.
|
|
325
|
+
`Buffer`, `ArrayBuffer`, and typed arrays hash by bytes. Native `Blob`/`File`
|
|
326
|
+
keys use stable metadata by default (`type`, `size`, plus
|
|
327
|
+
`name`/`lastModified` for `File`) and do not read file bytes. Add
|
|
328
|
+
`serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
|
|
329
|
+
when byte-level cache invalidation is required.
|
|
330
|
+
- Cache entries are stored in inspectable owner files under
|
|
331
|
+
`.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
|
|
332
|
+
default. Configure `cache.maxEntriesPerNamespace` for the default cap and
|
|
333
|
+
`cache.maxEntriesByNamespace` for exact namespace-specific caps.
|
|
334
|
+
- Cached payloads use advanced serialization/deserialization with the Web API plugin set, so return values and
|
|
335
|
+
recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
|
|
336
|
+
typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still
|
|
337
|
+
use the deterministic key-hashing rules above.
|
|
338
|
+
- Cache mode per run is controlled by CLI flags (see `agent-evals run --help`)
|
|
339
|
+
and by a chevron menu on each eval card in the UI.
|
|
340
|
+
- The UI Stop action cancels the whole active run by terminating that run's
|
|
341
|
+
isolated execution process.
|
|
342
|
+
|
|
343
|
+
## Artifacts
|
|
344
|
+
|
|
345
|
+
Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
|
|
346
|
+
`.agent-evals/cache/<owner>.json`. Files in a run directory include run
|
|
347
|
+
metadata, a run summary, per-case results, and per-case trace JSON. Inspect
|
|
348
|
+
these when debugging persisted output, costs, columns, traces, or failures —
|
|
349
|
+
the filenames are stable even when their internal schema evolves, so pick the
|
|
350
|
+
one whose name matches what you are debugging and read it directly.
|
|
351
|
+
|
|
352
|
+
## Module mocking
|
|
353
|
+
|
|
354
|
+
For true module replacement inside an eval, register `mock.module(...)` from
|
|
355
|
+
`node:test` before dynamically importing the module graph. The CLI enables
|
|
356
|
+
Node's `--experimental-test-module-mocks` flag automatically. Use dynamic
|
|
357
|
+
`import(...)` inside `execute` — static imports happen too early.
|
|
358
|
+
|
|
359
|
+
```ts
|
|
360
|
+
import { mock } from 'node:test';
|
|
361
|
+
import { defineEval } from '@ls-stack/agent-eval';
|
|
362
|
+
|
|
363
|
+
defineEval({
|
|
364
|
+
id: 'module-mock-demo',
|
|
365
|
+
cases: [{ id: 'mocked-dependency', input: { customerId: 'vip-100' } }],
|
|
366
|
+
execute: async ({ input, setOutput }) => {
|
|
367
|
+
mock.module('../src/customerLookup.ts', {
|
|
368
|
+
namedExports: { lookupCustomer: async () => ({ segment: 'vip' }) },
|
|
369
|
+
});
|
|
370
|
+
const { runWorkflow } = await import('../src/workflow.ts');
|
|
371
|
+
const result = await runWorkflow(input);
|
|
372
|
+
setOutput('segment', result.segment);
|
|
373
|
+
},
|
|
374
|
+
});
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
## Workflow checklist
|
|
378
|
+
|
|
379
|
+
When adding or changing evals:
|
|
380
|
+
|
|
381
|
+
1. Put the tracing + ambient SDK calls in the product code that runs in both
|
|
382
|
+
production and evals. Keep eval files thin.
|
|
383
|
+
2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
|
|
384
|
+
3. Use `evalAssert` for hard invariants, `scores` for graded signals, and
|
|
385
|
+
`passThreshold` only on scores that should gate pass/fail.
|
|
386
|
+
4. Surface reviewable values through execute-context `setOutput` or ambient
|
|
387
|
+
`setEvalOutput` in shared workflow code, and shape them with `columns`
|
|
388
|
+
formats from the `ColumnFormat` type.
|
|
389
|
+
5. Promote high-signal span attributes with `traceDisplay` so the UI
|
|
390
|
+
highlights them in the trace tree and detail pane.
|
|
391
|
+
6. Cache costly pure spans with `cache: { key }` and pure spanless values with
|
|
392
|
+
`evalTracer.cache(...)`; never cache operations whose external side effects
|
|
393
|
+
you depend on.
|
|
394
|
+
7. Sanity-check after changes: `agent-evals list`, then
|
|
395
|
+
`agent-evals run --eval <id>`. Open the UI only when you need to inspect
|
|
396
|
+
traces, trends, or fill manual scores. From an eval page, the eval actions
|
|
397
|
+
menu can copy package-manager-specific CLI run and debug commands.
|
|
398
|
+
8. To debug a focused run, use
|
|
399
|
+
`agent-evals run --inspect-brk --eval <id> --case <case-id>` and attach a
|
|
400
|
+
Node.js debugger before continuing execution.
|
package/dist/src-BgGL7DDp.mjs
DELETED