@ls-stack/agent-eval 0.58.1 → 0.58.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DhMIbjlE.mjs → app-ROCEce9X.mjs} +52 -7
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
- package/dist/apps/web/dist/assets/index-PTikBbhf.js +377 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-_g2qOMK6.mjs → cli-SP4kEtYL.mjs} +31 -5
- package/dist/index.d.mts +184 -129
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-d42Lm0i5.mjs → runExecution-CFw0MQFs.mjs} +114 -21
- package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-CxjiQmof.mjs} +73 -6
- package/dist/{runner-BKogjiYd.mjs → runner-BlFQyvN2.mjs} +1 -1
- package/dist/{runner-MSr8sAWm.mjs → runner-CY3bgsjU.mjs} +2 -2
- package/dist/{src-CdZsOn6y.mjs → src-7GbQj1sb.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +19 -3
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
4
|
-
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
|
1
|
+
import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-CFw0MQFs.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-SP4kEtYL.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-7GbQj1sb.mjs";
|
|
4
|
+
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-
|
|
1
|
+
import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-CFw0MQFs.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CxjiQmof.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -851,6 +851,7 @@ const runLogLevelSchema = z.enum([
|
|
|
851
851
|
const runLogPhaseSchema = z.enum([
|
|
852
852
|
"eval",
|
|
853
853
|
"derive",
|
|
854
|
+
"tracingAssertions",
|
|
854
855
|
"outputsSchema",
|
|
855
856
|
"scorer"
|
|
856
857
|
]);
|
|
@@ -1008,6 +1009,8 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
|
|
|
1008
1009
|
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
1009
1010
|
/** Schema for keyed or object-returning trace-derived output config. */
|
|
1010
1011
|
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
1012
|
+
/** Schema for trace-derived assertion config. */
|
|
1013
|
+
const evalTracingAssertionsConfigSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" });
|
|
1011
1014
|
/** Schema for UI overrides on derived or scored columns. */
|
|
1012
1015
|
const evalColumnOverrideSchema = z.object({
|
|
1013
1016
|
label: z.string().optional(),
|
|
@@ -1411,6 +1414,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1411
1414
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
1412
1415
|
columns: evalColumnsSchema.optional(),
|
|
1413
1416
|
deriveFromTracing: evalDeriveConfigSchema.optional(),
|
|
1417
|
+
tracingAssertions: evalTracingAssertionsConfigSchema.optional(),
|
|
1414
1418
|
stats: evalStatsConfigSchema.optional(),
|
|
1415
1419
|
defaultStatAggregate: evalStatAggregateSchema.optional(),
|
|
1416
1420
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
@@ -1847,8 +1851,9 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1847
1851
|
* freshness state.
|
|
1848
1852
|
*/
|
|
1849
1853
|
function getEvalDisplayStatus(params) {
|
|
1850
|
-
const { stale, outdated, lastRunStatus, isRunning = false } = params;
|
|
1854
|
+
const { stale, outdated, lastRunStatus, isRunning = false, isEnqueued = false } = params;
|
|
1851
1855
|
if (isRunning || lastRunStatus === "running") return "running";
|
|
1856
|
+
if (isEnqueued) return "enqueued";
|
|
1852
1857
|
if (lastRunStatus === "pass") {
|
|
1853
1858
|
if (stale) return "stale";
|
|
1854
1859
|
if (outdated) return "outdated";
|
|
@@ -2718,6 +2723,17 @@ var EvalAssertionError = class extends Error {
|
|
|
2718
2723
|
this.name = "EvalAssertionError";
|
|
2719
2724
|
}
|
|
2720
2725
|
};
|
|
2726
|
+
/** Error thrown when an SDK helper is used in an unsupported runner phase. */
|
|
2727
|
+
var EvalRuntimeUsageError = class extends Error {
|
|
2728
|
+
constructor(message) {
|
|
2729
|
+
super(message);
|
|
2730
|
+
this.name = "EvalRuntimeUsageError";
|
|
2731
|
+
}
|
|
2732
|
+
};
|
|
2733
|
+
/** Throw when assertion helpers are used in a runner phase that forbids them. */
|
|
2734
|
+
function assertEvalAssertionsAllowed(apiName) {
|
|
2735
|
+
if (getCurrentScope() && runtimeScopeStorage.getStore() === "derive") throw new EvalRuntimeUsageError(`${apiName} cannot be used inside deriveFromTracing. Use tracingAssertions for trace-derived assertions.`);
|
|
2736
|
+
}
|
|
2721
2737
|
function getEvalClockStateNowMs(state) {
|
|
2722
2738
|
const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
|
|
2723
2739
|
return state.startMs + elapsedMs + state.offsetMs;
|
|
@@ -2823,8 +2839,10 @@ function recordSpanForActiveCacheRecording(scope, spanId) {
|
|
|
2823
2839
|
*
|
|
2824
2840
|
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
2825
2841
|
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
2826
|
-
* case `execute`, `derive` while deriving outputs from traces,
|
|
2827
|
-
*
|
|
2842
|
+
* case `execute`, `derive` while deriving outputs from traces,
|
|
2843
|
+
* `tracingAssertions` while checking trace-derived assertions,
|
|
2844
|
+
* `outputsSchema` while validating outputs, and `scorer` while computing
|
|
2845
|
+
* scores.
|
|
2828
2846
|
*/
|
|
2829
2847
|
function isInEvalScope() {
|
|
2830
2848
|
if (activeEvalRuntimeScopeCount === 0) return null;
|
|
@@ -2845,7 +2863,7 @@ function normalizeLogLevel(level) {
|
|
|
2845
2863
|
}
|
|
2846
2864
|
function getCurrentLogPhase() {
|
|
2847
2865
|
const runtimeScope = runtimeScopeStorage.getStore();
|
|
2848
|
-
if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
|
|
2866
|
+
if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "tracingAssertions" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
|
|
2849
2867
|
return null;
|
|
2850
2868
|
}
|
|
2851
2869
|
function formatLogArgs(args) {
|
|
@@ -3300,10 +3318,12 @@ function incrementEvalOutput(key, delta) {
|
|
|
3300
3318
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
3301
3319
|
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
3302
3320
|
* TypeScript assertion signature still narrows the checked value after the
|
|
3303
|
-
* call.
|
|
3321
|
+
* call. Calls inside `deriveFromTracing` throw because derivations must only
|
|
3322
|
+
* write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
|
|
3304
3323
|
*/
|
|
3305
3324
|
function evalAssert(condition, message) {
|
|
3306
3325
|
const scope = getCurrentScope();
|
|
3326
|
+
assertEvalAssertionsAllowed("evalAssert(...)");
|
|
3307
3327
|
if (condition) {
|
|
3308
3328
|
if (scope) scope.assertions.push({
|
|
3309
3329
|
message,
|
|
@@ -3454,6 +3474,7 @@ var EvalExpectationImpl = class EvalExpectationImpl {
|
|
|
3454
3474
|
* case scope is active, matching `evalAssert(...)`.
|
|
3455
3475
|
*/
|
|
3456
3476
|
function evalExpect(value) {
|
|
3477
|
+
assertEvalAssertionsAllowed("evalExpect(...)");
|
|
3457
3478
|
return new EvalExpectationImpl(value, false);
|
|
3458
3479
|
}
|
|
3459
3480
|
//#endregion
|
|
@@ -5005,25 +5026,53 @@ const evalTracer = {
|
|
|
5005
5026
|
};
|
|
5006
5027
|
/** Build a queryable trace tree helper from a flat span list and checkpoints. */
|
|
5007
5028
|
function buildTraceTree(spans, checkpoints) {
|
|
5029
|
+
const rootSpans = spans.filter((s) => s.parentId === null);
|
|
5030
|
+
const flattenDfs = () => {
|
|
5031
|
+
const result = [];
|
|
5032
|
+
function visit(parentId) {
|
|
5033
|
+
for (const childSpan of spans) if (childSpan.parentId === parentId) {
|
|
5034
|
+
result.push(childSpan);
|
|
5035
|
+
visit(childSpan.id);
|
|
5036
|
+
}
|
|
5037
|
+
}
|
|
5038
|
+
visit(null);
|
|
5039
|
+
return result;
|
|
5040
|
+
};
|
|
5041
|
+
const filterSpanNames = (sourceSpans, kind) => {
|
|
5042
|
+
return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
|
|
5043
|
+
};
|
|
5008
5044
|
return {
|
|
5009
5045
|
spans,
|
|
5010
|
-
rootSpans
|
|
5046
|
+
rootSpans,
|
|
5011
5047
|
findSpan(name) {
|
|
5012
5048
|
return spans.find((s) => s.name === name);
|
|
5013
5049
|
},
|
|
5050
|
+
findSpans(name) {
|
|
5051
|
+
return spans.filter((s) => s.name === name);
|
|
5052
|
+
},
|
|
5053
|
+
hasSpan(name) {
|
|
5054
|
+
return spans.some((s) => s.name === name);
|
|
5055
|
+
},
|
|
5014
5056
|
findSpansByKind(kind) {
|
|
5015
5057
|
return spans.filter((s) => s.kind === kind);
|
|
5016
5058
|
},
|
|
5059
|
+
findToolCallSpans() {
|
|
5060
|
+
return spans.filter((s) => s.kind === "tool");
|
|
5061
|
+
},
|
|
5062
|
+
listToolCallSpanNames() {
|
|
5063
|
+
return filterSpanNames(spans, "tool");
|
|
5064
|
+
},
|
|
5065
|
+
hasToolCallSpan(name) {
|
|
5066
|
+
return spans.some((s) => s.kind === "tool" && s.name === name);
|
|
5067
|
+
},
|
|
5068
|
+
listSpanNames(kind) {
|
|
5069
|
+
return filterSpanNames(spans, kind);
|
|
5070
|
+
},
|
|
5071
|
+
listSpanNamesDfs(kind) {
|
|
5072
|
+
return filterSpanNames(flattenDfs(), kind);
|
|
5073
|
+
},
|
|
5017
5074
|
flattenDfs() {
|
|
5018
|
-
|
|
5019
|
-
function visit(parentId) {
|
|
5020
|
-
for (const childSpan of spans) if (childSpan.parentId === parentId) {
|
|
5021
|
-
result.push(childSpan);
|
|
5022
|
-
visit(childSpan.id);
|
|
5023
|
-
}
|
|
5024
|
-
}
|
|
5025
|
-
visit(null);
|
|
5026
|
-
return result;
|
|
5075
|
+
return flattenDfs();
|
|
5027
5076
|
},
|
|
5028
5077
|
checkpoints
|
|
5029
5078
|
};
|
|
@@ -6665,7 +6714,7 @@ async function resolveDeriveFromTracingConfig(params) {
|
|
|
6665
6714
|
return derived;
|
|
6666
6715
|
}
|
|
6667
6716
|
async function runDeriveFromTracingConfig(params) {
|
|
6668
|
-
if (params.deriveFromTracing === void 0) return;
|
|
6717
|
+
if (params.deriveFromTracing === void 0) return null;
|
|
6669
6718
|
const { deriveFromTracing } = params;
|
|
6670
6719
|
try {
|
|
6671
6720
|
const derived = await runInExistingEvalScope(params.scope, "derive", async () => await resolveDeriveFromTracingConfig({
|
|
@@ -6677,13 +6726,43 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6677
6726
|
outputs: params.scope.outputs,
|
|
6678
6727
|
derived
|
|
6679
6728
|
});
|
|
6729
|
+
return null;
|
|
6680
6730
|
} catch (e) {
|
|
6731
|
+
if (e instanceof EvalRuntimeUsageError) return e;
|
|
6681
6732
|
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
6682
6733
|
recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
6734
|
+
return null;
|
|
6683
6735
|
}
|
|
6684
6736
|
}
|
|
6737
|
+
async function runOneTracingAssertion(params) {
|
|
6738
|
+
const { label, tracingAssertion, scope, traceTree, evalCase } = params;
|
|
6739
|
+
const failureCountBefore = scope.assertionFailures.length;
|
|
6740
|
+
const ctx = {
|
|
6741
|
+
trace: traceTree,
|
|
6742
|
+
input: evalCase.input,
|
|
6743
|
+
case: evalCase
|
|
6744
|
+
};
|
|
6745
|
+
try {
|
|
6746
|
+
await runInExistingEvalScope(scope, "tracingAssertions", async () => {
|
|
6747
|
+
await callUnknownFunction(tracingAssertion, [ctx]);
|
|
6748
|
+
});
|
|
6749
|
+
} catch (e) {
|
|
6750
|
+
if (e instanceof EvalAssertionError && scope.assertionFailures.length > failureCountBefore) return;
|
|
6751
|
+
recordAssertionFailure(scope, toAssertionFailure(`${label} threw: ${e instanceof Error ? e.message : String(e)}`, e instanceof Error ? e : void 0));
|
|
6752
|
+
}
|
|
6753
|
+
}
|
|
6754
|
+
async function runTracingAssertionsConfig(params) {
|
|
6755
|
+
if (params.tracingAssertions === void 0) return;
|
|
6756
|
+
await runOneTracingAssertion({
|
|
6757
|
+
label: "tracingAssertions",
|
|
6758
|
+
tracingAssertion: params.tracingAssertions,
|
|
6759
|
+
scope: params.scope,
|
|
6760
|
+
traceTree: params.traceTree,
|
|
6761
|
+
evalCase: params.evalCase
|
|
6762
|
+
});
|
|
6763
|
+
}
|
|
6685
6764
|
async function runCase(params) {
|
|
6686
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6765
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, globalTracingAssertions, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6687
6766
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
6688
6767
|
evalId,
|
|
6689
6768
|
evalFilePath,
|
|
@@ -6728,22 +6807,36 @@ async function runCase(params) {
|
|
|
6728
6807
|
apiCallsConfig
|
|
6729
6808
|
});
|
|
6730
6809
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
6731
|
-
|
|
6810
|
+
let nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
6732
6811
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
|
|
6733
6812
|
if (!nonAssertError) {
|
|
6734
|
-
await runDeriveFromTracingConfig({
|
|
6813
|
+
nonAssertError = await runDeriveFromTracingConfig({
|
|
6735
6814
|
deriveFromTracing: globalDeriveFromTracing,
|
|
6736
6815
|
scope,
|
|
6737
6816
|
traceTree,
|
|
6738
6817
|
evalCase
|
|
6739
6818
|
});
|
|
6740
|
-
await runDeriveFromTracingConfig({
|
|
6819
|
+
if (!nonAssertError) nonAssertError = await runDeriveFromTracingConfig({
|
|
6741
6820
|
deriveFromTracing: evalDef.deriveFromTracing,
|
|
6742
6821
|
scope,
|
|
6743
6822
|
traceTree,
|
|
6744
6823
|
evalCase
|
|
6745
6824
|
});
|
|
6746
6825
|
}
|
|
6826
|
+
if (!nonAssertError) {
|
|
6827
|
+
await runTracingAssertionsConfig({
|
|
6828
|
+
tracingAssertions: globalTracingAssertions,
|
|
6829
|
+
scope,
|
|
6830
|
+
traceTree,
|
|
6831
|
+
evalCase
|
|
6832
|
+
});
|
|
6833
|
+
await runTracingAssertionsConfig({
|
|
6834
|
+
tracingAssertions: evalDef.tracingAssertions,
|
|
6835
|
+
scope,
|
|
6836
|
+
traceTree,
|
|
6837
|
+
evalCase
|
|
6838
|
+
});
|
|
6839
|
+
}
|
|
6747
6840
|
if (!nonAssertError) addDefaultOutputs({
|
|
6748
6841
|
outputs: scope.outputs,
|
|
6749
6842
|
spans: spansWithDerivedAttributes,
|
|
@@ -6933,4 +7026,4 @@ function recordAssertionFailure(scope, failure) {
|
|
|
6933
7026
|
});
|
|
6934
7027
|
}
|
|
6935
7028
|
//#endregion
|
|
6936
|
-
export {
|
|
7029
|
+
export { setScopeCacheContext as $, repoFile as A, evalStatsConfigSchema as At, evalTime as B, evalTracer as C, resolveLlmCallsConfig as Ct, deserializeCacheValue as D, caseDetailSchema as Dt, deserializeCacheRecording as E, getCaseRowCaseKey as Et, EvalRuntimeUsageError as F, getEvalRegistry as Ft, matchesEvalTags as G, getEvalCaseInput as H, appendToEvalOutput as I, runWithEvalRegistry as It, runInEvalRuntimeScope as J, mergeEvalOutput as K, configureEvalRunLogs as L, readManualInputFile as M, evalChartsConfigSchema as Mt, evalExpect as N, columnDefSchema as Nt, serializeCacheRecording as O, caseRowSchema as Ot, EvalAssertionError as P, defineEval as Pt, setEvalOutput as Q, evalAssert as R, evalSpan as S, resolveApiCallsConfig as St, hashCacheKeySync as T, buildEvalKey as Tt, incrementEvalOutput as U, getCurrentScope as V, isInEvalScope as W, runInExistingEvalScope as X, runInEvalScope as Y, runWithEvalClock as Z, createBufferedCacheStore as _, matchesTagsFilter as _t, isCaseChildParentMessage as a, extractApiCalls as at, buildTraceTree as b, runManifestSchema as bt, resolveArtifactPath as c, simulateTokenAllocation as ct, loadEvalModule as d, getEvalTitle as dt, startEvalBackgroundJob as et, resolveEvalDefaultConfig as f, getEvalDisplayStatus as ft, commitPendingCacheWrites as g, dedupeEvalTags as gt, normalizeScoreDef as h, deriveStatusFromChildStatuses as ht, isCaseChildMessage as i, extractCacheHits as it, manualInputFileValueSchema as j, manualInputDescriptorSchema as jt, serializeCacheValue as k, evalStatAggregateSchema as kt, registerAgentEvalsPackageResolutionHooks as l, applyDerivedCallAttributes as lt, buildDeclaredColumnDefs as m, deriveStatusFromCaseRows as mt, resolveRunnableEvalCases as n, updateManualScoreRequestSchema as nt, stripTerminalControlCodes as o, extractLlmCalls as ot, loadConfig as p, deriveScopedSummaryFromCases as pt, nextEvalId as q, runCase as r, extractCacheEntries as rt, resolveTracePresentation as s, simulateLlmCallCost as st, filterEvalCases as t, createRunRequestSchema as tt, runWithModuleIsolation as u, getNestedAttribute as ut, createFsCacheStore as v, validateEvalTagName as vt, hashCacheKey as w, buildCaseKey as wt, captureEvalSpanError as x, runSummarySchema as xt, z$1 as y, validateTagsFilterExpression as yt, evalLog as z };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as
|
|
1
|
+
import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-CFw0MQFs.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1030,6 +1030,7 @@ async function executeQueuedCases(params) {
|
|
|
1030
1030
|
async function executeQueuedCase(params) {
|
|
1031
1031
|
const { queuedCase, globalTraceDisplay } = params;
|
|
1032
1032
|
const startTime = Date.now();
|
|
1033
|
+
await queuedCase.onStart?.();
|
|
1033
1034
|
const result = await queuedCase.execute({
|
|
1034
1035
|
globalTraceDisplay,
|
|
1035
1036
|
startTime
|
|
@@ -1249,6 +1250,32 @@ function buildRunErrorMessage(errors) {
|
|
|
1249
1250
|
return `[${entry.evalId}] ${messageLine}\n${details}`;
|
|
1250
1251
|
}).join("\n");
|
|
1251
1252
|
}
|
|
1253
|
+
function upsertCaseRow(caseRows, nextCaseRow) {
|
|
1254
|
+
const existingIndex = caseRows.findIndex((caseRow) => getCaseRowCaseKey(caseRow) === getCaseRowCaseKey(nextCaseRow) && caseRow.trial === nextCaseRow.trial);
|
|
1255
|
+
if (existingIndex === -1) {
|
|
1256
|
+
caseRows.push(nextCaseRow);
|
|
1257
|
+
return;
|
|
1258
|
+
}
|
|
1259
|
+
caseRows[existingIndex] = nextCaseRow;
|
|
1260
|
+
}
|
|
1261
|
+
function removeLiveCaseRows(caseRows, nextCaseRow) {
|
|
1262
|
+
const caseKey = getCaseRowCaseKey(nextCaseRow);
|
|
1263
|
+
for (let i = caseRows.length - 1; i >= 0; i--) {
|
|
1264
|
+
const caseRow = caseRows[i];
|
|
1265
|
+
if (caseRow === void 0) continue;
|
|
1266
|
+
if (getCaseRowCaseKey(caseRow) !== caseKey) continue;
|
|
1267
|
+
if (caseRow.status !== "pending" && caseRow.status !== "running") continue;
|
|
1268
|
+
caseRows.splice(i, 1);
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
function emitCaseRowEvent(params) {
|
|
1272
|
+
params.emitEvent(params.runState, {
|
|
1273
|
+
type: params.type,
|
|
1274
|
+
runId: params.runState.manifest.id,
|
|
1275
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1276
|
+
payload: params.caseRow
|
|
1277
|
+
});
|
|
1278
|
+
}
|
|
1252
1279
|
async function finalizePreparedCase(params) {
|
|
1253
1280
|
const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
|
|
1254
1281
|
if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
|
|
@@ -1263,7 +1290,8 @@ async function finalizePreparedCase(params) {
|
|
|
1263
1290
|
pendingWrites: winningTrial.pendingCacheWrites
|
|
1264
1291
|
});
|
|
1265
1292
|
const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
|
|
1266
|
-
runState.cases
|
|
1293
|
+
removeLiveCaseRows(runState.cases, winningTrial.caseRow);
|
|
1294
|
+
upsertCaseRow(runState.cases, winningTrial.caseRow);
|
|
1267
1295
|
runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
|
|
1268
1296
|
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
1269
1297
|
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
@@ -1271,11 +1299,11 @@ async function finalizePreparedCase(params) {
|
|
|
1271
1299
|
await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
1272
1300
|
await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
|
|
1273
1301
|
onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
|
|
1274
|
-
|
|
1302
|
+
emitCaseRowEvent({
|
|
1303
|
+
runState,
|
|
1304
|
+
emitEvent,
|
|
1275
1305
|
type: "case.finished",
|
|
1276
|
-
|
|
1277
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1278
|
-
payload: winningTrial.caseRow
|
|
1306
|
+
caseRow: winningTrial.caseRow
|
|
1279
1307
|
});
|
|
1280
1308
|
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
1281
1309
|
}
|
|
@@ -1437,13 +1465,52 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
1437
1465
|
preparedEvals.push(preparedEval);
|
|
1438
1466
|
for (const evalCase of cases) {
|
|
1439
1467
|
const trialResults = [];
|
|
1468
|
+
const liveCaseRow = {
|
|
1469
|
+
caseId: evalCase.id,
|
|
1470
|
+
evalId: evalMeta.id,
|
|
1471
|
+
evalKey: evalMeta.key,
|
|
1472
|
+
caseKey: buildCaseKey({
|
|
1473
|
+
filePath: evalMeta.filePath,
|
|
1474
|
+
evalId: evalMeta.id,
|
|
1475
|
+
caseId: evalCase.id
|
|
1476
|
+
}),
|
|
1477
|
+
tags: evalCase.tags,
|
|
1478
|
+
status: "pending",
|
|
1479
|
+
durationMs: null,
|
|
1480
|
+
cacheHits: 0,
|
|
1481
|
+
cacheOperations: 0,
|
|
1482
|
+
columns: {},
|
|
1483
|
+
trial: 0
|
|
1484
|
+
};
|
|
1440
1485
|
const preparedCase = {
|
|
1441
1486
|
caseId: evalCase.id,
|
|
1487
|
+
liveCaseRow,
|
|
1442
1488
|
trialResults,
|
|
1443
1489
|
finalized: false
|
|
1444
1490
|
};
|
|
1445
1491
|
preparedCases.push(preparedCase);
|
|
1492
|
+
upsertCaseRow(runState.cases, liveCaseRow);
|
|
1493
|
+
emitCaseRowEvent({
|
|
1494
|
+
runState,
|
|
1495
|
+
emitEvent,
|
|
1496
|
+
type: "case.updated",
|
|
1497
|
+
caseRow: liveCaseRow
|
|
1498
|
+
});
|
|
1446
1499
|
for (let trial = 0; trial < request.trials; trial++) queuedCases.push({
|
|
1500
|
+
onStart: () => {
|
|
1501
|
+
if (preparedCase.finalized) return;
|
|
1502
|
+
preparedCase.liveCaseRow = {
|
|
1503
|
+
...preparedCase.liveCaseRow,
|
|
1504
|
+
status: "running"
|
|
1505
|
+
};
|
|
1506
|
+
upsertCaseRow(runState.cases, preparedCase.liveCaseRow);
|
|
1507
|
+
emitCaseRowEvent({
|
|
1508
|
+
runState,
|
|
1509
|
+
emitEvent,
|
|
1510
|
+
type: "case.started",
|
|
1511
|
+
caseRow: preparedCase.liveCaseRow
|
|
1512
|
+
});
|
|
1513
|
+
},
|
|
1447
1514
|
execute: async ({ startTime, globalTraceDisplay }) => await executeCaseChild({
|
|
1448
1515
|
evalId: evalMeta.id,
|
|
1449
1516
|
evalKey: evalMeta.key,
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-CY3bgsjU.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-SP4kEtYL.mjs";
|
|
2
|
+
import "./src-7GbQj1sb.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-CFw0MQFs.mjs";
|
|
2
|
+
import "./cli-SP4kEtYL.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.58.
|
|
3
|
+
"version": "0.58.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/sdk": "0.0.1",
|
|
37
|
+
"@agent-evals/shared": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -35,6 +35,9 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
35
35
|
- `agent-evals app` watches `agent-evals.config.ts` and reloads config in
|
|
36
36
|
place when the runner is idle. If config changes during an active run, the
|
|
37
37
|
reload applies after the current run reaches a terminal state.
|
|
38
|
+
- App-triggered runs log the queued target evals, resolved case concurrency,
|
|
39
|
+
each case start for evals that are actually running, and the terminal run
|
|
40
|
+
summary in the server terminal.
|
|
38
41
|
|
|
39
42
|
Assume that enumerated tables in this document may lag behind the types —
|
|
40
43
|
treat the types as source of truth when they disagree.
|
|
@@ -360,7 +363,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
360
363
|
The older object-returning function form remains supported. Global
|
|
361
364
|
derivations run first; runtime outputs are never overwritten, and eval-level
|
|
362
365
|
derivations only fill keys still missing after global derivations. In keyed
|
|
363
|
-
form, return `undefined` to omit one output for that case.
|
|
366
|
+
form, return `undefined` to omit one output for that case. Do not call
|
|
367
|
+
`evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
|
|
368
|
+
`tracingAssertions` for trace-derived pass/fail checks.
|
|
369
|
+
- `tracingAssertions` is a single function that can be authored globally or
|
|
370
|
+
locally on one eval when a finished-trace invariant should pass or fail the
|
|
371
|
+
case without creating a fake score column. It receives the same
|
|
372
|
+
`{ trace, input, case }` context as `deriveFromTracing`; call
|
|
373
|
+
`evalAssert(...)` or `evalExpect(...)` inside it.
|
|
374
|
+
Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
|
|
375
|
+
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
376
|
+
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|
|
377
|
+
`trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
|
|
378
|
+
`trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
|
|
364
379
|
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
365
380
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
366
381
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|
|
@@ -629,8 +644,9 @@ When adding or changing evals:
|
|
|
629
644
|
3. `evalAssert` for hard invariants and truthy type narrowing. It records
|
|
630
645
|
pass/fail entries in case-detail `assertions`; failed entries are also kept
|
|
631
646
|
in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
|
|
632
|
-
comparisons, `
|
|
633
|
-
scores
|
|
647
|
+
comparisons, `tracingAssertions` for invariants derived from the finished
|
|
648
|
+
trace, `scores` for graded signals, and `passThreshold` only on scores that
|
|
649
|
+
should gate pass/fail.
|
|
634
650
|
4. Surface reviewable values through execute-context `setOutput` or ambient
|
|
635
651
|
`setEvalOutput` in shared workflow code, and shape them with `columns`
|
|
636
652
|
formats from the `ColumnFormat` type.
|