@ls-stack/agent-eval 0.58.2 → 0.58.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BxD6aHbp.mjs → app-ROCEce9X.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-BMWBZw_u.js → index-PTikBbhf.js} +65 -65
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-HBwXIJsg.mjs → cli-SP4kEtYL.mjs} +4 -4
- package/dist/index.d.mts +116 -120
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-pHJ0_TzH.mjs → runExecution-CFw0MQFs.mjs} +5 -16
- package/dist/{runOrchestration-ngVXShH4.mjs → runOrchestration-CxjiQmof.mjs} +1 -1
- package/dist/{runner-BnZMGBla.mjs → runner-BlFQyvN2.mjs} +1 -1
- package/dist/{runner-D_pz2NON.mjs → runner-CY3bgsjU.mjs} +2 -2
- package/dist/{src-AeXGBJ26.mjs → src-7GbQj1sb.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +5 -4
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-HBwXIJsg.mjs";
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-AeXGBJ26.mjs";
|
|
1
|
+
import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-CFw0MQFs.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-SP4kEtYL.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-7GbQj1sb.mjs";
|
|
4
4
|
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-ngVXShH4.mjs";
|
|
1
|
+
import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-CFw0MQFs.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CxjiQmof.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -1009,9 +1009,8 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
|
|
|
1009
1009
|
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
1010
1010
|
/** Schema for keyed or object-returning trace-derived output config. */
|
|
1011
1011
|
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
const evalTracingAssertionsConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" }), z.record(z.string().min(1), evalTracingAssertionsFnSchema)]);
|
|
1012
|
+
/** Schema for trace-derived assertion config. */
|
|
1013
|
+
const evalTracingAssertionsConfigSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" });
|
|
1015
1014
|
/** Schema for UI overrides on derived or scored columns. */
|
|
1016
1015
|
const evalColumnOverrideSchema = z.object({
|
|
1017
1016
|
label: z.string().optional(),
|
|
@@ -6754,19 +6753,9 @@ async function runOneTracingAssertion(params) {
|
|
|
6754
6753
|
}
|
|
6755
6754
|
async function runTracingAssertionsConfig(params) {
|
|
6756
6755
|
if (params.tracingAssertions === void 0) return;
|
|
6757
|
-
|
|
6758
|
-
|
|
6759
|
-
|
|
6760
|
-
tracingAssertion: params.tracingAssertions,
|
|
6761
|
-
scope: params.scope,
|
|
6762
|
-
traceTree: params.traceTree,
|
|
6763
|
-
evalCase: params.evalCase
|
|
6764
|
-
});
|
|
6765
|
-
return;
|
|
6766
|
-
}
|
|
6767
|
-
for (const [key, tracingAssertion] of Object.entries(params.tracingAssertions)) await runOneTracingAssertion({
|
|
6768
|
-
label: `tracingAssertions "${key}"`,
|
|
6769
|
-
tracingAssertion,
|
|
6756
|
+
await runOneTracingAssertion({
|
|
6757
|
+
label: "tracingAssertions",
|
|
6758
|
+
tracingAssertion: params.tracingAssertions,
|
|
6770
6759
|
scope: params.scope,
|
|
6771
6760
|
traceTree: params.traceTree,
|
|
6772
6761
|
evalCase: params.evalCase
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-pHJ0_TzH.mjs";
|
|
1
|
+
import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-CFw0MQFs.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-D_pz2NON.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-CY3bgsjU.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-HBwXIJsg.mjs";
|
|
2
|
-
import "./src-AeXGBJ26.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-SP4kEtYL.mjs";
|
|
2
|
+
import "./src-7GbQj1sb.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
-
import "./cli-HBwXIJsg.mjs";
|
|
1
|
+
import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-CFw0MQFs.mjs";
|
|
2
|
+
import "./cli-SP4kEtYL.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -366,10 +366,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
366
366
|
form, return `undefined` to omit one output for that case. Do not call
|
|
367
367
|
`evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
|
|
368
368
|
`tracingAssertions` for trace-derived pass/fail checks.
|
|
369
|
-
- `tracingAssertions` can be authored globally or
|
|
370
|
-
finished-trace invariant should pass or fail the
|
|
371
|
-
score column. It receives the same
|
|
372
|
-
`
|
|
369
|
+
- `tracingAssertions` is a single function that can be authored globally or
|
|
370
|
+
locally on one eval when a finished-trace invariant should pass or fail the
|
|
371
|
+
case without creating a fake score column. It receives the same
|
|
372
|
+
`{ trace, input, case }` context as `deriveFromTracing`; call
|
|
373
|
+
`evalAssert(...)` or `evalExpect(...)` inside it.
|
|
373
374
|
Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
|
|
374
375
|
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
375
376
|
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|