@ls-stack/agent-eval 0.55.2 → 0.56.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-NI4to6lp.mjs → app-Db_x-Rit.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-2I-eWzVL.css → index-Xa_7PteQ.css} +1 -1
- package/dist/apps/web/dist/assets/index-bB8IBDp1.js +377 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-Bu9347r1.mjs → cli-Ck0mqxd-.mjs} +4 -4
- package/dist/index.d.mts +593 -58
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-C31dpemR.mjs → runExecution-BH7DlMXl.mjs} +100 -8
- package/dist/{runOrchestration-3RoHLW4U.mjs → runOrchestration-C1Ex9QI-.mjs} +1 -1
- package/dist/{runner-B4EfMn1d.mjs → runner-B3hEOT_I.mjs} +2 -2
- package/dist/{runner-CTp9zHbM.mjs → runner-DbVYcapC.mjs} +1 -1
- package/dist/{src-FR60ZR_4.mjs → src-B3iq-tuv.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +13 -3
- package/dist/apps/web/dist/assets/index-C7QjETk8.js +0 -377
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
1
|
+
import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Ck0mqxd-.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-B3iq-tuv.mjs";
|
|
4
4
|
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-
|
|
1
|
+
import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C1Ex9QI-.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -220,6 +220,19 @@ const traceSpanSchema = z.object({
|
|
|
220
220
|
});
|
|
221
221
|
//#endregion
|
|
222
222
|
//#region ../shared/src/schemas/cache.ts
|
|
223
|
+
/**
 * Display override that can accompany a single runtime output.
 *
 * Every field is optional:
 * - `label`: string label for the column.
 * - `format`: value validated by `columnFormatSchema`.
 * - `numberFormat`: value validated by `numberDisplayOptionsSchema`.
 * - `hideInTable` / `hideIfNoValue`: boolean flags.
 * - `align`: one of `"left"`, `"center"`, or `"right"`.
 * - `maxStars`: integer, minimum 2.
 */
const outputColumnOverrideSchema = z.object({
  label: z.string().optional(),
  format: columnFormatSchema.optional(),
  numberFormat: numberDisplayOptionsSchema.optional(),
  hideInTable: z.boolean().optional(),
  hideIfNoValue: z.boolean().optional(),
  align: z.enum(["left", "center", "right"]).optional(),
  maxStars: z.number().int().min(2).optional()
});
|
|
223
236
|
/**
|
|
224
237
|
* Mode that controls how the cache is consulted for a given run.
|
|
225
238
|
*
|
|
@@ -311,7 +324,8 @@ const cacheRecordingOpSchema = z.discriminatedUnion("kind", [
|
|
|
311
324
|
z.object({
|
|
312
325
|
kind: z.literal("setOutput"),
|
|
313
326
|
key: z.string(),
|
|
314
|
-
value: z.unknown()
|
|
327
|
+
value: z.unknown(),
|
|
328
|
+
column: outputColumnOverrideSchema.optional()
|
|
315
329
|
}),
|
|
316
330
|
z.object({
|
|
317
331
|
kind: z.literal("appendOutput"),
|
|
@@ -792,6 +806,11 @@ const caseRowSchema = z.object({
|
|
|
792
806
|
cacheOperations: z.number().optional(),
|
|
793
807
|
costUsd: z.number().nullable().optional(),
|
|
794
808
|
columns: z.record(z.string(), cellValueSchema),
|
|
809
|
+
/**
|
|
810
|
+
* Runtime column definitions authored by output helpers for this case.
|
|
811
|
+
* These complement eval-level `columns` without changing discovery metadata.
|
|
812
|
+
*/
|
|
813
|
+
outputColumnDefs: z.array(columnDefSchema).optional(),
|
|
795
814
|
/** Winning trial index for the persisted case result. */
|
|
796
815
|
trial: z.number()
|
|
797
816
|
});
|
|
@@ -898,6 +917,11 @@ const caseDetailSchema = z.object({
|
|
|
898
917
|
*/
|
|
899
918
|
scoringTraces: z.record(z.string(), scoreTraceSchema).optional(),
|
|
900
919
|
columns: z.record(z.string(), cellValueSchema),
|
|
920
|
+
/**
|
|
921
|
+
* Runtime column definitions authored by output helpers for this case.
|
|
922
|
+
* These complement eval-level `columns` without changing discovery metadata.
|
|
923
|
+
*/
|
|
924
|
+
outputColumnDefs: z.array(columnDefSchema).optional(),
|
|
901
925
|
assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
|
|
902
926
|
/** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
|
|
903
927
|
logs: z.array(runLogEntrySchema).default([]),
|
|
@@ -2140,17 +2164,34 @@ function computeTokensPerSecond({ outputTokens, durationMs }) {
|
|
|
2140
2164
|
if (durationMs <= 0) return null;
|
|
2141
2165
|
return outputTokens / (durationMs / 1e3);
|
|
2142
2166
|
}
|
|
2143
|
-
/**
 * Resolve the step count and step details for one LLM call span.
 *
 * Resolution order:
 * 1. If the value found at `path` inside `attributes` is an array, that array
 *    is the step details and its length is the step count.
 * 2. Otherwise, if any `childModelSteps` were supplied, they are used instead.
 * 3. Otherwise both fields are `null`.
 *
 * @param {unknown} attributes - Span attributes to read from.
 * @param {unknown} path - Attribute path handed to `getNestedAttribute`.
 * @param {unknown[]} [childModelSteps=[]] - Fallback step entries. Defaults to
 *   an empty list so two-argument calls keep working instead of throwing on
 *   `childModelSteps.length`.
 * @returns {{ stepCount: number | null, stepDetails: unknown[] | null }}
 */
function readSteps(attributes, path, childModelSteps = []) {
  const raw = getNestedAttribute(attributes, path);
  if (Array.isArray(raw)) {
    return {
      stepCount: raw.length,
      stepDetails: raw
    };
  }
  if (childModelSteps.length > 0) {
    return {
      stepCount: childModelSteps.length,
      stepDetails: childModelSteps
    };
  }
  return {
    stepCount: null,
    stepDetails: null
  };
}
|
|
2182
|
+
/**
 * Group spans of kind `"model_step"` by the id of their parent span.
 *
 * Spans of any other kind are ignored, as are `model_step` spans without a
 * parent. Within each group, spans keep the order in which they appear in
 * `spans`.
 *
 * @param {Iterable<{ kind: string, parentId?: unknown }>} spans - Spans to scan.
 * @returns {Map<unknown, object[]>} Map from parent id to its `model_step` spans.
 */
function buildModelStepsByParent(spans) {
  const stepsByParent = /* @__PURE__ */ new Map();
  for (const span of spans) {
    if (span.kind !== "model_step") continue;
    // `== null` skips both `null` and a missing `parentId`, so parentless
    // steps are never collected under an `undefined` key.
    if (span.parentId == null) continue;
    const siblings = stepsByParent.get(span.parentId);
    if (siblings === void 0) {
      stepsByParent.set(span.parentId, [span]);
    } else {
      siblings.push(span);
    }
  }
  return stepsByParent;
}
|
|
2154
2195
|
function collectWarnings$1(span) {
|
|
2155
2196
|
const out = [];
|
|
2156
2197
|
if (span.warning) out.push(span.warning);
|
|
@@ -2182,6 +2223,9 @@ function pickError$1(span) {
|
|
|
2182
2223
|
* charged twice. Cache read/write costs still contribute to the total USD cost
|
|
2183
2224
|
* at their configured rates. The `steps` attribute path may resolve to an array
|
|
2184
2225
|
* of per-step detail objects, with `stepCount` derived from the array length.
|
|
2226
|
+
* When a matching LLM span does not expose that array, direct child spans with
|
|
2227
|
+
* `kind: 'model_step'` are used as the step details instead. This preserves
|
|
2228
|
+
* Mastra/OpenTelemetry traces where model steps are emitted as child spans.
|
|
2185
2229
|
* `durationMs` and `tokensPerSecond` are `null` while the span is still
|
|
2186
2230
|
* running. User-defined `metrics` whose path resolves to
|
|
2187
2231
|
* `undefined` are dropped, but `null`, `0`, and `false` are preserved as
|
|
@@ -2190,6 +2234,7 @@ function pickError$1(span) {
|
|
|
2190
2234
|
*/
|
|
2191
2235
|
function extractLlmCalls(spans, config) {
|
|
2192
2236
|
const kindSet = new Set(config.kinds);
|
|
2237
|
+
const modelStepsByParent = buildModelStepsByParent(spans);
|
|
2193
2238
|
const result = [];
|
|
2194
2239
|
for (const span of spans) {
|
|
2195
2240
|
if (!kindSet.has(span.kind)) continue;
|
|
@@ -2275,7 +2320,7 @@ function extractLlmCalls(spans, config) {
|
|
|
2275
2320
|
cachedInputCostUsd,
|
|
2276
2321
|
cacheCreationInputCostUsd,
|
|
2277
2322
|
reasoningCostUsd,
|
|
2278
|
-
...readSteps(attrs, config.attributes.steps),
|
|
2323
|
+
...readSteps(attrs, config.attributes.steps, modelStepsByParent.get(span.id) ?? []),
|
|
2279
2324
|
finishReason: readString$2(attrs, config.attributes.finishReason),
|
|
2280
2325
|
durationMs,
|
|
2281
2326
|
input: getNestedAttribute(attrs, config.attributes.input),
|
|
@@ -3040,6 +3085,7 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
3040
3085
|
input: options.input,
|
|
3041
3086
|
tags: options.tags ?? [],
|
|
3042
3087
|
outputs: {},
|
|
3088
|
+
outputColumnOverrides: {},
|
|
3043
3089
|
assertionFailures: [],
|
|
3044
3090
|
logs: [],
|
|
3045
3091
|
spans: [],
|
|
@@ -3089,6 +3135,11 @@ function recordOpIfActive(scope, op) {
|
|
|
3089
3135
|
const top = scope.recordingStack.at(-1);
|
|
3090
3136
|
if (top) top.ops.push(op);
|
|
3091
3137
|
}
|
|
3138
|
+
/**
 * Normalize the optional third argument of `setEvalOutput` into a column
 * override object.
 *
 * - `undefined` or `null` means "no override" and yields `undefined`, so a
 *   `null` is never stored as an override and dereferenced later.
 * - A string is shorthand for `{ format: <string> }`.
 * - Any other value is returned unchanged.
 *
 * @param {string | object | null | undefined} options - Format shorthand or
 *   full column override.
 * @returns {object | undefined} The column override, or `undefined` when none
 *   was given.
 */
function normalizeEvalOutputOptions(options) {
  if (options == null) return void 0;
  if (typeof options === "string") return { format: options };
  return options;
}
|
|
3092
3143
|
function toAssertionFailure$1(message, error = void 0) {
|
|
3093
3144
|
const name = error?.name;
|
|
3094
3145
|
const stack = error?.stack ? stripTerminalControlCodes$1(error.stack) : void 0;
|
|
@@ -3103,15 +3154,22 @@ function toAssertionFailure$1(message, error = void 0) {
|
|
|
3103
3154
|
*
|
|
3104
3155
|
* Supported values include scalars, JSON-safe objects/arrays, explicit file
|
|
3105
3156
|
* refs, and native `Blob`/`File` instances for media or file columns.
|
|
3157
|
+
*
|
|
3158
|
+
* Pass the optional third argument to persist a display format or full column
|
|
3159
|
+
* override with this runtime output, for example `'markdown'` or
|
|
3160
|
+
* `{ label: 'Receipt', format: 'image', hideInTable: true }`.
|
|
3106
3161
|
*/
|
|
3107
|
-
function setEvalOutput(key, value) {
|
|
3162
|
+
function setEvalOutput(key, value, options = void 0) {
|
|
3108
3163
|
const scope = getCurrentScope();
|
|
3109
3164
|
if (!scope) return;
|
|
3110
3165
|
scope.outputs[key] = value;
|
|
3166
|
+
const column = normalizeEvalOutputOptions(options);
|
|
3167
|
+
if (column !== void 0) scope.outputColumnOverrides[key] = column;
|
|
3111
3168
|
recordOpIfActive(scope, {
|
|
3112
3169
|
kind: "setOutput",
|
|
3113
3170
|
key,
|
|
3114
|
-
value
|
|
3171
|
+
value,
|
|
3172
|
+
column
|
|
3115
3173
|
});
|
|
3116
3174
|
}
|
|
3117
3175
|
/**
|
|
@@ -4110,6 +4168,7 @@ function replayRecording(scope, parentSpan, recording, options) {
|
|
|
4110
4168
|
function applyRecordingOp(scope, parentSpan, op, options) {
|
|
4111
4169
|
if (op.kind === "setOutput") {
|
|
4112
4170
|
scope.outputs[op.key] = op.value;
|
|
4171
|
+
if (op.column !== void 0) scope.outputColumnOverrides[op.key] = op.column;
|
|
4113
4172
|
return;
|
|
4114
4173
|
}
|
|
4115
4174
|
if (op.kind === "appendOutput") {
|
|
@@ -5758,6 +5817,27 @@ function buildDeclaredColumnDefs(overrides, scores, manualScores) {
|
|
|
5758
5817
|
return [...declaredDefs.values()];
|
|
5759
5818
|
}
|
|
5760
5819
|
/**
|
|
5820
|
+
* Build runtime column definitions from output-level display overrides.
|
|
5821
|
+
*
|
|
5822
|
+
* These definitions are persisted on case rows/details so `setOutput(...)`
|
|
5823
|
+
* can format one-off outputs without adding them to eval discovery metadata.
|
|
5824
|
+
*/
|
|
5825
|
+
function buildRuntimeOutputColumnDefs(columns, overrides, configuredColumnKeys = /* @__PURE__ */ new Set()) {
  const runtimeDefs = [];
  for (const [key, override] of Object.entries(overrides)) {
    // Only outputs that produced a cell get a runtime definition.
    if (columns[key] === void 0) continue;
    // Keys listed in `configuredColumnKeys` are skipped.
    if (configuredColumnKeys.has(key)) continue;
    // An explicit format decides the kind first; otherwise a number format
    // implies "number", and failing that the kind is inferred from the cell.
    const inferredKind = inferKindFromFormat(override.format) ?? (override.numberFormat === void 0 ? inferKind(columns[key]) : "number");
    runtimeDefs.push(createColumnDef({
      key,
      override,
      inferredKind,
      isScore: false,
      isManualScore: false
    }));
  }
  return runtimeDefs;
}
|
|
5834
|
+
/**
 * Infer a `ColumnKind` from the JavaScript type of a runtime value.
 * Numbers map to `"number"`, booleans to `"boolean"`, and every other value
 * (including `null`, `undefined`, and objects) to `"string"`.
 */
function inferKind(value) {
  switch (typeof value) {
    case "number":
      return "number";
    case "boolean":
      return "boolean";
    default:
      return "string";
  }
}
|
|
5840
|
+
/**
|
|
5761
5841
|
* Coerce an arbitrary runtime value into a serializable `CellValue`.
|
|
5762
5842
|
* Runtime values use the SDK's tagged serializer so saved run artifacts keep
|
|
5763
5843
|
* structured data instead of storing JSON strings. Native binary/file root
|
|
@@ -6101,6 +6181,10 @@ function sumNullable(values) {
|
|
|
6101
6181
|
}
|
|
6102
6182
|
return hasValue ? total : void 0;
|
|
6103
6183
|
}
|
|
6184
|
+
/**
 * Return the largest per-call turn count among `calls`.
 *
 * A call contributes its `stepCount`, with a floor of one turn; a `null` or
 * missing `stepCount` counts as one turn.
 *
 * The maximum is found with a single pass rather than
 * `Math.max(...array)`, which passes one argument per element and can throw
 * a RangeError on very large arrays.
 *
 * @param {Array<{ stepCount?: number | null }>} calls - Extracted LLM calls.
 * @returns {number | undefined} The maximum turn count, or `undefined` when
 *   `calls` is empty.
 */
function getMaxLlmTurns(calls) {
  if (calls.length === 0) return void 0;
  // Starting at 1 applies the one-turn floor to every call.
  let maxTurns = 1;
  for (const call of calls) {
    const turns = call.stepCount ?? 1;
    if (turns > maxTurns) maxTurns = turns;
  }
  return maxTurns;
}
|
|
6104
6188
|
function assignIfMissing(params) {
|
|
6105
6189
|
if (!params.activeKeys.has(params.key)) return;
|
|
6106
6190
|
if (params.key in params.outputs) return;
|
|
@@ -6122,7 +6206,7 @@ function addDefaultOutputs(params) {
|
|
|
6122
6206
|
assignIfMissing({
|
|
6123
6207
|
outputs: params.outputs,
|
|
6124
6208
|
key: "llmTurns",
|
|
6125
|
-
value: calls
|
|
6209
|
+
value: getMaxLlmTurns(calls),
|
|
6126
6210
|
activeKeys
|
|
6127
6211
|
});
|
|
6128
6212
|
assignIfMissing({
|
|
@@ -6526,7 +6610,7 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6526
6610
|
}
|
|
6527
6611
|
}
|
|
6528
6612
|
async function runCase(params) {
|
|
6529
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6613
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6530
6614
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
6531
6615
|
evalId,
|
|
6532
6616
|
evalFilePath,
|
|
@@ -6694,6 +6778,12 @@ async function runCase(params) {
|
|
|
6694
6778
|
if (cell !== void 0) columns[key] = cell;
|
|
6695
6779
|
}
|
|
6696
6780
|
for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
|
|
6781
|
+
const outputColumnDefs = buildRuntimeOutputColumnDefs(columns, scope.outputColumnOverrides, new Set(Object.keys(mergeDefaultColumns({
|
|
6782
|
+
globalColumns,
|
|
6783
|
+
columns: evalDef.columns,
|
|
6784
|
+
globalRemove: globalRemoveDefaultConfig,
|
|
6785
|
+
evalRemove: evalDef.removeDefaultConfig
|
|
6786
|
+
}) ?? {})));
|
|
6697
6787
|
const errorInfo = nonAssertError ? {
|
|
6698
6788
|
name: nonAssertError.name,
|
|
6699
6789
|
message: nonAssertError.message,
|
|
@@ -6710,6 +6800,7 @@ async function runCase(params) {
|
|
|
6710
6800
|
trace: displayTrace,
|
|
6711
6801
|
traceDisplay,
|
|
6712
6802
|
columns,
|
|
6803
|
+
...outputColumnDefs.length > 0 ? { outputColumnDefs } : {},
|
|
6713
6804
|
assertionFailures: scope.assertionFailures,
|
|
6714
6805
|
logs: scope.logs,
|
|
6715
6806
|
error: errorInfo,
|
|
@@ -6728,7 +6819,8 @@ async function runCase(params) {
|
|
|
6728
6819
|
durationMs: elapsedMs,
|
|
6729
6820
|
cacheHits: cacheHits.length,
|
|
6730
6821
|
cacheOperations: cacheEntries.length,
|
|
6731
|
-
columns
|
|
6822
|
+
columns,
|
|
6823
|
+
...outputColumnDefs.length > 0 ? { outputColumnDefs } : {}
|
|
6732
6824
|
}
|
|
6733
6825
|
};
|
|
6734
6826
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-
|
|
1
|
+
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-Ck0mqxd-.mjs";
|
|
2
|
+
import "./src-B3iq-tuv.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B3hEOT_I.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import "./cli-Ck0mqxd-.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.56.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -318,6 +318,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
318
318
|
- `setEvalOutput(key, value)` writes reviewable data for the case. Values are
|
|
319
319
|
stored as received: primitives, objects/arrays, explicit file refs, and
|
|
320
320
|
native `Blob`/`File` values. `columns.format` only controls visualization.
|
|
321
|
+
Inside `execute`, `setOutput(key, value, formatOrOverride)` can attach a
|
|
322
|
+
display hint directly to a runtime output, e.g. `'markdown'` or
|
|
323
|
+
`{ label: 'Receipt', format: 'image', hideInTable: true }`. Authored
|
|
324
|
+
global/eval `columns` for the same key take precedence over that runtime
|
|
325
|
+
hint.
|
|
321
326
|
Non-JSON runtime values such as `Date`, `Map`, `Set`, `BigInt`, typed arrays,
|
|
322
327
|
and class instances use the tagged value serializer instead of a string
|
|
323
328
|
fallback. Native `Blob`/`File` values are copied to run artifacts because
|
|
@@ -363,8 +368,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
363
368
|
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
|
|
364
369
|
summarized for review. Defaults to `kind: 'llm'` spans with `model`,
|
|
365
370
|
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
366
|
-
attribute paths. `
|
|
367
|
-
|
|
371
|
+
attribute paths. The default `steps` path reads an array from
|
|
372
|
+
`span.attributes.steps`; if it is missing, direct child `model_step` spans are
|
|
373
|
+
shown as that call's steps. `latencyMs` is time to first token; duration,
|
|
374
|
+
total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
|
|
375
|
+
to broaden the filter,
|
|
368
376
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
369
377
|
model-keyed `pricing` to derive USD costs from token counts, with nested
|
|
370
378
|
`providers` entries for provider-specific rates, add `costCurrencies` to show
|
|
@@ -385,7 +393,9 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
385
393
|
tokens, and output tokens separately and use `dedupeConsecutiveValues: true`
|
|
386
394
|
to skip repeated adjacent chart values. `totalTokens` is input + output only;
|
|
387
395
|
cache read/write tokens stay separate and affect `costUsd` at their own
|
|
388
|
-
rates.
|
|
396
|
+
rates. `llmTurns` is the maximum per-call turn count in the case run, using
|
|
397
|
+
configured steps when available and otherwise one turn per matched LLM call
|
|
398
|
+
span.
|
|
389
399
|
Derived base input cost uses `inputTokens - cachedInputTokens -
|
|
390
400
|
cacheCreationInputTokens` so cache details are not double-counted.
|
|
391
401
|
`cacheCreationInputTokens` is the total cache-write count; optional
|