@ls-stack/agent-eval 0.58.3 → 0.58.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-ROCEce9X.mjs → app-DLNmRUqH.mjs} +64 -12
- package/dist/apps/web/dist/assets/index-BD6FXk5p.js +377 -0
- package/dist/apps/web/dist/assets/index-C2fbGEsB.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +4 -3
- package/dist/{cli-SP4kEtYL.mjs → cli-ClAkjTvo.mjs} +8 -7
- package/dist/index.d.mts +91 -79
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +5 -4
- package/dist/{runExecution-CFw0MQFs.mjs → runExecution-BMnJXWhN.mjs} +139 -14
- package/dist/{runOrchestration-CxjiQmof.mjs → runOrchestration-CvbTAoEb.mjs} +2 -2
- package/dist/{runner-CY3bgsjU.mjs → runner-BfHgVhGS.mjs} +2 -2
- package/dist/{runner-BlFQyvN2.mjs → runner-DJJekv9f.mjs} +1 -1
- package/dist/{src-7GbQj1sb.mjs → src-DfzidkYr.mjs} +2 -2
- package/package.json +2 -2
- package/skills/agent-eval/SKILL.md +15 -7
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +0 -1
- package/dist/apps/web/dist/assets/index-PTikBbhf.js +0 -377
|
@@ -2,6 +2,7 @@ import { createRequire, registerHooks } from "node:module";
|
|
|
2
2
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
3
3
|
import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
|
|
4
4
|
import { z, z as z$1 } from "zod/v4";
|
|
5
|
+
import { resultify } from "t-result";
|
|
5
6
|
import dayjs from "dayjs";
|
|
6
7
|
import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
|
|
7
8
|
import { mkdir, readFile, readdir, rename, rm, writeFile } from "node:fs/promises";
|
|
@@ -10,7 +11,6 @@ import { createHash, randomUUID } from "node:crypto";
|
|
|
10
11
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
11
12
|
import { existsSync } from "node:fs";
|
|
12
13
|
import { brotliCompressSync, brotliDecompressSync } from "node:zlib";
|
|
13
|
-
import { resultify } from "t-result";
|
|
14
14
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
15
15
|
//#region ../sdk/src/defineEval.ts
|
|
16
16
|
const evalRegistry = /* @__PURE__ */ new Map();
|
|
@@ -1401,6 +1401,10 @@ function resolveApiCallsConfig(input) {
|
|
|
1401
1401
|
metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
|
|
1402
1402
|
};
|
|
1403
1403
|
}
|
|
1404
|
+
/**
 * Zod schema for the `cache.maxEntries` option.
 *
 * Accepts either a bare number or an object with an optional numeric
 * `default` and an optional `namespaces` record (string keys, numeric values).
 * The option as a whole may be omitted.
 */
const cacheMaxEntriesSchema = z.optional(z.union([
  z.number(),
  z.object({
    default: z.optional(z.number()),
    namespaces: z.optional(z.record(z.string(), z.number()))
  })
]));
|
|
1404
1408
|
/** Zod schema for validating `agent-evals.config.ts` input. */
|
|
1405
1409
|
const agentEvalsConfigSchema = z.object({
|
|
1406
1410
|
workspaceRoot: z.string().optional(),
|
|
@@ -1424,11 +1428,26 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1424
1428
|
cache: z.object({
|
|
1425
1429
|
enabled: z.boolean().optional(),
|
|
1426
1430
|
dir: z.string().optional(),
|
|
1427
|
-
|
|
1431
|
+
maxEntries: cacheMaxEntriesSchema,
|
|
1432
|
+
maxEntriesPerNamespace: z.number().optional(),
|
|
1428
1433
|
maxEntriesByNamespace: z.record(z.string(), z.number()).optional(),
|
|
1429
1434
|
pruneIdleDelayMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1430
1435
|
lastAccessedAtUpdateIntervalMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
|
|
1431
|
-
maxEntriesPerEval: z.
|
|
1436
|
+
maxEntriesPerEval: z.number().optional()
|
|
1437
|
+
}).transform(({ maxEntries, maxEntriesByNamespace, maxEntriesPerEval, maxEntriesPerNamespace, ...cache }) => {
|
|
1438
|
+
const defaultMaxEntries = maxEntriesPerNamespace ?? maxEntriesPerEval;
|
|
1439
|
+
if (maxEntries !== void 0) return {
|
|
1440
|
+
...cache,
|
|
1441
|
+
maxEntries
|
|
1442
|
+
};
|
|
1443
|
+
if (defaultMaxEntries !== void 0 || maxEntriesByNamespace !== void 0) return {
|
|
1444
|
+
...cache,
|
|
1445
|
+
maxEntries: {
|
|
1446
|
+
default: defaultMaxEntries,
|
|
1447
|
+
namespaces: maxEntriesByNamespace
|
|
1448
|
+
}
|
|
1449
|
+
};
|
|
1450
|
+
return cache;
|
|
1432
1451
|
}).optional()
|
|
1433
1452
|
});
|
|
1434
1453
|
//#endregion
|
|
@@ -1884,7 +1903,7 @@ function getEvalTitle(evalLike) {
|
|
|
1884
1903
|
}
|
|
1885
1904
|
//#endregion
|
|
1886
1905
|
//#region ../shared/src/utils/getNestedAttribute.ts
|
|
1887
|
-
function isRecord$
|
|
1906
|
+
/**
 * Reports whether `value` is a non-null object.
 * Arrays count as records here; only primitives, functions and `null` are rejected.
 */
function isRecord$6(value) {
  if (value === null) return false;
  return typeof value === "object";
}
|
|
1890
1909
|
/**
|
|
@@ -1899,14 +1918,14 @@ function getNestedAttribute(value, path) {
|
|
|
1899
1918
|
const parts = path.split(".");
|
|
1900
1919
|
let current = value;
|
|
1901
1920
|
for (const part of parts) {
|
|
1902
|
-
if (!isRecord$
|
|
1921
|
+
if (!isRecord$6(current) || !(part in current)) return;
|
|
1903
1922
|
current = current[part];
|
|
1904
1923
|
}
|
|
1905
1924
|
return current;
|
|
1906
1925
|
}
|
|
1907
1926
|
//#endregion
|
|
1908
1927
|
//#region ../shared/src/utils/deriveCallAttributes.ts
|
|
1909
|
-
function isRecord$
|
|
1928
|
+
/**
 * Reports whether `value` is a non-null object.
 * Arrays are accepted; primitives, functions and `null` are not.
 */
function isRecord$5(value) {
  const isObjectLike = typeof value === "object";
  return isObjectLike && value !== null;
}
|
|
1912
1931
|
function mergeNestedAttribute$1(value, path, attributeValue) {
|
|
@@ -1919,7 +1938,7 @@ function mergeNestedAttribute$1(value, path, attributeValue) {
|
|
|
1919
1938
|
continue;
|
|
1920
1939
|
}
|
|
1921
1940
|
const nextValue = current[part];
|
|
1922
|
-
const nextRecord = isRecord$
|
|
1941
|
+
const nextRecord = isRecord$5(nextValue) ? { ...nextValue } : {};
|
|
1923
1942
|
current[part] = nextRecord;
|
|
1924
1943
|
current = nextRecord;
|
|
1925
1944
|
}
|
|
@@ -1944,7 +1963,7 @@ function applyDerivedAttributesForKind(params) {
|
|
|
1944
1963
|
return;
|
|
1945
1964
|
}
|
|
1946
1965
|
})();
|
|
1947
|
-
if (!isRecord$
|
|
1966
|
+
if (!isRecord$5(values)) continue;
|
|
1948
1967
|
for (const [path, value] of Object.entries(values)) {
|
|
1949
1968
|
if (value === void 0) continue;
|
|
1950
1969
|
attributes = mergeNestedAttribute$1(attributes, path, value);
|
|
@@ -2006,6 +2025,13 @@ function readString$2(attributes, path) {
|
|
|
2006
2025
|
const raw = getNestedAttribute(attributes, path);
|
|
2007
2026
|
return typeof raw === "string" && raw.length > 0 ? raw : null;
|
|
2008
2027
|
}
|
|
2028
|
+
/**
 * Reports whether `value` is a non-null, non-array object.
 * Unlike a plain `typeof` check, arrays are explicitly rejected.
 */
function isRecord$4(value) {
  if (value === null || typeof value !== "object") return false;
  return !Array.isArray(value);
}
|
|
2031
|
+
/**
 * Reads `key` from `value` when `value` is a non-null, non-array object.
 *
 * @param {unknown} value Candidate container.
 * @param {string} key Property name to read.
 * @returns {unknown} The property value, or `undefined` when `value` is not
 *   such an object (or the property is absent).
 */
function readRecordValue(value, key) {
  const isPlainRecord = typeof value === "object" && value !== null && !Array.isArray(value);
  return isPlainRecord ? value[key] : void 0;
}
|
|
2009
2035
|
function computeTokenCost(tokens, usdPerMillion) {
|
|
2010
2036
|
if (tokens === null) return null;
|
|
2011
2037
|
if (tokens === 0) return 0;
|
|
@@ -2216,6 +2242,76 @@ function buildModelStepsByParent(spans) {
|
|
|
2216
2242
|
}
|
|
2217
2243
|
return stepsByParent;
|
|
2218
2244
|
}
|
|
2245
|
+
/**
 * Groups spans by the id of their parent span.
 *
 * Spans whose `parentId` is `null` or missing are treated as roots and
 * skipped, so they never create a bogus `null`/`undefined` bucket.
 *
 * @param {Iterable<object>} spans Spans to index; each is read for `parentId`.
 * @returns {Map<unknown, object[]>} Parent id → direct children, in input order.
 */
function buildChildrenByParent(spans) {
  const childrenByParent = /* @__PURE__ */ new Map();
  for (const span of spans) {
    // `== null` deliberately matches both `null` and `undefined` parent ids.
    if (span.parentId == null) continue;
    const siblings = childrenByParent.get(span.parentId);
    if (siblings === void 0) childrenByParent.set(span.parentId, [span]);
    else siblings.push(span);
  }
  return childrenByParent;
}
|
|
2258
|
+
/**
 * Appends tool-call value(s) to `out`.
 *
 * Arrays are flattened one level (each element is appended); `null` and
 * `undefined` are ignored; any other value is appended as a single entry.
 *
 * @param {unknown[]} out Accumulator, mutated in place.
 * @param {unknown} value Array of tool calls, a single tool call, or nothing.
 * @returns {boolean} `true` when at least one entry was appended.
 */
function appendToolCallValues(out, value) {
  if (value === void 0 || value === null) return false;
  if (!Array.isArray(value)) {
    out.push(value);
    return true;
  }
  // Append element by element: `out.push(...value)` passes every element as a
  // call argument and throws a RangeError once the array is large enough.
  for (const item of value) out.push(item);
  return value.length > 0;
}
|
|
2267
|
+
/**
 * Parses a JSON string and returns it only when it decodes to a non-array
 * object.
 *
 * @param {unknown} value Candidate JSON text.
 * @returns {object | null} The decoded object, or `null` when `value` is not
 *   a string, is not valid JSON, or decodes to anything other than a
 *   non-null, non-array object.
 */
function parseJsonRecord(value) {
  if (typeof value !== "string") return null;
  let decoded;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Malformed JSON is an expected input here; report "no record".
    return null;
  }
  const isPlainRecord = typeof decoded === "object" && decoded !== null && !Array.isArray(decoded);
  return isPlainRecord ? decoded : null;
}
|
|
2273
|
+
/**
 * Reads the JSON string stored at `step.attributes.genAI["mastra.model_step.output"]`
 * and parses it.
 *
 * @param {unknown} step Step value to inspect.
 * @returns {object | null} Whatever `parseJsonRecord` yields for that value.
 */
function readMastraModelStepOutput(step) {
  const attributes = readRecordValue(step, "attributes");
  const genAI = readRecordValue(attributes, "genAI");
  const serializedOutput = readRecordValue(genAI, "mastra.model_step.output");
  return parseJsonRecord(serializedOutput);
}
|
|
2276
|
+
/**
 * Reports whether `value` is a non-array object whose `id`, `kind` and `name`
 * properties are all strings.
 */
function isTraceSpan(value) {
  if (typeof value !== "object" || value === null || Array.isArray(value)) return false;
  // Checked in the order id → kind → name, stopping at the first non-string.
  return ["id", "kind", "name"].every((field) => typeof value[field] === "string");
}
|
|
2279
|
+
/**
 * Builds a tool-call entry from a span.
 *
 * Copies `id`, `name`, `kind` and `status` from the span, reads `input` and
 * `output` from the span attributes, and reads `arguments` / `result` from
 * the `gen_ai.tool.call.*` keys under `attributes.genAI`.
 *
 * @param {object} span Span to convert.
 * @returns {object} The tool-call entry.
 */
function toolCallSpanToEntry(span) {
  const spanAttributes = span.attributes;
  const genAIAttributes = readRecordValue(spanAttributes, "genAI");
  const readGenAI = (key) => readRecordValue(genAIAttributes, key);
  const entry = {
    id: span.id,
    name: span.name,
    kind: span.kind,
    status: span.status,
    input: getNestedAttribute(spanAttributes, "input"),
    output: getNestedAttribute(spanAttributes, "output"),
    arguments: readGenAI("gen_ai.tool.call.arguments"),
    result: readGenAI("gen_ai.tool.call.result")
  };
  return entry;
}
|
|
2293
|
+
/**
 * Appends every tool call that can be found for one step to `out`.
 *
 * Sources, in order:
 * 1. `toolCalls`, `output.toolCalls` and `attributes.output.toolCalls` on the
 *    step itself (all three are always read).
 * 2. Only when none of those yielded anything: `toolCalls` inside the parsed
 *    Mastra model-step output.
 * 3. When the step is a trace span: its direct children of kind `tool_call`,
 *    converted with `toolCallSpanToEntry`.
 *
 * @param {object} params
 * @param {unknown[]} params.out Accumulator, mutated in place.
 * @param {unknown} params.step Authored step object or `model_step` span.
 * @param {Map<unknown, object[]>} params.childrenByParent Parent id → child spans.
 * @returns {void}
 */
function appendToolCallsFromStep({ out, step, childrenByParent }) {
  let foundStepCalls = false;
  for (const path of ["toolCalls", "output.toolCalls", "attributes.output.toolCalls"]) {
    if (appendToolCallValues(out, getNestedAttribute(step, path))) foundStepCalls = true;
  }
  if (!foundStepCalls) {
    // Parse the serialized Mastra output lazily: it is only a fallback, so
    // there is no point JSON-parsing it when step-level calls already exist.
    const mastraOutput = readMastraModelStepOutput(step);
    if (mastraOutput !== null) appendToolCallValues(out, mastraOutput.toolCalls);
  }
  if (!isTraceSpan(step)) return;
  const children = childrenByParent.get(step.id);
  if (children === void 0) return;
  // Push one entry at a time instead of spreading into `push`, which would
  // turn every child into a call argument.
  for (const child of children) {
    if (child.kind === "tool_call") out.push(toolCallSpanToEntry(child));
  }
}
|
|
2305
|
+
/**
 * Collects the tool calls of one call span.
 *
 * Starts with the value at `path` inside `attributes`, then — unless
 * `stepDetails` is `null` — adds the tool calls of every step.
 *
 * @param {object} params
 * @param {unknown} params.attributes Span attributes to read `path` from.
 * @param {string} params.path Attribute path of the configured tool calls.
 * @param {Iterable<unknown> | null} params.stepDetails Steps of the call, or `null`.
 * @param {Map<unknown, object[]>} params.childrenByParent Parent id → child spans.
 * @returns {unknown[] | undefined} The collected tool calls, or `undefined`
 *   when none were found.
 */
function readToolCalls({ attributes, path, stepDetails, childrenByParent }) {
  const collected = [];
  appendToolCallValues(collected, getNestedAttribute(attributes, path));
  if (stepDetails !== null) {
    for (const step of stepDetails) {
      appendToolCallsFromStep({
        out: collected,
        step,
        childrenByParent
      });
    }
  }
  if (collected.length === 0) return void 0;
  return collected;
}
|
|
2219
2315
|
function collectWarnings$1(span) {
|
|
2220
2316
|
const out = [];
|
|
2221
2317
|
if (span.warning) out.push(span.warning);
|
|
@@ -2259,6 +2355,7 @@ function pickError$1(span) {
|
|
|
2259
2355
|
function extractLlmCalls(spans, config) {
|
|
2260
2356
|
const kindSet = new Set(config.kinds);
|
|
2261
2357
|
const modelStepsByParent = buildModelStepsByParent(spans);
|
|
2358
|
+
const childrenByParent = buildChildrenByParent(spans);
|
|
2262
2359
|
const result = [];
|
|
2263
2360
|
for (const span of spans) {
|
|
2264
2361
|
if (!kindSet.has(span.kind)) continue;
|
|
@@ -2317,6 +2414,8 @@ function extractLlmCalls(spans, config) {
|
|
|
2317
2414
|
placements: metric.placements
|
|
2318
2415
|
});
|
|
2319
2416
|
}
|
|
2417
|
+
const childModelSteps = modelStepsByParent.get(span.id) ?? [];
|
|
2418
|
+
const stepInfo = readSteps(attrs, config.attributes.steps, childModelSteps);
|
|
2320
2419
|
result.push({
|
|
2321
2420
|
id: span.id,
|
|
2322
2421
|
name: span.name,
|
|
@@ -2344,13 +2443,18 @@ function extractLlmCalls(spans, config) {
|
|
|
2344
2443
|
cachedInputCostUsd,
|
|
2345
2444
|
cacheCreationInputCostUsd,
|
|
2346
2445
|
reasoningCostUsd,
|
|
2347
|
-
...
|
|
2446
|
+
...stepInfo,
|
|
2348
2447
|
finishReason: readString$2(attrs, config.attributes.finishReason),
|
|
2349
2448
|
durationMs,
|
|
2350
2449
|
input: getNestedAttribute(attrs, config.attributes.input),
|
|
2351
2450
|
output: getNestedAttribute(attrs, config.attributes.output),
|
|
2352
2451
|
reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
|
|
2353
|
-
toolCalls:
|
|
2452
|
+
toolCalls: readToolCalls({
|
|
2453
|
+
attributes: attrs,
|
|
2454
|
+
path: config.attributes.toolCalls,
|
|
2455
|
+
stepDetails: stepInfo.stepDetails,
|
|
2456
|
+
childrenByParent
|
|
2457
|
+
}),
|
|
2354
2458
|
metrics,
|
|
2355
2459
|
warnings: collectWarnings$1(span),
|
|
2356
2460
|
error: pickError$1(span)
|
|
@@ -5038,6 +5142,9 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5038
5142
|
visit(null);
|
|
5039
5143
|
return result;
|
|
5040
5144
|
};
|
|
5145
|
+
const isToolCallSpan = (span) => {
|
|
5146
|
+
return span.kind === "tool" || span.kind === "tool_call";
|
|
5147
|
+
};
|
|
5041
5148
|
const filterSpanNames = (sourceSpans, kind) => {
|
|
5042
5149
|
return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
|
|
5043
5150
|
};
|
|
@@ -5057,13 +5164,18 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5057
5164
|
return spans.filter((s) => s.kind === kind);
|
|
5058
5165
|
},
|
|
5059
5166
|
findToolCallSpans() {
|
|
5060
|
-
return spans.filter(
|
|
5167
|
+
return spans.filter(isToolCallSpan);
|
|
5061
5168
|
},
|
|
5062
5169
|
listToolCallSpanNames() {
|
|
5063
|
-
return
|
|
5170
|
+
return spans.filter(isToolCallSpan).map((span) => span.name);
|
|
5064
5171
|
},
|
|
5065
5172
|
hasToolCallSpan(name) {
|
|
5066
|
-
return spans.some((s) => s
|
|
5173
|
+
return spans.some((s) => isToolCallSpan(s) && s.name === name);
|
|
5174
|
+
},
|
|
5175
|
+
hasNToolCallSpans(toolName, expectedCalls) {
|
|
5176
|
+
return spans.filter((span) => {
|
|
5177
|
+
return isToolCallSpan(span) && span.name === toolName;
|
|
5178
|
+
}).length === expectedCalls;
|
|
5067
5179
|
},
|
|
5068
5180
|
listSpanNames(kind) {
|
|
5069
5181
|
return filterSpanNames(spans, kind);
|
|
@@ -5078,6 +5190,19 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5078
5190
|
};
|
|
5079
5191
|
}
|
|
5080
5192
|
//#endregion
|
|
5193
|
+
//#region ../runner/src/cacheConfig.ts
|
|
5194
|
+
/**
 * Translates `cacheConfig.maxEntries` into retention options.
 *
 * A numeric `maxEntries` becomes the per-namespace cap with no
 * namespace-specific overrides. Otherwise `maxEntries.default` and
 * `maxEntries.namespaces` are passed through (both `undefined` when
 * `maxEntries` or the config itself is absent).
 *
 * @param {{ maxEntries?: number | { default?: number, namespaces?: object } } | null | undefined} cacheConfig
 * @returns {{ maxEntriesPerNamespace: number | undefined, maxEntriesByNamespace: object | undefined }}
 */
function getCacheRetentionOptions(cacheConfig) {
  const maxEntries = cacheConfig?.maxEntries;
  const isSingleCap = typeof maxEntries === "number";
  return {
    maxEntriesPerNamespace: isSingleCap ? maxEntries : maxEntries?.default,
    maxEntriesByNamespace: isSingleCap ? void 0 : maxEntries?.namespaces
  };
}
|
|
5205
|
+
//#endregion
|
|
5081
5206
|
//#region ../runner/src/cacheAccessTime.ts
|
|
5082
5207
|
const defaultLastAccessedAtUpdateIntervalMs = 14400 * 1e3;
|
|
5083
5208
|
function normalizeLastAccessedAtUpdateIntervalMs(value) {
|
|
@@ -7026,4 +7151,4 @@ function recordAssertionFailure(scope, failure) {
|
|
|
7026
7151
|
});
|
|
7027
7152
|
}
|
|
7028
7153
|
//#endregion
|
|
7029
|
-
export {
|
|
7154
|
+
export { setEvalOutput as $, serializeCacheValue as A, evalStatAggregateSchema as At, evalLog as B, evalSpan as C, resolveApiCallsConfig as Ct, deserializeCacheRecording as D, getCaseRowCaseKey as Dt, hashCacheKeySync as E, buildEvalKey as Et, EvalAssertionError as F, defineEval as Ft, isInEvalScope as G, getCurrentScope as H, EvalRuntimeUsageError as I, getEvalRegistry as It, nextEvalId as J, matchesEvalTags as K, appendToEvalOutput as L, runWithEvalRegistry as Lt, manualInputFileValueSchema as M, manualInputDescriptorSchema as Mt, readManualInputFile as N, evalChartsConfigSchema as Nt, deserializeCacheValue as O, caseDetailSchema as Ot, evalExpect as P, columnDefSchema as Pt, runWithEvalClock as Q, configureEvalRunLogs as R, captureEvalSpanError as S, runSummarySchema as St, hashCacheKey as T, buildCaseKey as Tt, getEvalCaseInput as U, evalTime as V, incrementEvalOutput as W, runInEvalScope as X, runInEvalRuntimeScope as Y, runInExistingEvalScope as Z, createBufferedCacheStore as _, dedupeEvalTags as _t, isCaseChildParentMessage as a, extractCacheHits as at, z$1 as b, validateTagsFilterExpression as bt, resolveArtifactPath as c, simulateLlmCallCost as ct, loadEvalModule as d, getNestedAttribute as dt, setScopeCacheContext as et, resolveEvalDefaultConfig as f, getEvalTitle as ft, commitPendingCacheWrites as g, deriveStatusFromChildStatuses as gt, normalizeScoreDef as h, deriveStatusFromCaseRows as ht, isCaseChildMessage as i, extractCacheEntries as it, repoFile as j, evalStatsConfigSchema as jt, serializeCacheRecording as k, caseRowSchema as kt, registerAgentEvalsPackageResolutionHooks as l, simulateTokenAllocation as lt, buildDeclaredColumnDefs as m, deriveScopedSummaryFromCases as mt, resolveRunnableEvalCases as n, createRunRequestSchema as nt, stripTerminalControlCodes as o, extractApiCalls as ot, loadConfig as p, getEvalDisplayStatus as pt, mergeEvalOutput as q, runCase as r, updateManualScoreRequestSchema as rt, resolveTracePresentation as s, extractLlmCalls 
as st, filterEvalCases as t, startEvalBackgroundJob as tt, runWithModuleIsolation as u, applyDerivedCallAttributes as ut, createFsCacheStore as v, matchesTagsFilter as vt, evalTracer as w, resolveLlmCallsConfig as wt, buildTraceTree as x, runManifestSchema as xt, getCacheRetentionOptions as y, validateEvalTagName as yt, evalAssert as z };
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { Dt as
|
|
1
|
+
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-BMnJXWhN.mjs";
|
|
2
|
+
import { Result, resultify } from "t-result";
|
|
2
3
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
4
|
import { dirname, join } from "node:path";
|
|
4
5
|
import { existsSync } from "node:fs";
|
|
5
|
-
import { Result, resultify } from "t-result";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { spawn } from "node:child_process";
|
|
8
8
|
//#region ../runner/src/chartValidation.ts
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-ClAkjTvo.mjs";
|
|
2
|
+
import "./src-DfzidkYr.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-BfHgVhGS.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-BMnJXWhN.mjs";
|
|
2
|
+
import "./cli-ClAkjTvo.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.58.
|
|
3
|
+
"version": "0.58.5",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -32,8 +32,8 @@
|
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
|
-
"@agent-evals/runner": "0.0.1",
|
|
36
35
|
"@agent-evals/sdk": "0.0.1",
|
|
36
|
+
"@agent-evals/runner": "0.0.1",
|
|
37
37
|
"@agent-evals/shared": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
@@ -374,8 +374,12 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
374
374
|
Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
|
|
375
375
|
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
376
376
|
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|
|
377
|
-
`trace.hasToolCallSpan(name)`,
|
|
378
|
-
`trace.
|
|
377
|
+
`trace.hasToolCallSpan(name)`,
|
|
378
|
+
`trace.hasNToolCallSpans(toolName, expectedCalls)`,
|
|
379
|
+
`trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and
|
|
380
|
+
`trace.flattenDfs()`.
|
|
381
|
+
The tool-call helpers include both `kind: 'tool'` spans and imported
|
|
382
|
+
execution spans recorded as `kind: 'tool_call'`.
|
|
379
383
|
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
380
384
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
381
385
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|
|
@@ -385,9 +389,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
385
389
|
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
386
390
|
attribute paths. The default `steps` path reads an array from
|
|
387
391
|
`span.attributes.steps`; if it is missing, direct child `model_step` spans are
|
|
388
|
-
shown as that call's steps.
|
|
389
|
-
|
|
390
|
-
|
|
392
|
+
shown as that call's steps. Tool calls are aggregated from the configured
|
|
393
|
+
`toolCalls` path plus step-level `toolCalls` on authored step arrays or
|
|
394
|
+
direct `model_step` child spans, including Mastra's serialized
|
|
395
|
+
`mastra.model_step.output` format, and child `tool_call` execution spans
|
|
396
|
+
under each model step. `latencyMs` is time to first token; duration, total
|
|
397
|
+
tokens, output tokens/sec, and USD costs are derived. Override `kinds` to
|
|
398
|
+
broaden the filter,
|
|
391
399
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
392
400
|
model-keyed `pricing` to derive USD costs from token counts, with nested
|
|
393
401
|
`providers` entries for provider-specific rates, add `costCurrencies` to show
|
|
@@ -546,8 +554,8 @@ Mental model:
|
|
|
546
554
|
JSON blob refs. Each namespace is capped at 100 entries by default. The runner
|
|
547
555
|
prunes least recently accessed indexed entries after a run finishes and the
|
|
548
556
|
runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
|
|
549
|
-
`cache.
|
|
550
|
-
`
|
|
557
|
+
`cache.maxEntries` as a number for the default cap, or as
|
|
558
|
+
`{ default, namespaces }` for exact namespace-specific caps.
|
|
551
559
|
- Unindexed legacy cache files are ignored by normal lookup/listing. Use
|
|
552
560
|
`agent-evals cache repair` to remove unindexed cache files, stale index rows,
|
|
553
561
|
debug sidecars, and unreferenced blob files.
|