@ls-stack/agent-eval 0.58.4 → 0.59.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-sGeXC4AT.mjs → app-B3PEtWqH.mjs} +5 -5
- package/dist/apps/web/dist/assets/{index-BXFsxHVc.js → index-BD6FXk5p.js} +79 -79
- package/dist/apps/web/dist/assets/index-C2fbGEsB.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-Bf5RzM8O.mjs → cli-Dkp2-rBm.mjs} +5 -5
- package/dist/index.d.mts +150 -112
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-CLkC-4Z1.mjs → runExecution-C3XVZHRC.mjs} +192 -11
- package/dist/{runOrchestration-BS-WxTee.mjs → runOrchestration-B5An-AEi.mjs} +2 -2
- package/dist/{runner-Bz5ZPqmm.mjs → runner-BJXz_V_V.mjs} +1 -1
- package/dist/{runner-DW-11txl.mjs → runner-C9J-1fkp.mjs} +2 -2
- package/dist/{src-BjMMDm_O.mjs → src-8dGXUULC.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +21 -5
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +0 -1
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
1
|
+
import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Dkp2-rBm.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-8dGXUULC.mjs";
|
|
4
4
|
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-
|
|
1
|
+
import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-B5An-AEi.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -2,6 +2,7 @@ import { createRequire, registerHooks } from "node:module";
|
|
|
2
2
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
3
3
|
import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
|
|
4
4
|
import { z, z as z$1 } from "zod/v4";
|
|
5
|
+
import { resultify } from "t-result";
|
|
5
6
|
import dayjs from "dayjs";
|
|
6
7
|
import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
|
|
7
8
|
import { mkdir, readFile, readdir, rename, rm, writeFile } from "node:fs/promises";
|
|
@@ -10,7 +11,6 @@ import { createHash, randomUUID } from "node:crypto";
|
|
|
10
11
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
11
12
|
import { existsSync } from "node:fs";
|
|
12
13
|
import { brotliCompressSync, brotliDecompressSync } from "node:zlib";
|
|
13
|
-
import { resultify } from "t-result";
|
|
14
14
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
15
15
|
//#region ../sdk/src/defineEval.ts
|
|
16
16
|
const evalRegistry = /* @__PURE__ */ new Map();
|
|
@@ -1903,7 +1903,7 @@ function getEvalTitle(evalLike) {
|
|
|
1903
1903
|
}
|
|
1904
1904
|
//#endregion
|
|
1905
1905
|
//#region ../shared/src/utils/getNestedAttribute.ts
|
|
1906
|
-
function isRecord$
|
|
1906
|
+
function isRecord$6(value) {
|
|
1907
1907
|
return typeof value === "object" && value !== null;
|
|
1908
1908
|
}
|
|
1909
1909
|
/**
|
|
@@ -1918,14 +1918,14 @@ function getNestedAttribute(value, path) {
|
|
|
1918
1918
|
const parts = path.split(".");
|
|
1919
1919
|
let current = value;
|
|
1920
1920
|
for (const part of parts) {
|
|
1921
|
-
if (!isRecord$
|
|
1921
|
+
if (!isRecord$6(current) || !(part in current)) return;
|
|
1922
1922
|
current = current[part];
|
|
1923
1923
|
}
|
|
1924
1924
|
return current;
|
|
1925
1925
|
}
|
|
1926
1926
|
//#endregion
|
|
1927
1927
|
//#region ../shared/src/utils/deriveCallAttributes.ts
|
|
1928
|
-
function isRecord$
|
|
1928
|
+
function isRecord$5(value) {
|
|
1929
1929
|
return typeof value === "object" && value !== null;
|
|
1930
1930
|
}
|
|
1931
1931
|
function mergeNestedAttribute$1(value, path, attributeValue) {
|
|
@@ -1938,7 +1938,7 @@ function mergeNestedAttribute$1(value, path, attributeValue) {
|
|
|
1938
1938
|
continue;
|
|
1939
1939
|
}
|
|
1940
1940
|
const nextValue = current[part];
|
|
1941
|
-
const nextRecord = isRecord$
|
|
1941
|
+
const nextRecord = isRecord$5(nextValue) ? { ...nextValue } : {};
|
|
1942
1942
|
current[part] = nextRecord;
|
|
1943
1943
|
current = nextRecord;
|
|
1944
1944
|
}
|
|
@@ -1963,7 +1963,7 @@ function applyDerivedAttributesForKind(params) {
|
|
|
1963
1963
|
return;
|
|
1964
1964
|
}
|
|
1965
1965
|
})();
|
|
1966
|
-
if (!isRecord$
|
|
1966
|
+
if (!isRecord$5(values)) continue;
|
|
1967
1967
|
for (const [path, value] of Object.entries(values)) {
|
|
1968
1968
|
if (value === void 0) continue;
|
|
1969
1969
|
attributes = mergeNestedAttribute$1(attributes, path, value);
|
|
@@ -2025,6 +2025,13 @@ function readString$2(attributes, path) {
|
|
|
2025
2025
|
const raw = getNestedAttribute(attributes, path);
|
|
2026
2026
|
return typeof raw === "string" && raw.length > 0 ? raw : null;
|
|
2027
2027
|
}
|
|
2028
|
+
function isRecord$4(value) {
|
|
2029
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
2030
|
+
}
|
|
2031
|
+
function readRecordValue(value, key) {
|
|
2032
|
+
if (!isRecord$4(value)) return void 0;
|
|
2033
|
+
return value[key];
|
|
2034
|
+
}
|
|
2028
2035
|
function computeTokenCost(tokens, usdPerMillion) {
|
|
2029
2036
|
if (tokens === null) return null;
|
|
2030
2037
|
if (tokens === 0) return 0;
|
|
@@ -2235,6 +2242,76 @@ function buildModelStepsByParent(spans) {
|
|
|
2235
2242
|
}
|
|
2236
2243
|
return stepsByParent;
|
|
2237
2244
|
}
|
|
2245
|
+
function buildChildrenByParent(spans) {
|
|
2246
|
+
const childrenByParent = /* @__PURE__ */ new Map();
|
|
2247
|
+
for (const span of spans) {
|
|
2248
|
+
if (span.parentId === null) continue;
|
|
2249
|
+
const current = childrenByParent.get(span.parentId);
|
|
2250
|
+
if (current === void 0) {
|
|
2251
|
+
childrenByParent.set(span.parentId, [span]);
|
|
2252
|
+
continue;
|
|
2253
|
+
}
|
|
2254
|
+
current.push(span);
|
|
2255
|
+
}
|
|
2256
|
+
return childrenByParent;
|
|
2257
|
+
}
|
|
2258
|
+
function appendToolCallValues(out, value) {
|
|
2259
|
+
if (Array.isArray(value)) {
|
|
2260
|
+
out.push(...value);
|
|
2261
|
+
return value.length > 0;
|
|
2262
|
+
}
|
|
2263
|
+
if (value === void 0 || value === null) return false;
|
|
2264
|
+
out.push(value);
|
|
2265
|
+
return true;
|
|
2266
|
+
}
|
|
2267
|
+
function parseJsonRecord(value) {
|
|
2268
|
+
if (typeof value !== "string") return null;
|
|
2269
|
+
const parsed = resultify(() => JSON.parse(value));
|
|
2270
|
+
if (parsed.error || !isRecord$4(parsed.value)) return null;
|
|
2271
|
+
return parsed.value;
|
|
2272
|
+
}
|
|
2273
|
+
function readMastraModelStepOutput(step) {
|
|
2274
|
+
return parseJsonRecord(readRecordValue(readRecordValue(readRecordValue(step, "attributes"), "genAI"), "mastra.model_step.output"));
|
|
2275
|
+
}
|
|
2276
|
+
function isTraceSpan(value) {
|
|
2277
|
+
return isRecord$4(value) && typeof value.id === "string" && typeof value.kind === "string" && typeof value.name === "string";
|
|
2278
|
+
}
|
|
2279
|
+
function toolCallSpanToEntry(span) {
|
|
2280
|
+
const attrs = span.attributes;
|
|
2281
|
+
const genAI = readRecordValue(attrs, "genAI");
|
|
2282
|
+
return {
|
|
2283
|
+
id: span.id,
|
|
2284
|
+
name: span.name,
|
|
2285
|
+
kind: span.kind,
|
|
2286
|
+
status: span.status,
|
|
2287
|
+
input: getNestedAttribute(attrs, "input"),
|
|
2288
|
+
output: getNestedAttribute(attrs, "output"),
|
|
2289
|
+
arguments: readRecordValue(genAI, "gen_ai.tool.call.arguments"),
|
|
2290
|
+
result: readRecordValue(genAI, "gen_ai.tool.call.result")
|
|
2291
|
+
};
|
|
2292
|
+
}
|
|
2293
|
+
function appendToolCallsFromStep({ out, step, childrenByParent }) {
|
|
2294
|
+
let foundStepCalls = false;
|
|
2295
|
+
foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "toolCalls")) || foundStepCalls;
|
|
2296
|
+
foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "output.toolCalls")) || foundStepCalls;
|
|
2297
|
+
foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "attributes.output.toolCalls")) || foundStepCalls;
|
|
2298
|
+
const mastraOutput = readMastraModelStepOutput(step);
|
|
2299
|
+
if (!foundStepCalls && mastraOutput !== null) foundStepCalls = appendToolCallValues(out, mastraOutput.toolCalls) || foundStepCalls;
|
|
2300
|
+
if (!isTraceSpan(step)) return;
|
|
2301
|
+
const childToolSpans = childrenByParent.get(step.id)?.filter((child) => child.kind === "tool_call") ?? [];
|
|
2302
|
+
if (childToolSpans.length === 0) return;
|
|
2303
|
+
out.push(...childToolSpans.map((child) => toolCallSpanToEntry(child)));
|
|
2304
|
+
}
|
|
2305
|
+
function readToolCalls({ attributes, path, stepDetails, childrenByParent }) {
|
|
2306
|
+
const out = [];
|
|
2307
|
+
appendToolCallValues(out, getNestedAttribute(attributes, path));
|
|
2308
|
+
if (stepDetails !== null) for (const step of stepDetails) appendToolCallsFromStep({
|
|
2309
|
+
out,
|
|
2310
|
+
step,
|
|
2311
|
+
childrenByParent
|
|
2312
|
+
});
|
|
2313
|
+
return out.length > 0 ? out : void 0;
|
|
2314
|
+
}
|
|
2238
2315
|
function collectWarnings$1(span) {
|
|
2239
2316
|
const out = [];
|
|
2240
2317
|
if (span.warning) out.push(span.warning);
|
|
@@ -2278,6 +2355,7 @@ function pickError$1(span) {
|
|
|
2278
2355
|
function extractLlmCalls(spans, config) {
|
|
2279
2356
|
const kindSet = new Set(config.kinds);
|
|
2280
2357
|
const modelStepsByParent = buildModelStepsByParent(spans);
|
|
2358
|
+
const childrenByParent = buildChildrenByParent(spans);
|
|
2281
2359
|
const result = [];
|
|
2282
2360
|
for (const span of spans) {
|
|
2283
2361
|
if (!kindSet.has(span.kind)) continue;
|
|
@@ -2336,6 +2414,8 @@ function extractLlmCalls(spans, config) {
|
|
|
2336
2414
|
placements: metric.placements
|
|
2337
2415
|
});
|
|
2338
2416
|
}
|
|
2417
|
+
const childModelSteps = modelStepsByParent.get(span.id) ?? [];
|
|
2418
|
+
const stepInfo = readSteps(attrs, config.attributes.steps, childModelSteps);
|
|
2339
2419
|
result.push({
|
|
2340
2420
|
id: span.id,
|
|
2341
2421
|
name: span.name,
|
|
@@ -2363,13 +2443,18 @@ function extractLlmCalls(spans, config) {
|
|
|
2363
2443
|
cachedInputCostUsd,
|
|
2364
2444
|
cacheCreationInputCostUsd,
|
|
2365
2445
|
reasoningCostUsd,
|
|
2366
|
-
...
|
|
2446
|
+
...stepInfo,
|
|
2367
2447
|
finishReason: readString$2(attrs, config.attributes.finishReason),
|
|
2368
2448
|
durationMs,
|
|
2369
2449
|
input: getNestedAttribute(attrs, config.attributes.input),
|
|
2370
2450
|
output: getNestedAttribute(attrs, config.attributes.output),
|
|
2371
2451
|
reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
|
|
2372
|
-
toolCalls:
|
|
2452
|
+
toolCalls: readToolCalls({
|
|
2453
|
+
attributes: attrs,
|
|
2454
|
+
path: config.attributes.toolCalls,
|
|
2455
|
+
stepDetails: stepInfo.stepDetails,
|
|
2456
|
+
childrenByParent
|
|
2457
|
+
}),
|
|
2373
2458
|
metrics,
|
|
2374
2459
|
warnings: collectWarnings$1(span),
|
|
2375
2460
|
error: pickError$1(span)
|
|
@@ -5057,6 +5142,89 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5057
5142
|
visit(null);
|
|
5058
5143
|
return result;
|
|
5059
5144
|
};
|
|
5145
|
+
const isRecord = (value) => {
|
|
5146
|
+
return typeof value === "object" && value !== null;
|
|
5147
|
+
};
|
|
5148
|
+
const readRecordValue = (value, key) => {
|
|
5149
|
+
if (!isRecord(value)) return void 0;
|
|
5150
|
+
const child = value[key];
|
|
5151
|
+
return isRecord(child) ? child : void 0;
|
|
5152
|
+
};
|
|
5153
|
+
const readStringValue = (value, key) => {
|
|
5154
|
+
if (!isRecord(value)) return void 0;
|
|
5155
|
+
const child = value[key];
|
|
5156
|
+
return typeof child === "string" && child.length > 0 ? child : void 0;
|
|
5157
|
+
};
|
|
5158
|
+
const readValue = (value, key) => {
|
|
5159
|
+
if (!isRecord(value)) return void 0;
|
|
5160
|
+
return value[key];
|
|
5161
|
+
};
|
|
5162
|
+
const parseMaybeJson = (value) => {
|
|
5163
|
+
if (typeof value !== "string") return value;
|
|
5164
|
+
const parsed = resultify(() => JSON.parse(value));
|
|
5165
|
+
return parsed.error ? value : parsed.value;
|
|
5166
|
+
};
|
|
5167
|
+
const firstDefined = (values) => {
|
|
5168
|
+
return values.find((value) => value !== void 0);
|
|
5169
|
+
};
|
|
5170
|
+
const getToolCallMetadata = (span) => {
|
|
5171
|
+
const attributes = span.attributes;
|
|
5172
|
+
return {
|
|
5173
|
+
attributes,
|
|
5174
|
+
genAI: readRecordValue(attributes, "genAI"),
|
|
5175
|
+
mastra: readRecordValue(attributes, "mastra"),
|
|
5176
|
+
toolAttributes: readRecordValue(attributes, "attributes")
|
|
5177
|
+
};
|
|
5178
|
+
};
|
|
5179
|
+
const isToolCallSpan = (span) => {
|
|
5180
|
+
const { attributes, genAI, mastra } = getToolCallMetadata(span);
|
|
5181
|
+
return span.kind === "tool" || span.kind === "tool_call" || readStringValue(attributes, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "mastra.span.type") === "tool_call" || readStringValue(mastra, "type") === "tool_call" || readStringValue(mastra, "entityType") === "tool";
|
|
5182
|
+
};
|
|
5183
|
+
const getToolCallIdentityNames = (span) => {
|
|
5184
|
+
const { attributes, genAI, mastra } = getToolCallMetadata(span);
|
|
5185
|
+
return [
|
|
5186
|
+
readStringValue(attributes, "gen_ai.tool.name"),
|
|
5187
|
+
readStringValue(genAI, "gen_ai.tool.name"),
|
|
5188
|
+
readStringValue(mastra, "entityName"),
|
|
5189
|
+
readStringValue(mastra, "entityId"),
|
|
5190
|
+
span.name
|
|
5191
|
+
].filter((name) => name !== void 0);
|
|
5192
|
+
};
|
|
5193
|
+
const getPreferredToolCallName = (span) => {
|
|
5194
|
+
return getToolCallIdentityNames(span)[0] ?? span.name;
|
|
5195
|
+
};
|
|
5196
|
+
const toolCallSpanMatchesName = (span, toolName) => {
|
|
5197
|
+
return getToolCallIdentityNames(span).includes(toolName);
|
|
5198
|
+
};
|
|
5199
|
+
const countToolCallSpans = (toolName) => {
|
|
5200
|
+
return spans.filter((span) => {
|
|
5201
|
+
return isToolCallSpan(span) && toolCallSpanMatchesName(span, toolName);
|
|
5202
|
+
}).length;
|
|
5203
|
+
};
|
|
5204
|
+
const buildToolCallSpan = (span) => {
|
|
5205
|
+
const { attributes, genAI, toolAttributes } = getToolCallMetadata(span);
|
|
5206
|
+
return {
|
|
5207
|
+
name: getPreferredToolCallName(span),
|
|
5208
|
+
spanName: span.name,
|
|
5209
|
+
kind: span.kind,
|
|
5210
|
+
arguments: parseMaybeJson(firstDefined([
|
|
5211
|
+
readValue(attributes, "gen_ai.tool.call.arguments"),
|
|
5212
|
+
readValue(genAI, "gen_ai.tool.call.arguments"),
|
|
5213
|
+
readValue(attributes, "arguments"),
|
|
5214
|
+
readValue(attributes, "input")
|
|
5215
|
+
])),
|
|
5216
|
+
result: parseMaybeJson(firstDefined([
|
|
5217
|
+
readValue(attributes, "gen_ai.tool.call.result"),
|
|
5218
|
+
readValue(genAI, "gen_ai.tool.call.result"),
|
|
5219
|
+
readValue(attributes, "result"),
|
|
5220
|
+
readValue(attributes, "output")
|
|
5221
|
+
])),
|
|
5222
|
+
description: readStringValue(attributes, "gen_ai.tool.description") ?? readStringValue(genAI, "gen_ai.tool.description") ?? readStringValue(toolAttributes, "toolDescription"),
|
|
5223
|
+
toolType: readStringValue(attributes, "gen_ai.tool.type") ?? readStringValue(genAI, "gen_ai.tool.type") ?? readStringValue(toolAttributes, "toolType"),
|
|
5224
|
+
attributes,
|
|
5225
|
+
span
|
|
5226
|
+
};
|
|
5227
|
+
};
|
|
5060
5228
|
const filterSpanNames = (sourceSpans, kind) => {
|
|
5061
5229
|
return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
|
|
5062
5230
|
};
|
|
@@ -5076,13 +5244,26 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5076
5244
|
return spans.filter((s) => s.kind === kind);
|
|
5077
5245
|
},
|
|
5078
5246
|
findToolCallSpans() {
|
|
5079
|
-
return spans.filter(
|
|
5247
|
+
return spans.filter(isToolCallSpan);
|
|
5080
5248
|
},
|
|
5081
5249
|
listToolCallSpanNames() {
|
|
5082
|
-
return
|
|
5250
|
+
return spans.filter(isToolCallSpan).map(getPreferredToolCallName);
|
|
5083
5251
|
},
|
|
5084
5252
|
hasToolCallSpan(name) {
|
|
5085
|
-
return spans.some((s) =>
|
|
5253
|
+
return spans.some((s) => {
|
|
5254
|
+
return isToolCallSpan(s) && toolCallSpanMatchesName(s, name);
|
|
5255
|
+
});
|
|
5256
|
+
},
|
|
5257
|
+
getToolCallSpans(name) {
|
|
5258
|
+
return spans.filter((span) => {
|
|
5259
|
+
return isToolCallSpan(span) && toolCallSpanMatchesName(span, name);
|
|
5260
|
+
}).map(buildToolCallSpan);
|
|
5261
|
+
},
|
|
5262
|
+
getToolCallSpanCount(toolName) {
|
|
5263
|
+
return countToolCallSpans(toolName);
|
|
5264
|
+
},
|
|
5265
|
+
hasToolCallSpanCount(toolName, expectedCalls) {
|
|
5266
|
+
return countToolCallSpans(toolName) === expectedCalls;
|
|
5086
5267
|
},
|
|
5087
5268
|
listSpanNames(kind) {
|
|
5088
5269
|
return filterSpanNames(spans, kind);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-
|
|
1
|
+
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import { Result, resultify } from "t-result";
|
|
2
3
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
4
|
import { dirname, join } from "node:path";
|
|
4
5
|
import { existsSync } from "node:fs";
|
|
5
|
-
import { Result, resultify } from "t-result";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { spawn } from "node:child_process";
|
|
8
8
|
//#region ../runner/src/chartValidation.ts
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-C9J-1fkp.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-Dkp2-rBm.mjs";
|
|
2
|
+
import "./src-8dGXUULC.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import "./cli-Dkp2-rBm.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.59.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -374,8 +374,20 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
374
374
|
Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
|
|
375
375
|
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
376
376
|
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|
|
377
|
-
`trace.hasToolCallSpan(name)`,
|
|
378
|
-
`trace.
|
|
377
|
+
`trace.hasToolCallSpan(name)`,
|
|
378
|
+
`trace.getToolCallSpans(name)`,
|
|
379
|
+
`trace.getToolCallSpanCount(toolName)`,
|
|
380
|
+
`trace.hasToolCallSpanCount(toolName, expectedCalls)`,
|
|
381
|
+
`trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and
|
|
382
|
+
`trace.flattenDfs()`.
|
|
383
|
+
The tool-call helpers include both `kind: 'tool'` spans and imported
|
|
384
|
+
execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts
|
|
385
|
+
match the span `name` as well as GenAI/Mastra identity attributes such as
|
|
386
|
+
`genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer
|
|
387
|
+
those tool identity attributes when present. `getToolCallSpans(name)`
|
|
388
|
+
returns one normalized object per matching call, including parsed
|
|
389
|
+
`arguments`, parsed `result`, `description`, `toolType`, `attributes`, and
|
|
390
|
+
the original `span`.
|
|
379
391
|
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
380
392
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
381
393
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|
|
@@ -385,9 +397,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
385
397
|
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
386
398
|
attribute paths. The default `steps` path reads an array from
|
|
387
399
|
`span.attributes.steps`; if it is missing, direct child `model_step` spans are
|
|
388
|
-
shown as that call's steps.
|
|
389
|
-
|
|
390
|
-
|
|
400
|
+
shown as that call's steps. Tool calls are aggregated from the configured
|
|
401
|
+
`toolCalls` path plus step-level `toolCalls` on authored step arrays or
|
|
402
|
+
direct `model_step` child spans, including Mastra's serialized
|
|
403
|
+
`mastra.model_step.output` format, and child `tool_call` execution spans
|
|
404
|
+
under each model step. `latencyMs` is time to first token; duration, total
|
|
405
|
+
tokens, output tokens/sec, and USD costs are derived. Override `kinds` to
|
|
406
|
+
broaden the filter,
|
|
391
407
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
392
408
|
model-keyed `pricing` to derive USD costs from token counts, with nested
|
|
393
409
|
`providers` entries for provider-specific rates, add `costCurrencies` to show
|