npm - @ls-stack/agent-eval - Versions diffs - 0.58.4 → 0.59.0 - Mend

@ls-stack/agent-eval 0.58.4 → 0.59.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/dist/{app-sGeXC4AT.mjs → app-B3PEtWqH.mjs} +5 -5
package/dist/apps/web/dist/assets/{index-BXFsxHVc.js → index-BD6FXk5p.js} +79 -79
package/dist/apps/web/dist/assets/index-C2fbGEsB.css +1 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +1 -1
package/dist/{cli-Bf5RzM8O.mjs → cli-Dkp2-rBm.mjs} +5 -5
package/dist/index.d.mts +150 -112
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-CLkC-4Z1.mjs → runExecution-C3XVZHRC.mjs} +192 -11
package/dist/{runOrchestration-BS-WxTee.mjs → runOrchestration-B5An-AEi.mjs} +2 -2
package/dist/{runner-Bz5ZPqmm.mjs → runner-BJXz_V_V.mjs} +1 -1
package/dist/{runner-DW-11txl.mjs → runner-C9J-1fkp.mjs} +2 -2
package/dist/{src-BjMMDm_O.mjs → src-8dGXUULC.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +21 -5
package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +0 -1

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-CLkC-4Z1.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Bf5RzM8O.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-BjMMDm_O.mjs";
+import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-C3XVZHRC.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Dkp2-rBm.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-8dGXUULC.mjs";
 export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
-import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-CLkC-4Z1.mjs";
-import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BS-WxTee.mjs";
+import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-C3XVZHRC.mjs";
+import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-B5An-AEi.mjs";
 import { z } from "zod/v4";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runExecution-CLkC-4Z1.mjs → runExecution-C3XVZHRC.mjs} RENAMED Viewed

@@ -2,6 +2,7 @@ import { createRequire, registerHooks } from "node:module";
 import { AsyncLocalStorage } from "node:async_hooks";
 import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
 import { z, z as z$1 } from "zod/v4";
+import { resultify } from "t-result";
 import dayjs from "dayjs";
 import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
 import { mkdir, readFile, readdir, rename, rm, writeFile } from "node:fs/promises";
@@ -10,7 +11,6 @@ import { createHash, randomUUID } from "node:crypto";
 import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
 import { existsSync } from "node:fs";
 import { brotliCompressSync, brotliDecompressSync } from "node:zlib";
-import { resultify } from "t-result";
 import { fileURLToPath, pathToFileURL } from "node:url";
 //#region ../sdk/src/defineEval.ts
 const evalRegistry = /* @__PURE__ */ new Map();
@@ -1903,7 +1903,7 @@ function getEvalTitle(evalLike) {
 }
 //#endregion
 //#region ../shared/src/utils/getNestedAttribute.ts
-function isRecord$5(value) {
+function isRecord$6(value) {
 	return typeof value === "object" && value !== null;
 }
 /**
@@ -1918,14 +1918,14 @@ function getNestedAttribute(value, path) {
 	const parts = path.split(".");
 	let current = value;
 	for (const part of parts) {
-		if (!isRecord$5(current) || !(part in current)) return;
+		if (!isRecord$6(current) || !(part in current)) return;
 		current = current[part];
 	}
 	return current;
 }
 //#endregion
 //#region ../shared/src/utils/deriveCallAttributes.ts
-function isRecord$4(value) {
+function isRecord$5(value) {
 	return typeof value === "object" && value !== null;
 }
 function mergeNestedAttribute$1(value, path, attributeValue) {
@@ -1938,7 +1938,7 @@ function mergeNestedAttribute$1(value, path, attributeValue) {
 			continue;
 		}
 		const nextValue = current[part];
-		const nextRecord = isRecord$4(nextValue) ? { ...nextValue } : {};
+		const nextRecord = isRecord$5(nextValue) ? { ...nextValue } : {};
 		current[part] = nextRecord;
 		current = nextRecord;
 	}
@@ -1963,7 +1963,7 @@ function applyDerivedAttributesForKind(params) {
 					return;
 				}
 			})();
-			if (!isRecord$4(values)) continue;
+			if (!isRecord$5(values)) continue;
 			for (const [path, value] of Object.entries(values)) {
 				if (value === void 0) continue;
 				attributes = mergeNestedAttribute$1(attributes, path, value);
@@ -2025,6 +2025,13 @@ function readString$2(attributes, path) {
 	const raw = getNestedAttribute(attributes, path);
 	return typeof raw === "string" && raw.length > 0 ? raw : null;
 }
+function isRecord$4(value) {
+	return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+function readRecordValue(value, key) {
+	if (!isRecord$4(value)) return void 0;
+	return value[key];
+}
 function computeTokenCost(tokens, usdPerMillion) {
 	if (tokens === null) return null;
 	if (tokens === 0) return 0;
@@ -2235,6 +2242,76 @@ function buildModelStepsByParent(spans) {
 	}
 	return stepsByParent;
 }
+function buildChildrenByParent(spans) {
+	const childrenByParent = /* @__PURE__ */ new Map();
+	for (const span of spans) {
+		if (span.parentId === null) continue;
+		const current = childrenByParent.get(span.parentId);
+		if (current === void 0) {
+			childrenByParent.set(span.parentId, [span]);
+			continue;
+		}
+		current.push(span);
+	}
+	return childrenByParent;
+}
+function appendToolCallValues(out, value) {
+	if (Array.isArray(value)) {
+		out.push(...value);
+		return value.length > 0;
+	}
+	if (value === void 0 || value === null) return false;
+	out.push(value);
+	return true;
+}
+function parseJsonRecord(value) {
+	if (typeof value !== "string") return null;
+	const parsed = resultify(() => JSON.parse(value));
+	if (parsed.error || !isRecord$4(parsed.value)) return null;
+	return parsed.value;
+}
+function readMastraModelStepOutput(step) {
+	return parseJsonRecord(readRecordValue(readRecordValue(readRecordValue(step, "attributes"), "genAI"), "mastra.model_step.output"));
+}
+function isTraceSpan(value) {
+	return isRecord$4(value) && typeof value.id === "string" && typeof value.kind === "string" && typeof value.name === "string";
+}
+function toolCallSpanToEntry(span) {
+	const attrs = span.attributes;
+	const genAI = readRecordValue(attrs, "genAI");
+	return {
+		id: span.id,
+		name: span.name,
+		kind: span.kind,
+		status: span.status,
+		input: getNestedAttribute(attrs, "input"),
+		output: getNestedAttribute(attrs, "output"),
+		arguments: readRecordValue(genAI, "gen_ai.tool.call.arguments"),
+		result: readRecordValue(genAI, "gen_ai.tool.call.result")
+	};
+}
+function appendToolCallsFromStep({ out, step, childrenByParent }) {
+	let foundStepCalls = false;
+	foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "toolCalls")) || foundStepCalls;
+	foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "output.toolCalls")) || foundStepCalls;
+	foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "attributes.output.toolCalls")) || foundStepCalls;
+	const mastraOutput = readMastraModelStepOutput(step);
+	if (!foundStepCalls && mastraOutput !== null) foundStepCalls = appendToolCallValues(out, mastraOutput.toolCalls) || foundStepCalls;
+	if (!isTraceSpan(step)) return;
+	const childToolSpans = childrenByParent.get(step.id)?.filter((child) => child.kind === "tool_call") ?? [];
+	if (childToolSpans.length === 0) return;
+	out.push(...childToolSpans.map((child) => toolCallSpanToEntry(child)));
+}
+function readToolCalls({ attributes, path, stepDetails, childrenByParent }) {
+	const out = [];
+	appendToolCallValues(out, getNestedAttribute(attributes, path));
+	if (stepDetails !== null) for (const step of stepDetails) appendToolCallsFromStep({
+		out,
+		step,
+		childrenByParent
+	});
+	return out.length > 0 ? out : void 0;
+}
 function collectWarnings$1(span) {
 	const out = [];
 	if (span.warning) out.push(span.warning);
@@ -2278,6 +2355,7 @@ function pickError$1(span) {
 function extractLlmCalls(spans, config) {
 	const kindSet = new Set(config.kinds);
 	const modelStepsByParent = buildModelStepsByParent(spans);
+	const childrenByParent = buildChildrenByParent(spans);
 	const result = [];
 	for (const span of spans) {
 		if (!kindSet.has(span.kind)) continue;
@@ -2336,6 +2414,8 @@ function extractLlmCalls(spans, config) {
 				placements: metric.placements
 			});
 		}
+		const childModelSteps = modelStepsByParent.get(span.id) ?? [];
+		const stepInfo = readSteps(attrs, config.attributes.steps, childModelSteps);
 		result.push({
 			id: span.id,
 			name: span.name,
@@ -2363,13 +2443,18 @@ function extractLlmCalls(spans, config) {
 			cachedInputCostUsd,
 			cacheCreationInputCostUsd,
 			reasoningCostUsd,
-			...readSteps(attrs, config.attributes.steps, modelStepsByParent.get(span.id) ?? []),
+			...stepInfo,
 			finishReason: readString$2(attrs, config.attributes.finishReason),
 			durationMs,
 			input: getNestedAttribute(attrs, config.attributes.input),
 			output: getNestedAttribute(attrs, config.attributes.output),
 			reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
-			toolCalls: getNestedAttribute(attrs, config.attributes.toolCalls),
+			toolCalls: readToolCalls({
+				attributes: attrs,
+				path: config.attributes.toolCalls,
+				stepDetails: stepInfo.stepDetails,
+				childrenByParent
+			}),
 			metrics,
 			warnings: collectWarnings$1(span),
 			error: pickError$1(span)
@@ -5057,6 +5142,89 @@ function buildTraceTree(spans, checkpoints) {
 		visit(null);
 		return result;
 	};
+	const isRecord = (value) => {
+		return typeof value === "object" && value !== null;
+	};
+	const readRecordValue = (value, key) => {
+		if (!isRecord(value)) return void 0;
+		const child = value[key];
+		return isRecord(child) ? child : void 0;
+	};
+	const readStringValue = (value, key) => {
+		if (!isRecord(value)) return void 0;
+		const child = value[key];
+		return typeof child === "string" && child.length > 0 ? child : void 0;
+	};
+	const readValue = (value, key) => {
+		if (!isRecord(value)) return void 0;
+		return value[key];
+	};
+	const parseMaybeJson = (value) => {
+		if (typeof value !== "string") return value;
+		const parsed = resultify(() => JSON.parse(value));
+		return parsed.error ? value : parsed.value;
+	};
+	const firstDefined = (values) => {
+		return values.find((value) => value !== void 0);
+	};
+	const getToolCallMetadata = (span) => {
+		const attributes = span.attributes;
+		return {
+			attributes,
+			genAI: readRecordValue(attributes, "genAI"),
+			mastra: readRecordValue(attributes, "mastra"),
+			toolAttributes: readRecordValue(attributes, "attributes")
+		};
+	};
+	const isToolCallSpan = (span) => {
+		const { attributes, genAI, mastra } = getToolCallMetadata(span);
+		return span.kind === "tool" || span.kind === "tool_call" || readStringValue(attributes, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "mastra.span.type") === "tool_call" || readStringValue(mastra, "type") === "tool_call" || readStringValue(mastra, "entityType") === "tool";
+	};
+	const getToolCallIdentityNames = (span) => {
+		const { attributes, genAI, mastra } = getToolCallMetadata(span);
+		return [
+			readStringValue(attributes, "gen_ai.tool.name"),
+			readStringValue(genAI, "gen_ai.tool.name"),
+			readStringValue(mastra, "entityName"),
+			readStringValue(mastra, "entityId"),
+			span.name
+		].filter((name) => name !== void 0);
+	};
+	const getPreferredToolCallName = (span) => {
+		return getToolCallIdentityNames(span)[0] ?? span.name;
+	};
+	const toolCallSpanMatchesName = (span, toolName) => {
+		return getToolCallIdentityNames(span).includes(toolName);
+	};
+	const countToolCallSpans = (toolName) => {
+		return spans.filter((span) => {
+			return isToolCallSpan(span) && toolCallSpanMatchesName(span, toolName);
+		}).length;
+	};
+	const buildToolCallSpan = (span) => {
+		const { attributes, genAI, toolAttributes } = getToolCallMetadata(span);
+		return {
+			name: getPreferredToolCallName(span),
+			spanName: span.name,
+			kind: span.kind,
+			arguments: parseMaybeJson(firstDefined([
+				readValue(attributes, "gen_ai.tool.call.arguments"),
+				readValue(genAI, "gen_ai.tool.call.arguments"),
+				readValue(attributes, "arguments"),
+				readValue(attributes, "input")
+			])),
+			result: parseMaybeJson(firstDefined([
+				readValue(attributes, "gen_ai.tool.call.result"),
+				readValue(genAI, "gen_ai.tool.call.result"),
+				readValue(attributes, "result"),
+				readValue(attributes, "output")
+			])),
+			description: readStringValue(attributes, "gen_ai.tool.description") ?? readStringValue(genAI, "gen_ai.tool.description") ?? readStringValue(toolAttributes, "toolDescription"),
+			toolType: readStringValue(attributes, "gen_ai.tool.type") ?? readStringValue(genAI, "gen_ai.tool.type") ?? readStringValue(toolAttributes, "toolType"),
+			attributes,
+			span
+		};
+	};
 	const filterSpanNames = (sourceSpans, kind) => {
 		return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
 	};
@@ -5076,13 +5244,26 @@ function buildTraceTree(spans, checkpoints) {
 			return spans.filter((s) => s.kind === kind);
 		},
 		findToolCallSpans() {
-			return spans.filter((s) => s.kind === "tool");
+			return spans.filter(isToolCallSpan);
 		},
 		listToolCallSpanNames() {
-			return filterSpanNames(spans, "tool");
+			return spans.filter(isToolCallSpan).map(getPreferredToolCallName);
 		},
 		hasToolCallSpan(name) {
-			return spans.some((s) => s.kind === "tool" && s.name === name);
+			return spans.some((s) => {
+				return isToolCallSpan(s) && toolCallSpanMatchesName(s, name);
+			});
+		},
+		getToolCallSpans(name) {
+			return spans.filter((span) => {
+				return isToolCallSpan(span) && toolCallSpanMatchesName(span, name);
+			}).map(buildToolCallSpan);
+		},
+		getToolCallSpanCount(toolName) {
+			return countToolCallSpans(toolName);
+		},
+		hasToolCallSpanCount(toolName, expectedCalls) {
+			return countToolCallSpans(toolName) === expectedCalls;
 		},
 		listSpanNames(kind) {
 			return filterSpanNames(spans, kind);

package/dist/{runOrchestration-BS-WxTee.mjs → runOrchestration-B5An-AEi.mjs} RENAMED Viewed

@@ -1,8 +1,8 @@
-import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-CLkC-4Z1.mjs";
+import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-C3XVZHRC.mjs";
+import { Result, resultify } from "t-result";
 import { readFile, readdir, rm, writeFile } from "node:fs/promises";
 import { dirname, join } from "node:path";
 import { existsSync } from "node:fs";
-import { Result, resultify } from "t-result";
 import { fileURLToPath } from "node:url";
 import { spawn } from "node:child_process";
 //#region ../runner/src/chartValidation.ts

package/dist/{runner-Bz5ZPqmm.mjs → runner-BJXz_V_V.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-DW-11txl.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-C9J-1fkp.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-DW-11txl.mjs → runner-C9J-1fkp.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-Bf5RzM8O.mjs";
-import "./src-BjMMDm_O.mjs";
+import { n as createRunner } from "./cli-Dkp2-rBm.mjs";
+import "./src-8dGXUULC.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{src-BjMMDm_O.mjs → src-8dGXUULC.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-CLkC-4Z1.mjs";
-import "./cli-Bf5RzM8O.mjs";
+import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-C3XVZHRC.mjs";
+import "./cli-Dkp2-rBm.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.58.4",
+  "version": "0.59.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/sdk": "0.0.1",
-    "@agent-evals/shared": "0.0.1"
+    "@agent-evals/shared": "0.0.1",
+    "@agent-evals/sdk": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -374,8 +374,20 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
   `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
   `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
-  `trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
-  `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
+  `trace.hasToolCallSpan(name)`,
+  `trace.getToolCallSpans(name)`,
+  `trace.getToolCallSpanCount(toolName)`,
+  `trace.hasToolCallSpanCount(toolName, expectedCalls)`,
+  `trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and
+  `trace.flattenDfs()`.
+  The tool-call helpers include both `kind: 'tool'` spans and imported
+  execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts
+  match the span `name` as well as GenAI/Mastra identity attributes such as
+  `genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer
+  those tool identity attributes when present. `getToolCallSpans(name)`
+  returns one normalized object per matching call, including parsed
+  `arguments`, parsed `result`, `description`, `toolType`, `attributes`, and
+  the original `span`.
 - `traceDisplay` promotes selected span attributes into the trace tree and
   detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
   user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -385,9 +397,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
   attribute paths. The default `steps` path reads an array from
   `span.attributes.steps`; if it is missing, direct child `model_step` spans are
-  shown as that call's steps. `latencyMs` is time to first token; duration,
-  total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
-  to broaden the filter,
+  shown as that call's steps. Tool calls are aggregated from the configured
+  `toolCalls` path plus step-level `toolCalls` on authored step arrays or
+  direct `model_step` child spans, including Mastra's serialized
+  `mastra.model_step.output` format, and child `tool_call` execution spans
+  under each model step. `latencyMs` is time to first token; duration, total
+  tokens, output tokens/sec, and USD costs are derived. Override `kinds` to
+  broaden the filter,
   override `attributes.<field>` for non-default primitive span shapes, configure
   model-keyed `pricing` to derive USD costs from token counts, with nested
   `providers` entries for provider-specific rates, add `costCurrencies` to show