@ls-stack/agent-eval 0.58.4 → 0.59.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-CLkC-4Z1.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Bf5RzM8O.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-BjMMDm_O.mjs";
1
+ import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-C3XVZHRC.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Dkp2-rBm.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-8dGXUULC.mjs";
4
4
  export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-CLkC-4Z1.mjs";
2
- import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BS-WxTee.mjs";
1
+ import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-C3XVZHRC.mjs";
2
+ import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-B5An-AEi.mjs";
3
3
  import { z } from "zod/v4";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { relative } from "node:path";
@@ -2,6 +2,7 @@ import { createRequire, registerHooks } from "node:module";
2
2
  import { AsyncLocalStorage } from "node:async_hooks";
3
3
  import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
4
4
  import { z, z as z$1 } from "zod/v4";
5
+ import { resultify } from "t-result";
5
6
  import dayjs from "dayjs";
6
7
  import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
7
8
  import { mkdir, readFile, readdir, rename, rm, writeFile } from "node:fs/promises";
@@ -10,7 +11,6 @@ import { createHash, randomUUID } from "node:crypto";
10
11
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
11
12
  import { existsSync } from "node:fs";
12
13
  import { brotliCompressSync, brotliDecompressSync } from "node:zlib";
13
- import { resultify } from "t-result";
14
14
  import { fileURLToPath, pathToFileURL } from "node:url";
15
15
  //#region ../sdk/src/defineEval.ts
16
16
  const evalRegistry = /* @__PURE__ */ new Map();
@@ -1903,7 +1903,7 @@ function getEvalTitle(evalLike) {
1903
1903
  }
1904
1904
  //#endregion
1905
1905
  //#region ../shared/src/utils/getNestedAttribute.ts
1906
- function isRecord$5(value) {
1906
+ function isRecord$6(value) {
1907
1907
  return typeof value === "object" && value !== null;
1908
1908
  }
1909
1909
  /**
@@ -1918,14 +1918,14 @@ function getNestedAttribute(value, path) {
1918
1918
  const parts = path.split(".");
1919
1919
  let current = value;
1920
1920
  for (const part of parts) {
1921
- if (!isRecord$5(current) || !(part in current)) return;
1921
+ if (!isRecord$6(current) || !(part in current)) return;
1922
1922
  current = current[part];
1923
1923
  }
1924
1924
  return current;
1925
1925
  }
1926
1926
  //#endregion
1927
1927
  //#region ../shared/src/utils/deriveCallAttributes.ts
1928
- function isRecord$4(value) {
1928
+ function isRecord$5(value) {
1929
1929
  return typeof value === "object" && value !== null;
1930
1930
  }
1931
1931
  function mergeNestedAttribute$1(value, path, attributeValue) {
@@ -1938,7 +1938,7 @@ function mergeNestedAttribute$1(value, path, attributeValue) {
1938
1938
  continue;
1939
1939
  }
1940
1940
  const nextValue = current[part];
1941
- const nextRecord = isRecord$4(nextValue) ? { ...nextValue } : {};
1941
+ const nextRecord = isRecord$5(nextValue) ? { ...nextValue } : {};
1942
1942
  current[part] = nextRecord;
1943
1943
  current = nextRecord;
1944
1944
  }
@@ -1963,7 +1963,7 @@ function applyDerivedAttributesForKind(params) {
1963
1963
  return;
1964
1964
  }
1965
1965
  })();
1966
- if (!isRecord$4(values)) continue;
1966
+ if (!isRecord$5(values)) continue;
1967
1967
  for (const [path, value] of Object.entries(values)) {
1968
1968
  if (value === void 0) continue;
1969
1969
  attributes = mergeNestedAttribute$1(attributes, path, value);
@@ -2025,6 +2025,13 @@ function readString$2(attributes, path) {
2025
2025
  const raw = getNestedAttribute(attributes, path);
2026
2026
  return typeof raw === "string" && raw.length > 0 ? raw : null;
2027
2027
  }
2028
+ function isRecord$4(value) {
2029
+ return typeof value === "object" && value !== null && !Array.isArray(value);
2030
+ }
2031
+ function readRecordValue(value, key) {
2032
+ if (!isRecord$4(value)) return void 0;
2033
+ return value[key];
2034
+ }
2028
2035
  function computeTokenCost(tokens, usdPerMillion) {
2029
2036
  if (tokens === null) return null;
2030
2037
  if (tokens === 0) return 0;
@@ -2235,6 +2242,76 @@ function buildModelStepsByParent(spans) {
2235
2242
  }
2236
2243
  return stepsByParent;
2237
2244
  }
2245
+ function buildChildrenByParent(spans) {
2246
+ const childrenByParent = /* @__PURE__ */ new Map();
2247
+ for (const span of spans) {
2248
+ if (span.parentId === null) continue;
2249
+ const current = childrenByParent.get(span.parentId);
2250
+ if (current === void 0) {
2251
+ childrenByParent.set(span.parentId, [span]);
2252
+ continue;
2253
+ }
2254
+ current.push(span);
2255
+ }
2256
+ return childrenByParent;
2257
+ }
2258
+ function appendToolCallValues(out, value) {
2259
+ if (Array.isArray(value)) {
2260
+ out.push(...value);
2261
+ return value.length > 0;
2262
+ }
2263
+ if (value === void 0 || value === null) return false;
2264
+ out.push(value);
2265
+ return true;
2266
+ }
2267
+ function parseJsonRecord(value) {
2268
+ if (typeof value !== "string") return null;
2269
+ const parsed = resultify(() => JSON.parse(value));
2270
+ if (parsed.error || !isRecord$4(parsed.value)) return null;
2271
+ return parsed.value;
2272
+ }
2273
+ function readMastraModelStepOutput(step) {
2274
+ return parseJsonRecord(readRecordValue(readRecordValue(readRecordValue(step, "attributes"), "genAI"), "mastra.model_step.output"));
2275
+ }
2276
+ function isTraceSpan(value) {
2277
+ return isRecord$4(value) && typeof value.id === "string" && typeof value.kind === "string" && typeof value.name === "string";
2278
+ }
2279
+ function toolCallSpanToEntry(span) {
2280
+ const attrs = span.attributes;
2281
+ const genAI = readRecordValue(attrs, "genAI");
2282
+ return {
2283
+ id: span.id,
2284
+ name: span.name,
2285
+ kind: span.kind,
2286
+ status: span.status,
2287
+ input: getNestedAttribute(attrs, "input"),
2288
+ output: getNestedAttribute(attrs, "output"),
2289
+ arguments: readRecordValue(genAI, "gen_ai.tool.call.arguments"),
2290
+ result: readRecordValue(genAI, "gen_ai.tool.call.result")
2291
+ };
2292
+ }
2293
+ function appendToolCallsFromStep({ out, step, childrenByParent }) {
2294
+ let foundStepCalls = false;
2295
+ foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "toolCalls")) || foundStepCalls;
2296
+ foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "output.toolCalls")) || foundStepCalls;
2297
+ foundStepCalls = appendToolCallValues(out, getNestedAttribute(step, "attributes.output.toolCalls")) || foundStepCalls;
2298
+ const mastraOutput = readMastraModelStepOutput(step);
2299
+ if (!foundStepCalls && mastraOutput !== null) foundStepCalls = appendToolCallValues(out, mastraOutput.toolCalls) || foundStepCalls;
2300
+ if (!isTraceSpan(step)) return;
2301
+ const childToolSpans = childrenByParent.get(step.id)?.filter((child) => child.kind === "tool_call") ?? [];
2302
+ if (childToolSpans.length === 0) return;
2303
+ out.push(...childToolSpans.map((child) => toolCallSpanToEntry(child)));
2304
+ }
2305
+ function readToolCalls({ attributes, path, stepDetails, childrenByParent }) {
2306
+ const out = [];
2307
+ appendToolCallValues(out, getNestedAttribute(attributes, path));
2308
+ if (stepDetails !== null) for (const step of stepDetails) appendToolCallsFromStep({
2309
+ out,
2310
+ step,
2311
+ childrenByParent
2312
+ });
2313
+ return out.length > 0 ? out : void 0;
2314
+ }
2238
2315
  function collectWarnings$1(span) {
2239
2316
  const out = [];
2240
2317
  if (span.warning) out.push(span.warning);
@@ -2278,6 +2355,7 @@ function pickError$1(span) {
2278
2355
  function extractLlmCalls(spans, config) {
2279
2356
  const kindSet = new Set(config.kinds);
2280
2357
  const modelStepsByParent = buildModelStepsByParent(spans);
2358
+ const childrenByParent = buildChildrenByParent(spans);
2281
2359
  const result = [];
2282
2360
  for (const span of spans) {
2283
2361
  if (!kindSet.has(span.kind)) continue;
@@ -2336,6 +2414,8 @@ function extractLlmCalls(spans, config) {
2336
2414
  placements: metric.placements
2337
2415
  });
2338
2416
  }
2417
+ const childModelSteps = modelStepsByParent.get(span.id) ?? [];
2418
+ const stepInfo = readSteps(attrs, config.attributes.steps, childModelSteps);
2339
2419
  result.push({
2340
2420
  id: span.id,
2341
2421
  name: span.name,
@@ -2363,13 +2443,18 @@ function extractLlmCalls(spans, config) {
2363
2443
  cachedInputCostUsd,
2364
2444
  cacheCreationInputCostUsd,
2365
2445
  reasoningCostUsd,
2366
- ...readSteps(attrs, config.attributes.steps, modelStepsByParent.get(span.id) ?? []),
2446
+ ...stepInfo,
2367
2447
  finishReason: readString$2(attrs, config.attributes.finishReason),
2368
2448
  durationMs,
2369
2449
  input: getNestedAttribute(attrs, config.attributes.input),
2370
2450
  output: getNestedAttribute(attrs, config.attributes.output),
2371
2451
  reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
2372
- toolCalls: getNestedAttribute(attrs, config.attributes.toolCalls),
2452
+ toolCalls: readToolCalls({
2453
+ attributes: attrs,
2454
+ path: config.attributes.toolCalls,
2455
+ stepDetails: stepInfo.stepDetails,
2456
+ childrenByParent
2457
+ }),
2373
2458
  metrics,
2374
2459
  warnings: collectWarnings$1(span),
2375
2460
  error: pickError$1(span)
@@ -5057,6 +5142,89 @@ function buildTraceTree(spans, checkpoints) {
5057
5142
  visit(null);
5058
5143
  return result;
5059
5144
  };
5145
+ const isRecord = (value) => {
5146
+ return typeof value === "object" && value !== null;
5147
+ };
5148
+ const readRecordValue = (value, key) => {
5149
+ if (!isRecord(value)) return void 0;
5150
+ const child = value[key];
5151
+ return isRecord(child) ? child : void 0;
5152
+ };
5153
+ const readStringValue = (value, key) => {
5154
+ if (!isRecord(value)) return void 0;
5155
+ const child = value[key];
5156
+ return typeof child === "string" && child.length > 0 ? child : void 0;
5157
+ };
5158
+ const readValue = (value, key) => {
5159
+ if (!isRecord(value)) return void 0;
5160
+ return value[key];
5161
+ };
5162
+ const parseMaybeJson = (value) => {
5163
+ if (typeof value !== "string") return value;
5164
+ const parsed = resultify(() => JSON.parse(value));
5165
+ return parsed.error ? value : parsed.value;
5166
+ };
5167
+ const firstDefined = (values) => {
5168
+ return values.find((value) => value !== void 0);
5169
+ };
5170
+ const getToolCallMetadata = (span) => {
5171
+ const attributes = span.attributes;
5172
+ return {
5173
+ attributes,
5174
+ genAI: readRecordValue(attributes, "genAI"),
5175
+ mastra: readRecordValue(attributes, "mastra"),
5176
+ toolAttributes: readRecordValue(attributes, "attributes")
5177
+ };
5178
+ };
5179
+ const isToolCallSpan = (span) => {
5180
+ const { attributes, genAI, mastra } = getToolCallMetadata(span);
5181
+ return span.kind === "tool" || span.kind === "tool_call" || readStringValue(attributes, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "mastra.span.type") === "tool_call" || readStringValue(mastra, "type") === "tool_call" || readStringValue(mastra, "entityType") === "tool";
5182
+ };
5183
+ const getToolCallIdentityNames = (span) => {
5184
+ const { attributes, genAI, mastra } = getToolCallMetadata(span);
5185
+ return [
5186
+ readStringValue(attributes, "gen_ai.tool.name"),
5187
+ readStringValue(genAI, "gen_ai.tool.name"),
5188
+ readStringValue(mastra, "entityName"),
5189
+ readStringValue(mastra, "entityId"),
5190
+ span.name
5191
+ ].filter((name) => name !== void 0);
5192
+ };
5193
+ const getPreferredToolCallName = (span) => {
5194
+ return getToolCallIdentityNames(span)[0] ?? span.name;
5195
+ };
5196
+ const toolCallSpanMatchesName = (span, toolName) => {
5197
+ return getToolCallIdentityNames(span).includes(toolName);
5198
+ };
5199
+ const countToolCallSpans = (toolName) => {
5200
+ return spans.filter((span) => {
5201
+ return isToolCallSpan(span) && toolCallSpanMatchesName(span, toolName);
5202
+ }).length;
5203
+ };
5204
+ const buildToolCallSpan = (span) => {
5205
+ const { attributes, genAI, toolAttributes } = getToolCallMetadata(span);
5206
+ return {
5207
+ name: getPreferredToolCallName(span),
5208
+ spanName: span.name,
5209
+ kind: span.kind,
5210
+ arguments: parseMaybeJson(firstDefined([
5211
+ readValue(attributes, "gen_ai.tool.call.arguments"),
5212
+ readValue(genAI, "gen_ai.tool.call.arguments"),
5213
+ readValue(attributes, "arguments"),
5214
+ readValue(attributes, "input")
5215
+ ])),
5216
+ result: parseMaybeJson(firstDefined([
5217
+ readValue(attributes, "gen_ai.tool.call.result"),
5218
+ readValue(genAI, "gen_ai.tool.call.result"),
5219
+ readValue(attributes, "result"),
5220
+ readValue(attributes, "output")
5221
+ ])),
5222
+ description: readStringValue(attributes, "gen_ai.tool.description") ?? readStringValue(genAI, "gen_ai.tool.description") ?? readStringValue(toolAttributes, "toolDescription"),
5223
+ toolType: readStringValue(attributes, "gen_ai.tool.type") ?? readStringValue(genAI, "gen_ai.tool.type") ?? readStringValue(toolAttributes, "toolType"),
5224
+ attributes,
5225
+ span
5226
+ };
5227
+ };
5060
5228
  const filterSpanNames = (sourceSpans, kind) => {
5061
5229
  return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
5062
5230
  };
@@ -5076,13 +5244,26 @@ function buildTraceTree(spans, checkpoints) {
5076
5244
  return spans.filter((s) => s.kind === kind);
5077
5245
  },
5078
5246
  findToolCallSpans() {
5079
- return spans.filter((s) => s.kind === "tool");
5247
+ return spans.filter(isToolCallSpan);
5080
5248
  },
5081
5249
  listToolCallSpanNames() {
5082
- return filterSpanNames(spans, "tool");
5250
+ return spans.filter(isToolCallSpan).map(getPreferredToolCallName);
5083
5251
  },
5084
5252
  hasToolCallSpan(name) {
5085
- return spans.some((s) => s.kind === "tool" && s.name === name);
5253
+ return spans.some((s) => {
5254
+ return isToolCallSpan(s) && toolCallSpanMatchesName(s, name);
5255
+ });
5256
+ },
5257
+ getToolCallSpans(name) {
5258
+ return spans.filter((span) => {
5259
+ return isToolCallSpan(span) && toolCallSpanMatchesName(span, name);
5260
+ }).map(buildToolCallSpan);
5261
+ },
5262
+ getToolCallSpanCount(toolName) {
5263
+ return countToolCallSpans(toolName);
5264
+ },
5265
+ hasToolCallSpanCount(toolName, expectedCalls) {
5266
+ return countToolCallSpans(toolName) === expectedCalls;
5086
5267
  },
5087
5268
  listSpanNames(kind) {
5088
5269
  return filterSpanNames(spans, kind);
@@ -1,8 +1,8 @@
1
- import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-CLkC-4Z1.mjs";
1
+ import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-C3XVZHRC.mjs";
2
+ import { Result, resultify } from "t-result";
2
3
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
4
  import { dirname, join } from "node:path";
4
5
  import { existsSync } from "node:fs";
5
- import { Result, resultify } from "t-result";
6
6
  import { fileURLToPath } from "node:url";
7
7
  import { spawn } from "node:child_process";
8
8
  //#region ../runner/src/chartValidation.ts
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DW-11txl.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-C9J-1fkp.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Bf5RzM8O.mjs";
2
- import "./src-BjMMDm_O.mjs";
1
+ import { n as createRunner } from "./cli-Dkp2-rBm.mjs";
2
+ import "./src-8dGXUULC.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-CLkC-4Z1.mjs";
2
- import "./cli-Bf5RzM8O.mjs";
1
+ import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-C3XVZHRC.mjs";
2
+ import "./cli-Dkp2-rBm.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.58.4",
3
+ "version": "0.59.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/sdk": "0.0.1",
37
- "@agent-evals/shared": "0.0.1"
36
+ "@agent-evals/shared": "0.0.1",
37
+ "@agent-evals/sdk": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -374,8 +374,20 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
374
374
  Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
375
375
  `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
376
376
  `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
377
- `trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
378
- `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
377
+ `trace.hasToolCallSpan(name)`,
378
+ `trace.getToolCallSpans(name)`,
379
+ `trace.getToolCallSpanCount(toolName)`,
380
+ `trace.hasToolCallSpanCount(toolName, expectedCalls)`,
381
+ `trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and
382
+ `trace.flattenDfs()`.
383
+ The tool-call helpers include both `kind: 'tool'` spans and imported
384
+ execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts
385
+ match the span `name` as well as GenAI/Mastra identity attributes such as
386
+ `genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer
387
+ those tool identity attributes when present. `getToolCallSpans(name)`
388
+ returns one normalized object per matching call, including parsed
389
+ `arguments`, parsed `result`, `description`, `toolType`, `attributes`, and
390
+ the original `span`.
379
391
  - `traceDisplay` promotes selected span attributes into the trace tree and
380
392
  detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
381
393
  user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -385,9 +397,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
385
397
  `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
386
398
  attribute paths. The default `steps` path reads an array from
387
399
  `span.attributes.steps`; if it is missing, direct child `model_step` spans are
388
- shown as that call's steps. `latencyMs` is time to first token; duration,
389
- total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
390
- to broaden the filter,
400
+ shown as that call's steps. Tool calls are aggregated from the configured
401
+ `toolCalls` path plus step-level `toolCalls` on authored step arrays or
402
+ direct `model_step` child spans, including Mastra's serialized
403
+ `mastra.model_step.output` format, and child `tool_call` execution spans
404
+ under each model step. `latencyMs` is time to first token; duration, total
405
+ tokens, output tokens/sec, and USD costs are derived. Override `kinds` to
406
+ broaden the filter,
391
407
  override `attributes.<field>` for non-default primitive span shapes, configure
392
408
  model-keyed `pricing` to derive USD costs from token counts, with nested
393
409
  `providers` entries for provider-specific rates, add `costCurrencies` to show