@ls-stack/agent-eval 0.61.2 → 0.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-DXQ_LDQw.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-CE1teCsp.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-zWPuRQmP.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-CPBIcMP-.mjs";
2
+ import { t as runCli } from "./cli-CCHcjbC1.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { Ft as runWithEvalRegistry, I as configureEvalRunLogs, St as resolveLlmCallsConfig, _ as createFsCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, g as createBufferedCacheStore, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as getCacheRetentionOptions, xt as resolveApiCallsConfig } from "./runExecution-D-CnSRYy.mjs";
1
+ import { Ft as runWithEvalRegistry, I as configureEvalRunLogs, St as resolveLlmCallsConfig, _ as createFsCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, g as createBufferedCacheStore, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as getCacheRetentionOptions, xt as resolveApiCallsConfig } from "./runExecution-C24aYsk3.mjs";
2
2
  //#region ../runner/src/caseChild.ts
3
3
  let fatalErrorReported = false;
4
4
  let disconnectExpected = false;
@@ -1,5 +1,5 @@
1
- import { Dt as caseRowSchema, Pt as getEvalRegistry, St as resolveLlmCallsConfig, Tt as getCaseRowCaseKey, _ as createFsCacheStore, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as getCacheRetentionOptions, wt as buildEvalKey, xt as resolveApiCallsConfig } from "./runExecution-D-CnSRYy.mjs";
2
- import { C as parseEvalDiscovery, S as loadIsolatedEvalRegistry, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as parseManualInputValues, c as getLatestRunInfos, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, l as loadPersistedCaseDetail, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as generateRunId, p as persistCaseDetail, s as getLastRunStatuses, u as loadPersistedRunSnapshot, v as runTouchesEval, w as validateCharts, x as deriveEvalFreshness, y as buildManualInputDescriptor } from "./runOrchestration-Basvyp4u.mjs";
1
+ import { Dt as caseRowSchema, Pt as getEvalRegistry, St as resolveLlmCallsConfig, Tt as getCaseRowCaseKey, _ as createFsCacheStore, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as getCacheRetentionOptions, wt as buildEvalKey, xt as resolveApiCallsConfig } from "./runExecution-C24aYsk3.mjs";
2
+ import { C as parseEvalDiscovery, S as loadIsolatedEvalRegistry, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as parseManualInputValues, c as getLatestRunInfos, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, l as loadPersistedCaseDetail, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as generateRunId, p as persistCaseDetail, s as getLastRunStatuses, u as loadPersistedRunSnapshot, v as runTouchesEval, w as validateCharts, x as deriveEvalFreshness, y as buildManualInputDescriptor } from "./runOrchestration-9XKoYcP9.mjs";
3
3
  import { parseEnv } from "node:util";
4
4
  import { resultify } from "t-result";
5
5
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
@@ -2243,8 +2243,8 @@ async function commandApp(args) {
2243
2243
  const { serve } = await import("@hono/node-server");
2244
2244
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2245
2245
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2246
- const appModule = await import("./app-Dm_9ZTVa.mjs");
2247
- const runnerModule = await import("./runner-B6UT1K7L.mjs");
2246
+ const appModule = await import("./app-CByWi7LX.mjs");
2247
+ const runnerModule = await import("./runner-5eU-FLHV.mjs");
2248
2248
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2249
2249
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2250
2250
  await runnerModule.initRunner({ loadEnv: args.loadEnv });
package/dist/index.d.mts CHANGED
@@ -377,7 +377,12 @@ declare const evalChartsConfigSchema$1: z.ZodArray<z.ZodObject<{
377
377
  /** Ordered list of history charts rendered for an eval. */
378
378
  type EvalChartsConfig$1 = z.infer<typeof evalChartsConfigSchema$1>; //#endregion
379
379
  //#region ../shared/src/schemas/config.d.ts
380
- /** Built-in eval-level output/column keys. */
380
+ /**
381
+ * Built-in eval-level output/column keys.
382
+ *
383
+ * `costUsd` controls the default LLM cost family: actual billed cost plus the
384
+ * normalized `costUsdWithoutCache` and `costUsdWarmedCache` chart outputs.
385
+ */
381
386
  declare const defaultConfigKeySchema: z.ZodEnum<{
382
387
  apiCalls: "apiCalls";
383
388
  costUsd: "costUsd";
@@ -3684,7 +3689,12 @@ declare const trialSelectionModeSchema: z.ZodEnum<{
3684
3689
  }>;
3685
3690
  /** Strategy used to collapse repeated trials into one stored case result. */
3686
3691
  type TrialSelectionMode = z.infer<typeof trialSelectionModeSchema>;
3687
- /** Built-in eval-level output/column keys. */
3692
+ /**
3693
+ * Built-in eval-level output/column keys.
3694
+ *
3695
+ * `costUsd` controls the default LLM cost family: actual billed cost plus the
3696
+ * normalized `costUsdWithoutCache` and `costUsdWarmedCache` chart outputs.
3697
+ */
3688
3698
  /** Removal config for built-in eval-level outputs and UI metadata. */
3689
3699
  declare const removeDefaultConfigSchema: z.ZodUnion<readonly [z.ZodLiteral<true>, z.ZodArray<z.ZodEnum<{
3690
3700
  costUsd: "costUsd";
@@ -4285,7 +4295,9 @@ type AgentEvalsConfig$1 = {
4285
4295
  * Defaults are derived from trace spans using the resolved `llmCalls` and
4286
4296
  * `apiCalls` extraction configs. Set to `true` to remove all defaults, or
4287
4297
  * pass specific keys such as `['costUsd', 'apiCalls']` to remove only those
4288
- * defaults globally. Per-eval removal is additive.
4298
+ * defaults globally. Removing `costUsd` removes the whole default cost
4299
+ * family, including normalized no-cache and warmed-cache outputs. Per-eval
4300
+ * removal is additive.
4289
4301
  */
4290
4302
  removeDefaultConfig?: RemoveDefaultConfig;
4291
4303
  /**
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as startEvalBackgroundJob, A as manualInputFileValueSchema, B as getCurrentScope, C as hashCacheKey, D as serializeCacheRecording, E as deserializeCacheValue, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as evalExpect, N as EvalAssertionError, O as serializeCacheValue, P as EvalRuntimeUsageError, Pt as getEvalRegistry, Q as setScopeCacheContext, R as evalLog, S as evalTracer, T as deserializeCacheRecording, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as captureEvalSpanError, it as extractApiCalls, j as readManualInputFile, k as repoFile, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKeySync, x as evalSpan, y as buildTraceTree, z as evalTime } from "./runExecution-D-CnSRYy.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-CPBIcMP-.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-SixIk0b7.mjs";
1
+ import { $ as startEvalBackgroundJob, A as manualInputFileValueSchema, B as getCurrentScope, C as hashCacheKey, D as serializeCacheRecording, E as deserializeCacheValue, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as evalExpect, N as EvalAssertionError, O as serializeCacheValue, P as EvalRuntimeUsageError, Pt as getEvalRegistry, Q as setScopeCacheContext, R as evalLog, S as evalTracer, T as deserializeCacheRecording, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as captureEvalSpanError, it as extractApiCalls, j as readManualInputFile, k as repoFile, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKeySync, x as evalSpan, y as buildTraceTree, z as evalTime } from "./runExecution-C24aYsk3.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-CCHcjbC1.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-C8n7QANC.mjs";
4
4
  export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob };
package/dist/runChild.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { At as manualInputDescriptorSchema, I as configureEvalRunLogs, Mt as columnDefSchema, Ot as evalStatAggregateSchema, _ as createFsCacheStore, bt as runSummarySchema, et as createRunRequestSchema, jt as evalChartsConfigSchema, kt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as getCacheRetentionOptions, wt as buildEvalKey, yt as runManifestSchema } from "./runExecution-D-CnSRYy.mjs";
2
- import { C as parseEvalDiscovery, h as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-Basvyp4u.mjs";
1
+ import { At as manualInputDescriptorSchema, I as configureEvalRunLogs, Mt as columnDefSchema, Ot as evalStatAggregateSchema, _ as createFsCacheStore, bt as runSummarySchema, et as createRunRequestSchema, jt as evalChartsConfigSchema, kt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as getCacheRetentionOptions, wt as buildEvalKey, yt as runManifestSchema } from "./runExecution-C24aYsk3.mjs";
2
+ import { C as parseEvalDiscovery, h as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-9XKoYcP9.mjs";
3
3
  import { z } from "zod";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { relative } from "node:path";
@@ -991,7 +991,12 @@ function getCaseRowCaseKey(row) {
991
991
  //#region ../shared/src/schemas/config.ts
992
992
  /** Strategy used to collapse repeated trials into one stored case result. */
993
993
  const trialSelectionModeSchema = z.enum(["lowestScore", "median"]);
994
- /** Built-in eval-level output/column keys. */
994
+ /**
995
+ * Built-in eval-level output/column keys.
996
+ *
997
+ * `costUsd` controls the default LLM cost family: actual billed cost plus the
998
+ * normalized `costUsdWithoutCache` and `costUsdWarmedCache` chart outputs.
999
+ */
995
1000
  const defaultConfigKeySchema = z.enum([
996
1001
  "apiCalls",
997
1002
  "costUsd",
@@ -6301,21 +6306,40 @@ const costNumberFormat = {
6301
6306
  prefix: "$",
6302
6307
  maxDecimalPlaces: 4
6303
6308
  };
6304
- const DEFAULT_COLUMNS = {
6305
- apiCalls: {
6306
- label: "API Calls",
6309
+ const DEFAULT_COST_COLUMNS = {
6310
+ costUsd: {
6311
+ label: "Cost",
6307
6312
  format: "number",
6308
- numberFormat: countNumberFormat,
6313
+ numberFormat: costNumberFormat,
6309
6314
  align: "right",
6310
6315
  hideIfNoValue: true
6311
6316
  },
6312
- costUsd: {
6313
- label: "Cost",
6317
+ costUsdWithoutCache: {
6318
+ label: "Cost Without Cache",
6314
6319
  format: "number",
6315
6320
  numberFormat: costNumberFormat,
6316
6321
  align: "right",
6322
+ hideInTable: true,
6317
6323
  hideIfNoValue: true
6318
6324
  },
6325
+ costUsdWarmedCache: {
6326
+ label: "Cost Warmed Cache",
6327
+ format: "number",
6328
+ numberFormat: costNumberFormat,
6329
+ align: "right",
6330
+ hideInTable: true,
6331
+ hideIfNoValue: true
6332
+ }
6333
+ };
6334
+ const DEFAULT_COLUMNS = {
6335
+ apiCalls: {
6336
+ label: "API Calls",
6337
+ format: "number",
6338
+ numberFormat: countNumberFormat,
6339
+ align: "right",
6340
+ hideIfNoValue: true
6341
+ },
6342
+ costUsd: { ...DEFAULT_COST_COLUMNS.costUsd },
6319
6343
  llmTurns: {
6320
6344
  label: "LLM Turns",
6321
6345
  format: "number",
@@ -6389,8 +6413,16 @@ function mergeDefaultColumns(params) {
6389
6413
  };
6390
6414
  return Object.keys(merged).length > 0 ? merged : void 0;
6391
6415
  }
6416
+ const defaults = {};
6417
+ for (const key of activeKeys) {
6418
+ defaults[key] = DEFAULT_COLUMNS[key];
6419
+ if (key === "costUsd") {
6420
+ defaults.costUsdWithoutCache = DEFAULT_COST_COLUMNS.costUsdWithoutCache;
6421
+ defaults.costUsdWarmedCache = DEFAULT_COST_COLUMNS.costUsdWarmedCache;
6422
+ }
6423
+ }
6392
6424
  return {
6393
- ...Object.fromEntries(activeKeys.map((key) => [key, DEFAULT_COLUMNS[key]])),
6425
+ ...defaults,
6394
6426
  ...params.globalColumns,
6395
6427
  ...params.columns
6396
6428
  };
@@ -6445,13 +6477,29 @@ function appendDefaultCharts(params) {
6445
6477
  hideIfNoValue: true,
6446
6478
  dedupeConsecutiveValues: true,
6447
6479
  type: "area",
6448
- metrics: [{
6449
- source: "column",
6450
- key: "costUsd",
6451
- aggregate: "avg",
6452
- label: "Cost",
6453
- color: "warning"
6454
- }]
6480
+ metrics: [
6481
+ {
6482
+ source: "column",
6483
+ key: "costUsd",
6484
+ aggregate: "avg",
6485
+ label: "Actual",
6486
+ color: "warning"
6487
+ },
6488
+ {
6489
+ source: "column",
6490
+ key: "costUsdWithoutCache",
6491
+ aggregate: "avg",
6492
+ label: "Without Cache",
6493
+ color: "error"
6494
+ },
6495
+ {
6496
+ source: "column",
6497
+ key: "costUsdWarmedCache",
6498
+ aggregate: "avg",
6499
+ label: "Warmed Cache",
6500
+ color: "success"
6501
+ }
6502
+ ]
6455
6503
  });
6456
6504
  const inputTokenMetrics = [
6457
6505
  activeKeys.has("inputTokens") ? {
@@ -6538,10 +6586,24 @@ function getMaxLlmTurns(calls) {
6538
6586
  }
6539
6587
  function assignIfMissing(params) {
6540
6588
  if (!params.activeKeys.has(params.key)) return;
6589
+ assignOutputIfMissing({
6590
+ outputs: params.outputs,
6591
+ key: params.key,
6592
+ value: params.value
6593
+ });
6594
+ }
6595
+ function assignOutputIfMissing(params) {
6541
6596
  if (params.key in params.outputs) return;
6542
6597
  if (params.value === void 0) return;
6543
6598
  params.outputs[params.key] = params.value;
6544
6599
  }
6600
+ function sumSimulatedCost(params) {
6601
+ return sumNullable(params.calls.map((call) => simulateLlmCallCost({
6602
+ entry: call,
6603
+ pricing: params.llmCallsConfig.pricing,
6604
+ scenario: params.scenario
6605
+ }).totalCostUsd));
6606
+ }
6545
6607
  function addDefaultOutputs(params) {
6546
6608
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
6547
6609
  if (activeKeys.size === 0) return;
@@ -6566,6 +6628,26 @@ function addDefaultOutputs(params) {
6566
6628
  value: sumNullable(calls.map((call) => call.costUsd)),
6567
6629
  activeKeys
6568
6630
  });
6631
+ if (activeKeys.has("costUsd")) {
6632
+ assignOutputIfMissing({
6633
+ outputs: params.outputs,
6634
+ key: "costUsdWithoutCache",
6635
+ value: sumSimulatedCost({
6636
+ calls,
6637
+ llmCallsConfig: params.llmCallsConfig,
6638
+ scenario: "noCache"
6639
+ })
6640
+ });
6641
+ assignOutputIfMissing({
6642
+ outputs: params.outputs,
6643
+ key: "costUsdWarmedCache",
6644
+ value: sumSimulatedCost({
6645
+ calls,
6646
+ llmCallsConfig: params.llmCallsConfig,
6647
+ scenario: "withBaseCaching"
6648
+ })
6649
+ });
6650
+ }
6569
6651
  assignIfMissing({
6570
6652
  outputs: params.outputs,
6571
6653
  key: "inputTokens",
@@ -1,4 +1,4 @@
1
- import { Ct as buildCaseKey, Dt as caseRowSchema, Et as caseDetailSchema, Ft as runWithEvalRegistry, Tt as getCaseRowCaseKey, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as commitPendingCacheWrites, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, yt as runManifestSchema } from "./runExecution-D-CnSRYy.mjs";
1
+ import { Ct as buildCaseKey, Dt as caseRowSchema, Et as caseDetailSchema, Ft as runWithEvalRegistry, Tt as getCaseRowCaseKey, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as commitPendingCacheWrites, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, yt as runManifestSchema } from "./runExecution-C24aYsk3.mjs";
2
2
  import { Result, resultify } from "t-result";
3
3
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join } from "node:path";
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DwNb5TCb.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-BEQGkHF0.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-CPBIcMP-.mjs";
2
- import "./src-SixIk0b7.mjs";
1
+ import { n as createRunner } from "./cli-CCHcjbC1.mjs";
2
+ import "./src-C8n7QANC.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance({ loadEnv = true } = {}) {
@@ -1,5 +1,5 @@
1
- import { Nt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-D-CnSRYy.mjs";
2
- import "./cli-CPBIcMP-.mjs";
1
+ import { Nt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-C24aYsk3.mjs";
2
+ import "./cli-CCHcjbC1.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.61.2",
3
+ "version": "0.62.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/sdk": "0.0.1",
37
- "@agent-evals/shared": "0.0.1"
36
+ "@agent-evals/shared": "0.0.1",
37
+ "@agent-evals/sdk": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -109,7 +109,7 @@ export async function runRefundWorkflow(input: RefundInput) {
109
109
  }
110
110
  ```
111
111
 
112
- Span `kind` values are open-ended strings. Use familiar kinds such as `agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they fit, and preserve external tracer kinds such as `mastra.workflow.step` when they are more specific. Only the `input` and `output` span attributes are promoted automatically in the trace tree; use `traceDisplay` for other span attributes such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and charts are derived from matching LLM spans by default. Prefer `llmCalls.pricing` for LLM-call cost display; built-in costs ignore span `costUsd` attributes.
112
+ Span `kind` values are open-ended strings. Use familiar kinds such as `agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they fit, and preserve external tracer kinds such as `mastra.workflow.step` when they are more specific. Only the `input` and `output` span attributes are promoted automatically in the trace tree; use `traceDisplay` for other span attributes such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and charts are derived from matching LLM spans by default. Prefer `llmCalls.pricing` for LLM-call cost display; built-in costs ignore span `costUsd` attributes and derive normalized cost outputs for no-cache and warmed-cache chart comparisons.
113
113
 
114
114
  Use `captureEvalSpanError(error)` for recoverable errors on the active `evalTracer.span(...)`, such as optional model/tool failures that fall back and continue. You can pass one error, multiple error arguments, or an array. The span is still marked `error`. Pass `'warning'` or `{ level: 'warning' }` as the final argument for diagnostics that should not change an otherwise successful span's status.
115
115
 
@@ -212,7 +212,7 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape (forma
212
212
  - `tracingAssertions` is a single function that can be authored globally or locally on one eval when a finished-trace invariant should pass or fail the case without creating a fake score column. It receives the same `{ trace, input, case }` context as `deriveFromTracing`; call `evalAssert(...)` or `evalExpect(...)` inside it. Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`, `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`, `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`, `trace.hasToolCallSpan(name)`, `trace.getToolCallSpans(name)`, `trace.getToolCallSpanCount(toolName)`, `trace.hasToolCallSpanCount(toolName, expectedCalls)`, `trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`. The tool-call helpers include both `kind: 'tool'` spans and imported execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts match the span `name` as well as GenAI/Mastra identity attributes such as `genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer those tool identity attributes when present. `getToolCallSpans(name)` returns one normalized object per matching call, including parsed `arguments`, parsed `result`, `description`, `toolType`, `attributes`, and the original `span`.
213
213
  - `traceDisplay` promotes selected span attributes into the trace tree and detail pane; it supports aggregation across subtrees (`scope`, `mode`) and user-defined `transform(...)` for derived views (e.g. currency conversion). See the `TraceDisplayInputConfig` type.
214
214
  - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are summarized for review. Defaults to `kind: 'llm'` spans with `model`, `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional attribute paths. The default `steps` path reads an array from `span.attributes.steps`; if it is missing, direct child `model_step` spans are shown as that call's steps. Tool calls are aggregated from the configured `toolCalls` path plus step-level `toolCalls` on authored step arrays or direct `model_step` child spans, including Mastra's serialized `mastra.model_step.output` format, and child `tool_call` execution spans under each model step. `latencyMs` is time to first token; duration, total tokens, output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter, override `attributes.<field>` for non-default primitive span shapes, configure model-keyed `pricing` to derive USD costs from token counts, with nested `providers` entries for provider-specific rates, add `costCurrencies` to show converted cost columns in the expanded breakdown table only, add `derivedAttributes` to persist computed values back onto matching LLM spans before trace consumers run, and add entries to `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` can be a keyed map for one-off fields or one callback that returns multiple path/value pairs. Derived keys are dot-paths under `span.attributes`; return `undefined` to skip one span or one returned key.
215
- - Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. The web UI fills in baseline run-health stats (`cases`, `passRate`, `duration`) and a pass-rate/duration history chart when an eval has not already authored equivalent run-health UI. If discovery metadata is missing but saved runs contain runtime columns such as `costUsd`, `inputTokens`, or `apiCalls`, the single-eval page can infer the standard usage stats and charts from those saved run values. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
215
+ - Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `costUsdWithoutCache`, `costUsdWarmedCache`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. The web UI fills in baseline run-health stats (`cases`, `passRate`, `duration`) and a pass-rate/duration history chart when an eval has not already authored equivalent run-health UI. If discovery metadata is missing but saved runs contain runtime columns such as `costUsd`, `inputTokens`, or `apiCalls`, the single-eval page can infer the standard usage stats and charts from those saved run values. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. The default LLM cost chart compares actual billed cost, no-cache normalized cost, and warmed-cache normalized cost. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `costUsdWithoutCache` bills all input tokens at the base input rate; `costUsdWarmedCache` treats cacheable tokens as warmed cache reads. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`; removing `costUsd` removes the default cost family.
216
216
  - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and `'fetch'` spans with `method`, `url`, `statusCode`, `request`, `routeAlias`, `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read from conventional attribute paths. Override `kinds` or `attributes.<field>` for external tracers. Set a per-span `routeAlias` attribute such as `/v3/tabs/:id` to group dynamic URL paths in API-call route labels and endpoint charts while preserving original URLs in row details. Add `derivedAttributes` as a keyed map or object-returning callback for computed persisted API span attributes, and add `metrics` with the same formats and placements as LLM-call metrics.
217
217
  - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use `runLogs: { captureConsole: false }` to keep console output in the terminal without persisting console calls to case details. Manual `evalLog(...)` calls are still captured. Captured log locations store the selected user-facing source frame and the full JavaScript stack so agents can inspect additional frames in persisted artifacts when diagnosing where a log came from.
218
218