@agentv/core 2.8.0-next.1 → 2.9.0-next.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  isEvaluatorKind,
8
8
  resolveFileReference
9
- } from "../../chunk-P2465XAH.js";
9
+ } from "../../chunk-7Q4PH265.js";
10
10
 
11
11
  // src/evaluation/validation/file-type.ts
12
12
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1686,14 +1686,16 @@ function computeTraceSummary(messages) {
1686
1686
  }
1687
1687
  const toolNames = Object.keys(toolCallCounts).sort();
1688
1688
  return {
1689
- eventCount: totalToolCalls,
1690
- toolNames,
1691
- toolCallsByName: toolCallCounts,
1692
- errorCount: 0,
1689
+ trace: {
1690
+ eventCount: totalToolCalls,
1691
+ toolNames,
1692
+ toolCallsByName: toolCallCounts,
1693
+ errorCount: 0,
1694
+ llmCallCount,
1695
+ ...hasAnyDuration ? { toolDurations } : {}
1696
+ },
1693
1697
  startTime: earliestStart?.toISOString(),
1694
- endTime: latestEnd?.toISOString(),
1695
- llmCallCount,
1696
- ...hasAnyDuration ? { toolDurations } : {}
1698
+ endTime: latestEnd?.toISOString()
1697
1699
  };
1698
1700
  }
1699
1701
  var DEFAULT_EXPLORATION_TOOLS = [
@@ -1716,9 +1718,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
1716
1718
  );
1717
1719
  return explorationCalls / summary.eventCount;
1718
1720
  }
1719
- function tokensPerTool(summary) {
1720
- if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
1721
- const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
1721
+ function tokensPerTool(summary, tokenUsage) {
1722
+ if (!tokenUsage || summary.eventCount === 0) return void 0;
1723
+ const totalTokens = tokenUsage.input + tokenUsage.output;
1722
1724
  return totalTokens / summary.eventCount;
1723
1725
  }
1724
1726
  function avgToolDurationMs(summary) {
@@ -1734,16 +1736,15 @@ function avgToolDurationMs(summary) {
1734
1736
  if (totalCalls === 0) return void 0;
1735
1737
  return totalDuration / totalCalls;
1736
1738
  }
1737
- function mergeExecutionMetrics(summary, metrics) {
1738
- if (!metrics) return summary;
1739
+ function mergeExecutionMetrics(computed, metrics) {
1740
+ if (!metrics) return computed;
1739
1741
  return {
1740
- ...summary,
1742
+ trace: computed.trace,
1741
1743
  tokenUsage: metrics.tokenUsage,
1742
1744
  costUsd: metrics.costUsd,
1743
1745
  durationMs: metrics.durationMs,
1744
- // Provider-level timing takes precedence over span-derived timing
1745
- startTime: metrics.startTime ?? summary.startTime,
1746
- endTime: metrics.endTime ?? summary.endTime
1746
+ startTime: metrics.startTime ?? computed.startTime,
1747
+ endTime: metrics.endTime ?? computed.endTime
1747
1748
  };
1748
1749
  }
1749
1750
 
@@ -4815,10 +4816,13 @@ async function invokeModel(options) {
4815
4816
  }
4816
4817
  function mapResponse(result) {
4817
4818
  const content = result.text ?? "";
4819
+ const rawUsage = result.totalUsage ?? result.usage;
4820
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
4818
4821
  return {
4819
4822
  raw: result,
4820
- usage: toJsonObject(result.totalUsage ?? result.usage),
4821
- output: [{ role: "assistant", content }]
4823
+ usage: toJsonObject(rawUsage),
4824
+ output: [{ role: "assistant", content }],
4825
+ tokenUsage
4822
4826
  };
4823
4827
  }
4824
4828
  function toJsonObject(value) {
@@ -11208,6 +11212,8 @@ async function createTargetProxy(options) {
11208
11212
  const token = (0, import_node_crypto7.randomBytes)(32).toString("hex");
11209
11213
  let callCount = 0;
11210
11214
  let isShutdown = false;
11215
+ let totalInputTokens = 0;
11216
+ let totalOutputTokens = 0;
11211
11217
  const targetsList = availableTargets ?? [defaultProvider.targetName];
11212
11218
  function resolveProvider(targetName) {
11213
11219
  if (targetName === void 0 || targetName === defaultProvider.targetName) {
@@ -11286,11 +11292,16 @@ async function createTargetProxy(options) {
11286
11292
  evalCaseId: request.evalCaseId ?? "proxy",
11287
11293
  attempt: request.attempt ?? 1
11288
11294
  });
11295
+ if (response.tokenUsage) {
11296
+ totalInputTokens += response.tokenUsage.input;
11297
+ totalOutputTokens += response.tokenUsage.output;
11298
+ }
11289
11299
  const output = response.output ?? [];
11290
11300
  const rawText = extractLastAssistantContent(output);
11291
11301
  const result = {
11292
11302
  output,
11293
- rawText
11303
+ rawText,
11304
+ tokenUsage: response.tokenUsage
11294
11305
  };
11295
11306
  sendJson(res, 200, result);
11296
11307
  } catch (error) {
@@ -11337,10 +11348,15 @@ async function createTargetProxy(options) {
11337
11348
  evalCaseId: request.evalCaseId ?? "proxy",
11338
11349
  attempt: request.attempt ?? 1
11339
11350
  });
11351
+ if (response.tokenUsage) {
11352
+ totalInputTokens += response.tokenUsage.input;
11353
+ totalOutputTokens += response.tokenUsage.output;
11354
+ }
11340
11355
  const output = response.output ?? [];
11341
11356
  responses.push({
11342
11357
  output,
11343
- rawText: extractLastAssistantContent(output)
11358
+ rawText: extractLastAssistantContent(output),
11359
+ tokenUsage: response.tokenUsage
11344
11360
  });
11345
11361
  } catch (error) {
11346
11362
  const message = error instanceof Error ? error.message : String(error);
@@ -11379,7 +11395,8 @@ async function createTargetProxy(options) {
11379
11395
  },
11380
11396
  getUsageMetadata: () => ({
11381
11397
  callCount,
11382
- maxCalls
11398
+ maxCalls,
11399
+ tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
11383
11400
  })
11384
11401
  };
11385
11402
  }
@@ -11504,6 +11521,11 @@ var CodeEvaluator = class {
11504
11521
  ),
11505
11522
  input: context2.evalCase.input,
11506
11523
  trace: context2.trace ?? null,
11524
+ tokenUsage: context2.tokenUsage ?? null,
11525
+ costUsd: context2.costUsd ?? null,
11526
+ durationMs: context2.durationMs ?? null,
11527
+ startTime: context2.startTime ?? null,
11528
+ endTime: context2.endTime ?? null,
11507
11529
  fileChanges: context2.fileChanges ?? null,
11508
11530
  workspacePath: context2.workspacePath ?? null,
11509
11531
  config: this.config ?? null
@@ -11562,7 +11584,8 @@ var CodeEvaluator = class {
11562
11584
  expectedAspectCount: hits.length + misses.length || 1,
11563
11585
  reasoning,
11564
11586
  evaluatorRawRequest,
11565
- ...details ? { details } : {}
11587
+ ...details ? { details } : {},
11588
+ tokenUsage: proxyUsage?.tokenUsage
11566
11589
  };
11567
11590
  } catch (error) {
11568
11591
  const message = error instanceof Error ? error.message : String(error);
@@ -11584,7 +11607,8 @@ var CodeEvaluator = class {
11584
11607
  }
11585
11608
  } : {},
11586
11609
  error: message
11587
- }
11610
+ },
11611
+ tokenUsage: proxyUsage?.tokenUsage
11588
11612
  };
11589
11613
  } finally {
11590
11614
  if (proxyShutdown) {
@@ -11748,7 +11772,7 @@ ${context2.fileChanges}`;
11748
11772
  target: judgeProvider.targetName
11749
11773
  };
11750
11774
  try {
11751
- const { data } = await this.runWithRetry({
11775
+ const { data, tokenUsage } = await this.runWithRetry({
11752
11776
  context: context2,
11753
11777
  judgeProvider,
11754
11778
  systemPrompt,
@@ -11767,7 +11791,8 @@ ${context2.fileChanges}`;
11767
11791
  misses,
11768
11792
  expectedAspectCount,
11769
11793
  reasoning,
11770
- evaluatorRawRequest
11794
+ evaluatorRawRequest,
11795
+ tokenUsage
11771
11796
  };
11772
11797
  } catch {
11773
11798
  return {
@@ -11797,7 +11822,7 @@ ${context2.fileChanges}`;
11797
11822
  systemPrompt,
11798
11823
  target: judgeProvider.targetName
11799
11824
  };
11800
- const { data } = await this.runWithRetry({
11825
+ const { data, tokenUsage } = await this.runWithRetry({
11801
11826
  context: context2,
11802
11827
  judgeProvider,
11803
11828
  systemPrompt,
@@ -11812,7 +11837,8 @@ ${context2.fileChanges}`;
11812
11837
  misses,
11813
11838
  expectedAspectCount: rubrics.length,
11814
11839
  reasoning: data.overall_reasoning,
11815
- evaluatorRawRequest
11840
+ evaluatorRawRequest,
11841
+ tokenUsage
11816
11842
  };
11817
11843
  }
11818
11844
  /**
@@ -11827,7 +11853,7 @@ ${context2.fileChanges}`;
11827
11853
  systemPrompt,
11828
11854
  target: judgeProvider.targetName
11829
11855
  };
11830
- const { data } = await this.runWithRetry({
11856
+ const { data, tokenUsage } = await this.runWithRetry({
11831
11857
  context: context2,
11832
11858
  judgeProvider,
11833
11859
  systemPrompt,
@@ -11843,7 +11869,8 @@ ${context2.fileChanges}`;
11843
11869
  expectedAspectCount: rubrics.length,
11844
11870
  reasoning: data.overall_reasoning,
11845
11871
  evaluatorRawRequest,
11846
- details
11872
+ details,
11873
+ tokenUsage
11847
11874
  };
11848
11875
  }
11849
11876
  /**
@@ -11927,15 +11954,17 @@ ${context2.fileChanges}`;
11927
11954
  try {
11928
11955
  const model = judgeProvider.asLanguageModel?.();
11929
11956
  if (model) {
11930
- const { text } = await (0, import_ai2.generateText)({
11957
+ const result = await (0, import_ai2.generateText)({
11931
11958
  model,
11932
11959
  system: systemPrompt,
11933
11960
  prompt: userPrompt,
11934
11961
  ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11935
11962
  ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11936
11963
  });
11937
- const data2 = schema.parse(parseJsonFromText(text));
11938
- return { data: data2 };
11964
+ const data2 = schema.parse(parseJsonFromText(result.text));
11965
+ const rawUsage = result.usage;
11966
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
11967
+ return { data: data2, tokenUsage };
11939
11968
  }
11940
11969
  const response = await judgeProvider.invoke({
11941
11970
  question: userPrompt,
@@ -11946,7 +11975,7 @@ ${context2.fileChanges}`;
11946
11975
  temperature: this.temperature
11947
11976
  });
11948
11977
  const data = schema.parse(parseJsonFromText(extractLastAssistantContent2(response.output)));
11949
- return { data, providerResponse: response };
11978
+ return { data, providerResponse: response, tokenUsage: response.tokenUsage };
11950
11979
  } catch (e) {
11951
11980
  lastError = e instanceof Error ? e : new Error(String(e));
11952
11981
  }
@@ -12152,7 +12181,8 @@ var CompositeEvaluator = class {
12152
12181
  reasoning: member.result.reasoning,
12153
12182
  evaluatorRawRequest: member.result.evaluatorRawRequest,
12154
12183
  scores: member.result.scores,
12155
- details: member.result.details
12184
+ details: member.result.details,
12185
+ tokenUsage: member.result.tokenUsage
12156
12186
  });
12157
12187
  }
12158
12188
  const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
@@ -12200,7 +12230,8 @@ var CompositeEvaluator = class {
12200
12230
  reasoning: member.result.reasoning,
12201
12231
  evaluatorRawRequest: member.result.evaluatorRawRequest,
12202
12232
  scores: member.result.scores,
12203
- details: member.result.details
12233
+ details: member.result.details,
12234
+ tokenUsage: member.result.tokenUsage
12204
12235
  });
12205
12236
  }
12206
12237
  const totalCount = results.length;
@@ -12379,7 +12410,7 @@ var CostEvaluator = class {
12379
12410
  }
12380
12411
  evaluate(context2) {
12381
12412
  const { budget } = this.config;
12382
- const costUsd = context2.trace?.costUsd;
12413
+ const costUsd = context2.costUsd;
12383
12414
  if (costUsd === void 0) {
12384
12415
  return {
12385
12416
  score: 0,
@@ -12422,7 +12453,7 @@ var ExecutionMetricsEvaluator = class {
12422
12453
  this.config = options.config;
12423
12454
  }
12424
12455
  evaluate(context2) {
12425
- const { trace: trace2 } = context2;
12456
+ const { trace: trace2, tokenUsage, costUsd, durationMs } = context2;
12426
12457
  const {
12427
12458
  max_tool_calls,
12428
12459
  max_llm_calls,
@@ -12432,7 +12463,8 @@ var ExecutionMetricsEvaluator = class {
12432
12463
  target_exploration_ratio,
12433
12464
  exploration_tolerance = 0.2
12434
12465
  } = this.config;
12435
- if (!trace2) {
12466
+ const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
12467
+ if (needsTrace && !trace2) {
12436
12468
  return {
12437
12469
  score: 0,
12438
12470
  verdict: "fail",
@@ -12447,11 +12479,12 @@ var ExecutionMetricsEvaluator = class {
12447
12479
  }
12448
12480
  };
12449
12481
  }
12482
+ const narrowedTrace = trace2;
12450
12483
  const hits = [];
12451
12484
  const misses = [];
12452
12485
  const actualMetrics = {};
12453
- if (max_tool_calls !== void 0) {
12454
- const toolCalls = trace2.eventCount;
12486
+ if (max_tool_calls !== void 0 && narrowedTrace) {
12487
+ const toolCalls = narrowedTrace.eventCount;
12455
12488
  actualMetrics.tool_calls = toolCalls;
12456
12489
  if (toolCalls <= max_tool_calls) {
12457
12490
  hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
@@ -12459,8 +12492,8 @@ var ExecutionMetricsEvaluator = class {
12459
12492
  misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
12460
12493
  }
12461
12494
  }
12462
- if (max_llm_calls !== void 0) {
12463
- const llmCalls = trace2.llmCallCount;
12495
+ if (max_llm_calls !== void 0 && narrowedTrace) {
12496
+ const llmCalls = narrowedTrace.llmCallCount;
12464
12497
  if (llmCalls === void 0) {
12465
12498
  misses.push("LLM call count data not available");
12466
12499
  } else {
@@ -12473,7 +12506,6 @@ var ExecutionMetricsEvaluator = class {
12473
12506
  }
12474
12507
  }
12475
12508
  if (max_tokens !== void 0) {
12476
- const tokenUsage = trace2.tokenUsage;
12477
12509
  if (!tokenUsage) {
12478
12510
  misses.push("Token usage data not available");
12479
12511
  } else {
@@ -12487,7 +12519,6 @@ var ExecutionMetricsEvaluator = class {
12487
12519
  }
12488
12520
  }
12489
12521
  if (max_cost_usd !== void 0) {
12490
- const costUsd = trace2.costUsd;
12491
12522
  if (costUsd === void 0) {
12492
12523
  misses.push("Cost data not available");
12493
12524
  } else {
@@ -12501,7 +12532,6 @@ var ExecutionMetricsEvaluator = class {
12501
12532
  }
12502
12533
  }
12503
12534
  if (max_duration_ms !== void 0) {
12504
- const durationMs = trace2.durationMs;
12505
12535
  if (durationMs === void 0) {
12506
12536
  misses.push("Duration data not available");
12507
12537
  } else {
@@ -12513,8 +12543,8 @@ var ExecutionMetricsEvaluator = class {
12513
12543
  }
12514
12544
  }
12515
12545
  }
12516
- if (target_exploration_ratio !== void 0) {
12517
- const ratio = explorationRatio(trace2);
12546
+ if (target_exploration_ratio !== void 0 && narrowedTrace) {
12547
+ const ratio = explorationRatio(narrowedTrace);
12518
12548
  if (ratio === void 0) {
12519
12549
  misses.push("Exploration ratio not available (no tool calls)");
12520
12550
  } else {
@@ -13028,7 +13058,7 @@ var LatencyEvaluator = class {
13028
13058
  }
13029
13059
  evaluate(context2) {
13030
13060
  const { threshold } = this.config;
13031
- const durationMs = context2.trace?.durationMs;
13061
+ const durationMs = context2.durationMs;
13032
13062
  if (durationMs === void 0) {
13033
13063
  return {
13034
13064
  score: 0,
@@ -13673,7 +13703,7 @@ var TokenUsageEvaluator = class {
13673
13703
  this.config = options.config;
13674
13704
  }
13675
13705
  evaluate(context2) {
13676
- const usage = context2.trace?.tokenUsage;
13706
+ const usage = context2.tokenUsage;
13677
13707
  const maxTotal = this.config.max_total;
13678
13708
  const maxInput = this.config.max_input;
13679
13709
  const maxOutput = this.config.max_output;
@@ -15366,7 +15396,7 @@ async function runEvaluation(options) {
15366
15396
  caseCost = trialCostSum;
15367
15397
  }
15368
15398
  } else {
15369
- caseCost = result.trace?.costUsd;
15399
+ caseCost = result.costUsd;
15370
15400
  }
15371
15401
  if (caseCost !== void 0) {
15372
15402
  cumulativeBudgetCost += caseCost;
@@ -15527,17 +15557,18 @@ async function runBatchEvaluation(options) {
15527
15557
  const providerResponse = batchResponse[i];
15528
15558
  const output = providerResponse.output;
15529
15559
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
15530
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
15531
- eventCount: 0,
15532
- toolNames: [],
15533
- toolCallsByName: {},
15534
- errorCount: 0
15535
- } : void 0;
15536
- const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
15560
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
15561
+ const merged = computed ? mergeExecutionMetrics(computed, {
15537
15562
  tokenUsage: providerResponse.tokenUsage,
15538
15563
  costUsd: providerResponse.costUsd,
15539
15564
  durationMs: providerResponse.durationMs
15540
15565
  }) : void 0;
15566
+ const trace2 = merged?.trace;
15567
+ const costUsd = merged?.costUsd;
15568
+ const durationMs = merged?.durationMs;
15569
+ const tokenUsage = merged?.tokenUsage;
15570
+ const startTime = merged?.startTime;
15571
+ const endTime = merged?.endTime;
15541
15572
  const candidate = extractLastAssistantContent2(output);
15542
15573
  const providerError = extractProviderError(providerResponse);
15543
15574
  let result;
@@ -15556,6 +15587,11 @@ async function runBatchEvaluation(options) {
15556
15587
  agentTimeoutMs,
15557
15588
  output,
15558
15589
  trace: trace2,
15590
+ costUsd,
15591
+ durationMs,
15592
+ tokenUsage,
15593
+ startTime,
15594
+ endTime,
15559
15595
  targetResolver,
15560
15596
  availableTargets
15561
15597
  });
@@ -15792,17 +15828,18 @@ async function runEvalCase(options) {
15792
15828
  }
15793
15829
  const output = providerResponse.output;
15794
15830
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
15795
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
15796
- eventCount: 0,
15797
- toolNames: [],
15798
- toolCallsByName: {},
15799
- errorCount: 0
15800
- } : void 0;
15801
- const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
15831
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
15832
+ const merged = computed ? mergeExecutionMetrics(computed, {
15802
15833
  tokenUsage: providerResponse.tokenUsage,
15803
15834
  costUsd: providerResponse.costUsd,
15804
15835
  durationMs: providerResponse.durationMs
15805
15836
  }) : void 0;
15837
+ const trace2 = merged?.trace;
15838
+ const costUsd = merged?.costUsd;
15839
+ const durationMs = merged?.durationMs;
15840
+ const tokenUsage = merged?.tokenUsage;
15841
+ const startTime = merged?.startTime;
15842
+ const endTime = merged?.endTime;
15806
15843
  const candidate = extractLastAssistantContent2(output);
15807
15844
  let fileChanges;
15808
15845
  if (baselineCommit && workspacePath) {
@@ -15847,6 +15884,11 @@ async function runEvalCase(options) {
15847
15884
  agentTimeoutMs,
15848
15885
  output,
15849
15886
  trace: trace2,
15887
+ costUsd,
15888
+ durationMs,
15889
+ tokenUsage,
15890
+ startTime,
15891
+ endTime,
15850
15892
  targetResolver,
15851
15893
  availableTargets,
15852
15894
  fileChanges,
@@ -15903,7 +15945,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
15903
15945
  };
15904
15946
  const result = await runEvalCase(trialOptions);
15905
15947
  allResults.push(result);
15906
- const trialCost = result.trace?.costUsd;
15948
+ const trialCost = result.costUsd;
15907
15949
  const trialVerdict = scoreToVerdict(result.score);
15908
15950
  const trial = {
15909
15951
  attempt,
@@ -15959,6 +16001,11 @@ async function evaluateCandidate(options) {
15959
16001
  agentTimeoutMs,
15960
16002
  output,
15961
16003
  trace: trace2,
16004
+ costUsd,
16005
+ durationMs,
16006
+ tokenUsage,
16007
+ startTime,
16008
+ endTime,
15962
16009
  targetResolver,
15963
16010
  availableTargets,
15964
16011
  fileChanges,
@@ -15979,6 +16026,11 @@ async function evaluateCandidate(options) {
15979
16026
  agentTimeoutMs,
15980
16027
  output,
15981
16028
  trace: trace2,
16029
+ costUsd,
16030
+ durationMs,
16031
+ tokenUsage,
16032
+ startTime,
16033
+ endTime,
15982
16034
  targetResolver,
15983
16035
  availableTargets,
15984
16036
  fileChanges,
@@ -16022,6 +16074,11 @@ async function evaluateCandidate(options) {
16022
16074
  answer: candidate,
16023
16075
  target: target.name,
16024
16076
  reasoning: score.reasoning,
16077
+ tokenUsage,
16078
+ costUsd,
16079
+ durationMs,
16080
+ startTime,
16081
+ endTime,
16025
16082
  requests,
16026
16083
  input,
16027
16084
  scores,
@@ -16045,6 +16102,11 @@ async function runEvaluatorsForCase(options) {
16045
16102
  agentTimeoutMs,
16046
16103
  output,
16047
16104
  trace: trace2,
16105
+ costUsd,
16106
+ durationMs,
16107
+ tokenUsage,
16108
+ startTime,
16109
+ endTime,
16048
16110
  targetResolver,
16049
16111
  availableTargets,
16050
16112
  fileChanges,
@@ -16066,6 +16128,11 @@ async function runEvaluatorsForCase(options) {
16066
16128
  agentTimeoutMs,
16067
16129
  output,
16068
16130
  trace: trace2,
16131
+ costUsd,
16132
+ durationMs,
16133
+ tokenUsage,
16134
+ startTime,
16135
+ endTime,
16069
16136
  targetResolver,
16070
16137
  availableTargets,
16071
16138
  fileChanges,
@@ -16088,6 +16155,11 @@ async function runEvaluatorsForCase(options) {
16088
16155
  judgeProvider,
16089
16156
  output,
16090
16157
  trace: trace2,
16158
+ tokenUsage,
16159
+ costUsd,
16160
+ durationMs,
16161
+ startTime,
16162
+ endTime,
16091
16163
  targetResolver,
16092
16164
  availableTargets,
16093
16165
  fileChanges,
@@ -16111,6 +16183,11 @@ async function runEvaluatorList(options) {
16111
16183
  agentTimeoutMs,
16112
16184
  output,
16113
16185
  trace: trace2,
16186
+ costUsd,
16187
+ durationMs,
16188
+ tokenUsage,
16189
+ startTime,
16190
+ endTime,
16114
16191
  targetResolver,
16115
16192
  availableTargets,
16116
16193
  fileChanges,
@@ -16129,6 +16206,11 @@ async function runEvaluatorList(options) {
16129
16206
  judgeProvider,
16130
16207
  output,
16131
16208
  trace: trace2,
16209
+ tokenUsage,
16210
+ costUsd,
16211
+ durationMs,
16212
+ startTime,
16213
+ endTime,
16132
16214
  targetResolver,
16133
16215
  availableTargets,
16134
16216
  fileChanges,
@@ -16168,7 +16250,8 @@ async function runEvaluatorList(options) {
16168
16250
  reasoning: score2.reasoning,
16169
16251
  evaluatorProviderRequest: score2.evaluatorRawRequest,
16170
16252
  details: score2.details,
16171
- scores: mapChildResults(score2.scores)
16253
+ scores: mapChildResults(score2.scores),
16254
+ tokenUsage: score2.tokenUsage
16172
16255
  });
16173
16256
  } catch (error) {
16174
16257
  const message = error instanceof Error ? error.message : String(error);
@@ -16416,7 +16499,8 @@ function mapChildResults(children) {
16416
16499
  reasoning: child.reasoning,
16417
16500
  evaluatorProviderRequest: child.evaluatorRawRequest,
16418
16501
  scores: mapChildResults(child.scores),
16419
- details: child.details
16502
+ details: child.details,
16503
+ tokenUsage: child.tokenUsage
16420
16504
  }));
16421
16505
  }
16422
16506
  function computeWeightedMean(entries) {
@@ -16796,7 +16880,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
16796
16880
  "beforeEachOutput",
16797
16881
  "afterAllOutput",
16798
16882
  "afterEachOutput",
16799
- "fileChanges"
16883
+ "fileChanges",
16884
+ // Promoted execution metrics (debug, not needed for regression comparison)
16885
+ "tokenUsage",
16886
+ "costUsd",
16887
+ "durationMs",
16888
+ "startTime",
16889
+ "endTime"
16800
16890
  ]);
16801
16891
  var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
16802
16892
  function trimEvaluatorResult(result) {
@@ -16919,8 +17009,8 @@ var OtelTraceExporter = class {
16919
17009
  const api = this.api;
16920
17010
  const tracer = this.tracer;
16921
17011
  const captureContent = this.options.captureContent ?? false;
16922
- const startHr = toHrTime(result.trace?.startTime ?? result.timestamp);
16923
- const endHr = toHrTime(result.trace?.endTime ?? result.timestamp);
17012
+ const startHr = toHrTime(result.startTime ?? result.timestamp);
17013
+ const endHr = toHrTime(result.endTime ?? result.timestamp);
16924
17014
  let parentCtx = api.ROOT_CONTEXT;
16925
17015
  const traceparent = process.env.TRACEPARENT;
16926
17016
  if (traceparent && this.W3CPropagator) {
@@ -16949,12 +17039,13 @@ var OtelTraceExporter = class {
16949
17039
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
16950
17040
  rootSpan.setAttribute("agentv.score", result.score);
16951
17041
  if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
17042
+ if (result.durationMs != null)
17043
+ rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
17044
+ if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
16952
17045
  if (result.trace) {
16953
17046
  const t = result.trace;
16954
17047
  rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
16955
17048
  rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
16956
- if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
16957
- if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
16958
17049
  if (t.llmCallCount != null)
16959
17050
  rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
16960
17051
  }