@agentv/core 2.8.0-next.1 → 2.9.0-next.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-P2465XAH.js";
20
+ } from "./chunk-7Q4PH265.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -83,14 +83,16 @@ function computeTraceSummary(messages) {
83
83
  }
84
84
  const toolNames = Object.keys(toolCallCounts).sort();
85
85
  return {
86
- eventCount: totalToolCalls,
87
- toolNames,
88
- toolCallsByName: toolCallCounts,
89
- errorCount: 0,
86
+ trace: {
87
+ eventCount: totalToolCalls,
88
+ toolNames,
89
+ toolCallsByName: toolCallCounts,
90
+ errorCount: 0,
91
+ llmCallCount,
92
+ ...hasAnyDuration ? { toolDurations } : {}
93
+ },
90
94
  startTime: earliestStart?.toISOString(),
91
- endTime: latestEnd?.toISOString(),
92
- llmCallCount,
93
- ...hasAnyDuration ? { toolDurations } : {}
95
+ endTime: latestEnd?.toISOString()
94
96
  };
95
97
  }
96
98
  var DEFAULT_EXPLORATION_TOOLS = [
@@ -113,9 +115,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
113
115
  );
114
116
  return explorationCalls / summary.eventCount;
115
117
  }
116
- function tokensPerTool(summary) {
117
- if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
118
- const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
118
+ function tokensPerTool(summary, tokenUsage) {
119
+ if (!tokenUsage || summary.eventCount === 0) return void 0;
120
+ const totalTokens = tokenUsage.input + tokenUsage.output;
119
121
  return totalTokens / summary.eventCount;
120
122
  }
121
123
  function avgToolDurationMs(summary) {
@@ -131,16 +133,15 @@ function avgToolDurationMs(summary) {
131
133
  if (totalCalls === 0) return void 0;
132
134
  return totalDuration / totalCalls;
133
135
  }
134
- function mergeExecutionMetrics(summary, metrics) {
135
- if (!metrics) return summary;
136
+ function mergeExecutionMetrics(computed, metrics) {
137
+ if (!metrics) return computed;
136
138
  return {
137
- ...summary,
139
+ trace: computed.trace,
138
140
  tokenUsage: metrics.tokenUsage,
139
141
  costUsd: metrics.costUsd,
140
142
  durationMs: metrics.durationMs,
141
- // Provider-level timing takes precedence over span-derived timing
142
- startTime: metrics.startTime ?? summary.startTime,
143
- endTime: metrics.endTime ?? summary.endTime
143
+ startTime: metrics.startTime ?? computed.startTime,
144
+ endTime: metrics.endTime ?? computed.endTime
144
145
  };
145
146
  }
146
147
 
@@ -3097,10 +3098,13 @@ async function invokeModel(options) {
3097
3098
  }
3098
3099
  function mapResponse(result) {
3099
3100
  const content = result.text ?? "";
3101
+ const rawUsage = result.totalUsage ?? result.usage;
3102
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
3100
3103
  return {
3101
3104
  raw: result,
3102
- usage: toJsonObject(result.totalUsage ?? result.usage),
3103
- output: [{ role: "assistant", content }]
3105
+ usage: toJsonObject(rawUsage),
3106
+ output: [{ role: "assistant", content }],
3107
+ tokenUsage
3104
3108
  };
3105
3109
  }
3106
3110
  function toJsonObject(value) {
@@ -8393,6 +8397,8 @@ async function createTargetProxy(options) {
8393
8397
  const token = randomBytes(32).toString("hex");
8394
8398
  let callCount = 0;
8395
8399
  let isShutdown = false;
8400
+ let totalInputTokens = 0;
8401
+ let totalOutputTokens = 0;
8396
8402
  const targetsList = availableTargets ?? [defaultProvider.targetName];
8397
8403
  function resolveProvider(targetName) {
8398
8404
  if (targetName === void 0 || targetName === defaultProvider.targetName) {
@@ -8471,11 +8477,16 @@ async function createTargetProxy(options) {
8471
8477
  evalCaseId: request.evalCaseId ?? "proxy",
8472
8478
  attempt: request.attempt ?? 1
8473
8479
  });
8480
+ if (response.tokenUsage) {
8481
+ totalInputTokens += response.tokenUsage.input;
8482
+ totalOutputTokens += response.tokenUsage.output;
8483
+ }
8474
8484
  const output = response.output ?? [];
8475
8485
  const rawText = extractLastAssistantContent2(output);
8476
8486
  const result = {
8477
8487
  output,
8478
- rawText
8488
+ rawText,
8489
+ tokenUsage: response.tokenUsage
8479
8490
  };
8480
8491
  sendJson(res, 200, result);
8481
8492
  } catch (error) {
@@ -8522,10 +8533,15 @@ async function createTargetProxy(options) {
8522
8533
  evalCaseId: request.evalCaseId ?? "proxy",
8523
8534
  attempt: request.attempt ?? 1
8524
8535
  });
8536
+ if (response.tokenUsage) {
8537
+ totalInputTokens += response.tokenUsage.input;
8538
+ totalOutputTokens += response.tokenUsage.output;
8539
+ }
8525
8540
  const output = response.output ?? [];
8526
8541
  responses.push({
8527
8542
  output,
8528
- rawText: extractLastAssistantContent2(output)
8543
+ rawText: extractLastAssistantContent2(output),
8544
+ tokenUsage: response.tokenUsage
8529
8545
  });
8530
8546
  } catch (error) {
8531
8547
  const message = error instanceof Error ? error.message : String(error);
@@ -8564,7 +8580,8 @@ async function createTargetProxy(options) {
8564
8580
  },
8565
8581
  getUsageMetadata: () => ({
8566
8582
  callCount,
8567
- maxCalls
8583
+ maxCalls,
8584
+ tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
8568
8585
  })
8569
8586
  };
8570
8587
  }
@@ -8689,6 +8706,11 @@ var CodeEvaluator = class {
8689
8706
  ),
8690
8707
  input: context.evalCase.input,
8691
8708
  trace: context.trace ?? null,
8709
+ tokenUsage: context.tokenUsage ?? null,
8710
+ costUsd: context.costUsd ?? null,
8711
+ durationMs: context.durationMs ?? null,
8712
+ startTime: context.startTime ?? null,
8713
+ endTime: context.endTime ?? null,
8692
8714
  fileChanges: context.fileChanges ?? null,
8693
8715
  workspacePath: context.workspacePath ?? null,
8694
8716
  config: this.config ?? null
@@ -8747,7 +8769,8 @@ var CodeEvaluator = class {
8747
8769
  expectedAspectCount: hits.length + misses.length || 1,
8748
8770
  reasoning,
8749
8771
  evaluatorRawRequest,
8750
- ...details ? { details } : {}
8772
+ ...details ? { details } : {},
8773
+ tokenUsage: proxyUsage?.tokenUsage
8751
8774
  };
8752
8775
  } catch (error) {
8753
8776
  const message = error instanceof Error ? error.message : String(error);
@@ -8769,7 +8792,8 @@ var CodeEvaluator = class {
8769
8792
  }
8770
8793
  } : {},
8771
8794
  error: message
8772
- }
8795
+ },
8796
+ tokenUsage: proxyUsage?.tokenUsage
8773
8797
  };
8774
8798
  } finally {
8775
8799
  if (proxyShutdown) {
@@ -8904,7 +8928,7 @@ ${context.fileChanges}`;
8904
8928
  target: judgeProvider.targetName
8905
8929
  };
8906
8930
  try {
8907
- const { data } = await this.runWithRetry({
8931
+ const { data, tokenUsage } = await this.runWithRetry({
8908
8932
  context,
8909
8933
  judgeProvider,
8910
8934
  systemPrompt,
@@ -8923,7 +8947,8 @@ ${context.fileChanges}`;
8923
8947
  misses,
8924
8948
  expectedAspectCount,
8925
8949
  reasoning,
8926
- evaluatorRawRequest
8950
+ evaluatorRawRequest,
8951
+ tokenUsage
8927
8952
  };
8928
8953
  } catch {
8929
8954
  return {
@@ -8953,7 +8978,7 @@ ${context.fileChanges}`;
8953
8978
  systemPrompt,
8954
8979
  target: judgeProvider.targetName
8955
8980
  };
8956
- const { data } = await this.runWithRetry({
8981
+ const { data, tokenUsage } = await this.runWithRetry({
8957
8982
  context,
8958
8983
  judgeProvider,
8959
8984
  systemPrompt,
@@ -8968,7 +8993,8 @@ ${context.fileChanges}`;
8968
8993
  misses,
8969
8994
  expectedAspectCount: rubrics.length,
8970
8995
  reasoning: data.overall_reasoning,
8971
- evaluatorRawRequest
8996
+ evaluatorRawRequest,
8997
+ tokenUsage
8972
8998
  };
8973
8999
  }
8974
9000
  /**
@@ -8983,7 +9009,7 @@ ${context.fileChanges}`;
8983
9009
  systemPrompt,
8984
9010
  target: judgeProvider.targetName
8985
9011
  };
8986
- const { data } = await this.runWithRetry({
9012
+ const { data, tokenUsage } = await this.runWithRetry({
8987
9013
  context,
8988
9014
  judgeProvider,
8989
9015
  systemPrompt,
@@ -8999,7 +9025,8 @@ ${context.fileChanges}`;
8999
9025
  expectedAspectCount: rubrics.length,
9000
9026
  reasoning: data.overall_reasoning,
9001
9027
  evaluatorRawRequest,
9002
- details
9028
+ details,
9029
+ tokenUsage
9003
9030
  };
9004
9031
  }
9005
9032
  /**
@@ -9083,15 +9110,17 @@ ${context.fileChanges}`;
9083
9110
  try {
9084
9111
  const model = judgeProvider.asLanguageModel?.();
9085
9112
  if (model) {
9086
- const { text } = await generateText2({
9113
+ const result = await generateText2({
9087
9114
  model,
9088
9115
  system: systemPrompt,
9089
9116
  prompt: userPrompt,
9090
9117
  ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
9091
9118
  ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
9092
9119
  });
9093
- const data2 = schema.parse(parseJsonFromText(text));
9094
- return { data: data2 };
9120
+ const data2 = schema.parse(parseJsonFromText(result.text));
9121
+ const rawUsage = result.usage;
9122
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
9123
+ return { data: data2, tokenUsage };
9095
9124
  }
9096
9125
  const response = await judgeProvider.invoke({
9097
9126
  question: userPrompt,
@@ -9102,7 +9131,7 @@ ${context.fileChanges}`;
9102
9131
  temperature: this.temperature
9103
9132
  });
9104
9133
  const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
9105
- return { data, providerResponse: response };
9134
+ return { data, providerResponse: response, tokenUsage: response.tokenUsage };
9106
9135
  } catch (e) {
9107
9136
  lastError = e instanceof Error ? e : new Error(String(e));
9108
9137
  }
@@ -9308,7 +9337,8 @@ var CompositeEvaluator = class {
9308
9337
  reasoning: member.result.reasoning,
9309
9338
  evaluatorRawRequest: member.result.evaluatorRawRequest,
9310
9339
  scores: member.result.scores,
9311
- details: member.result.details
9340
+ details: member.result.details,
9341
+ tokenUsage: member.result.tokenUsage
9312
9342
  });
9313
9343
  }
9314
9344
  const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
@@ -9356,7 +9386,8 @@ var CompositeEvaluator = class {
9356
9386
  reasoning: member.result.reasoning,
9357
9387
  evaluatorRawRequest: member.result.evaluatorRawRequest,
9358
9388
  scores: member.result.scores,
9359
- details: member.result.details
9389
+ details: member.result.details,
9390
+ tokenUsage: member.result.tokenUsage
9360
9391
  });
9361
9392
  }
9362
9393
  const totalCount = results.length;
@@ -9535,7 +9566,7 @@ var CostEvaluator = class {
9535
9566
  }
9536
9567
  evaluate(context) {
9537
9568
  const { budget } = this.config;
9538
- const costUsd = context.trace?.costUsd;
9569
+ const costUsd = context.costUsd;
9539
9570
  if (costUsd === void 0) {
9540
9571
  return {
9541
9572
  score: 0,
@@ -9578,7 +9609,7 @@ var ExecutionMetricsEvaluator = class {
9578
9609
  this.config = options.config;
9579
9610
  }
9580
9611
  evaluate(context) {
9581
- const { trace } = context;
9612
+ const { trace, tokenUsage, costUsd, durationMs } = context;
9582
9613
  const {
9583
9614
  max_tool_calls,
9584
9615
  max_llm_calls,
@@ -9588,7 +9619,8 @@ var ExecutionMetricsEvaluator = class {
9588
9619
  target_exploration_ratio,
9589
9620
  exploration_tolerance = 0.2
9590
9621
  } = this.config;
9591
- if (!trace) {
9622
+ const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
9623
+ if (needsTrace && !trace) {
9592
9624
  return {
9593
9625
  score: 0,
9594
9626
  verdict: "fail",
@@ -9603,11 +9635,12 @@ var ExecutionMetricsEvaluator = class {
9603
9635
  }
9604
9636
  };
9605
9637
  }
9638
+ const narrowedTrace = trace;
9606
9639
  const hits = [];
9607
9640
  const misses = [];
9608
9641
  const actualMetrics = {};
9609
- if (max_tool_calls !== void 0) {
9610
- const toolCalls = trace.eventCount;
9642
+ if (max_tool_calls !== void 0 && narrowedTrace) {
9643
+ const toolCalls = narrowedTrace.eventCount;
9611
9644
  actualMetrics.tool_calls = toolCalls;
9612
9645
  if (toolCalls <= max_tool_calls) {
9613
9646
  hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
@@ -9615,8 +9648,8 @@ var ExecutionMetricsEvaluator = class {
9615
9648
  misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
9616
9649
  }
9617
9650
  }
9618
- if (max_llm_calls !== void 0) {
9619
- const llmCalls = trace.llmCallCount;
9651
+ if (max_llm_calls !== void 0 && narrowedTrace) {
9652
+ const llmCalls = narrowedTrace.llmCallCount;
9620
9653
  if (llmCalls === void 0) {
9621
9654
  misses.push("LLM call count data not available");
9622
9655
  } else {
@@ -9629,7 +9662,6 @@ var ExecutionMetricsEvaluator = class {
9629
9662
  }
9630
9663
  }
9631
9664
  if (max_tokens !== void 0) {
9632
- const tokenUsage = trace.tokenUsage;
9633
9665
  if (!tokenUsage) {
9634
9666
  misses.push("Token usage data not available");
9635
9667
  } else {
@@ -9643,7 +9675,6 @@ var ExecutionMetricsEvaluator = class {
9643
9675
  }
9644
9676
  }
9645
9677
  if (max_cost_usd !== void 0) {
9646
- const costUsd = trace.costUsd;
9647
9678
  if (costUsd === void 0) {
9648
9679
  misses.push("Cost data not available");
9649
9680
  } else {
@@ -9657,7 +9688,6 @@ var ExecutionMetricsEvaluator = class {
9657
9688
  }
9658
9689
  }
9659
9690
  if (max_duration_ms !== void 0) {
9660
- const durationMs = trace.durationMs;
9661
9691
  if (durationMs === void 0) {
9662
9692
  misses.push("Duration data not available");
9663
9693
  } else {
@@ -9669,8 +9699,8 @@ var ExecutionMetricsEvaluator = class {
9669
9699
  }
9670
9700
  }
9671
9701
  }
9672
- if (target_exploration_ratio !== void 0) {
9673
- const ratio = explorationRatio(trace);
9702
+ if (target_exploration_ratio !== void 0 && narrowedTrace) {
9703
+ const ratio = explorationRatio(narrowedTrace);
9674
9704
  if (ratio === void 0) {
9675
9705
  misses.push("Exploration ratio not available (no tool calls)");
9676
9706
  } else {
@@ -10184,7 +10214,7 @@ var LatencyEvaluator = class {
10184
10214
  }
10185
10215
  evaluate(context) {
10186
10216
  const { threshold } = this.config;
10187
- const durationMs = context.trace?.durationMs;
10217
+ const durationMs = context.durationMs;
10188
10218
  if (durationMs === void 0) {
10189
10219
  return {
10190
10220
  score: 0,
@@ -10829,7 +10859,7 @@ var TokenUsageEvaluator = class {
10829
10859
  this.config = options.config;
10830
10860
  }
10831
10861
  evaluate(context) {
10832
- const usage = context.trace?.tokenUsage;
10862
+ const usage = context.tokenUsage;
10833
10863
  const maxTotal = this.config.max_total;
10834
10864
  const maxInput = this.config.max_input;
10835
10865
  const maxOutput = this.config.max_output;
@@ -12522,7 +12552,7 @@ async function runEvaluation(options) {
12522
12552
  caseCost = trialCostSum;
12523
12553
  }
12524
12554
  } else {
12525
- caseCost = result.trace?.costUsd;
12555
+ caseCost = result.costUsd;
12526
12556
  }
12527
12557
  if (caseCost !== void 0) {
12528
12558
  cumulativeBudgetCost += caseCost;
@@ -12683,17 +12713,18 @@ async function runBatchEvaluation(options) {
12683
12713
  const providerResponse = batchResponse[i];
12684
12714
  const output = providerResponse.output;
12685
12715
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
12686
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
12687
- eventCount: 0,
12688
- toolNames: [],
12689
- toolCallsByName: {},
12690
- errorCount: 0
12691
- } : void 0;
12692
- const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
12716
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
12717
+ const merged = computed ? mergeExecutionMetrics(computed, {
12693
12718
  tokenUsage: providerResponse.tokenUsage,
12694
12719
  costUsd: providerResponse.costUsd,
12695
12720
  durationMs: providerResponse.durationMs
12696
12721
  }) : void 0;
12722
+ const trace = merged?.trace;
12723
+ const costUsd = merged?.costUsd;
12724
+ const durationMs = merged?.durationMs;
12725
+ const tokenUsage = merged?.tokenUsage;
12726
+ const startTime = merged?.startTime;
12727
+ const endTime = merged?.endTime;
12697
12728
  const candidate = extractLastAssistantContent(output);
12698
12729
  const providerError = extractProviderError(providerResponse);
12699
12730
  let result;
@@ -12712,6 +12743,11 @@ async function runBatchEvaluation(options) {
12712
12743
  agentTimeoutMs,
12713
12744
  output,
12714
12745
  trace,
12746
+ costUsd,
12747
+ durationMs,
12748
+ tokenUsage,
12749
+ startTime,
12750
+ endTime,
12715
12751
  targetResolver,
12716
12752
  availableTargets
12717
12753
  });
@@ -12948,17 +12984,18 @@ async function runEvalCase(options) {
12948
12984
  }
12949
12985
  const output = providerResponse.output;
12950
12986
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
12951
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
12952
- eventCount: 0,
12953
- toolNames: [],
12954
- toolCallsByName: {},
12955
- errorCount: 0
12956
- } : void 0;
12957
- const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
12987
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
12988
+ const merged = computed ? mergeExecutionMetrics(computed, {
12958
12989
  tokenUsage: providerResponse.tokenUsage,
12959
12990
  costUsd: providerResponse.costUsd,
12960
12991
  durationMs: providerResponse.durationMs
12961
12992
  }) : void 0;
12993
+ const trace = merged?.trace;
12994
+ const costUsd = merged?.costUsd;
12995
+ const durationMs = merged?.durationMs;
12996
+ const tokenUsage = merged?.tokenUsage;
12997
+ const startTime = merged?.startTime;
12998
+ const endTime = merged?.endTime;
12962
12999
  const candidate = extractLastAssistantContent(output);
12963
13000
  let fileChanges;
12964
13001
  if (baselineCommit && workspacePath) {
@@ -13003,6 +13040,11 @@ async function runEvalCase(options) {
13003
13040
  agentTimeoutMs,
13004
13041
  output,
13005
13042
  trace,
13043
+ costUsd,
13044
+ durationMs,
13045
+ tokenUsage,
13046
+ startTime,
13047
+ endTime,
13006
13048
  targetResolver,
13007
13049
  availableTargets,
13008
13050
  fileChanges,
@@ -13059,7 +13101,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
13059
13101
  };
13060
13102
  const result = await runEvalCase(trialOptions);
13061
13103
  allResults.push(result);
13062
- const trialCost = result.trace?.costUsd;
13104
+ const trialCost = result.costUsd;
13063
13105
  const trialVerdict = scoreToVerdict(result.score);
13064
13106
  const trial = {
13065
13107
  attempt,
@@ -13115,6 +13157,11 @@ async function evaluateCandidate(options) {
13115
13157
  agentTimeoutMs,
13116
13158
  output,
13117
13159
  trace,
13160
+ costUsd,
13161
+ durationMs,
13162
+ tokenUsage,
13163
+ startTime,
13164
+ endTime,
13118
13165
  targetResolver,
13119
13166
  availableTargets,
13120
13167
  fileChanges,
@@ -13135,6 +13182,11 @@ async function evaluateCandidate(options) {
13135
13182
  agentTimeoutMs,
13136
13183
  output,
13137
13184
  trace,
13185
+ costUsd,
13186
+ durationMs,
13187
+ tokenUsage,
13188
+ startTime,
13189
+ endTime,
13138
13190
  targetResolver,
13139
13191
  availableTargets,
13140
13192
  fileChanges,
@@ -13178,6 +13230,11 @@ async function evaluateCandidate(options) {
13178
13230
  answer: candidate,
13179
13231
  target: target.name,
13180
13232
  reasoning: score.reasoning,
13233
+ tokenUsage,
13234
+ costUsd,
13235
+ durationMs,
13236
+ startTime,
13237
+ endTime,
13181
13238
  requests,
13182
13239
  input,
13183
13240
  scores,
@@ -13201,6 +13258,11 @@ async function runEvaluatorsForCase(options) {
13201
13258
  agentTimeoutMs,
13202
13259
  output,
13203
13260
  trace,
13261
+ costUsd,
13262
+ durationMs,
13263
+ tokenUsage,
13264
+ startTime,
13265
+ endTime,
13204
13266
  targetResolver,
13205
13267
  availableTargets,
13206
13268
  fileChanges,
@@ -13222,6 +13284,11 @@ async function runEvaluatorsForCase(options) {
13222
13284
  agentTimeoutMs,
13223
13285
  output,
13224
13286
  trace,
13287
+ costUsd,
13288
+ durationMs,
13289
+ tokenUsage,
13290
+ startTime,
13291
+ endTime,
13225
13292
  targetResolver,
13226
13293
  availableTargets,
13227
13294
  fileChanges,
@@ -13244,6 +13311,11 @@ async function runEvaluatorsForCase(options) {
13244
13311
  judgeProvider,
13245
13312
  output,
13246
13313
  trace,
13314
+ tokenUsage,
13315
+ costUsd,
13316
+ durationMs,
13317
+ startTime,
13318
+ endTime,
13247
13319
  targetResolver,
13248
13320
  availableTargets,
13249
13321
  fileChanges,
@@ -13267,6 +13339,11 @@ async function runEvaluatorList(options) {
13267
13339
  agentTimeoutMs,
13268
13340
  output,
13269
13341
  trace,
13342
+ costUsd,
13343
+ durationMs,
13344
+ tokenUsage,
13345
+ startTime,
13346
+ endTime,
13270
13347
  targetResolver,
13271
13348
  availableTargets,
13272
13349
  fileChanges,
@@ -13285,6 +13362,11 @@ async function runEvaluatorList(options) {
13285
13362
  judgeProvider,
13286
13363
  output,
13287
13364
  trace,
13365
+ tokenUsage,
13366
+ costUsd,
13367
+ durationMs,
13368
+ startTime,
13369
+ endTime,
13288
13370
  targetResolver,
13289
13371
  availableTargets,
13290
13372
  fileChanges,
@@ -13324,7 +13406,8 @@ async function runEvaluatorList(options) {
13324
13406
  reasoning: score2.reasoning,
13325
13407
  evaluatorProviderRequest: score2.evaluatorRawRequest,
13326
13408
  details: score2.details,
13327
- scores: mapChildResults(score2.scores)
13409
+ scores: mapChildResults(score2.scores),
13410
+ tokenUsage: score2.tokenUsage
13328
13411
  });
13329
13412
  } catch (error) {
13330
13413
  const message = error instanceof Error ? error.message : String(error);
@@ -13572,7 +13655,8 @@ function mapChildResults(children) {
13572
13655
  reasoning: child.reasoning,
13573
13656
  evaluatorProviderRequest: child.evaluatorRawRequest,
13574
13657
  scores: mapChildResults(child.scores),
13575
- details: child.details
13658
+ details: child.details,
13659
+ tokenUsage: child.tokenUsage
13576
13660
  }));
13577
13661
  }
13578
13662
  function computeWeightedMean(entries) {
@@ -13952,7 +14036,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
13952
14036
  "beforeEachOutput",
13953
14037
  "afterAllOutput",
13954
14038
  "afterEachOutput",
13955
- "fileChanges"
14039
+ "fileChanges",
14040
+ // Promoted execution metrics (debug, not needed for regression comparison)
14041
+ "tokenUsage",
14042
+ "costUsd",
14043
+ "durationMs",
14044
+ "startTime",
14045
+ "endTime"
13956
14046
  ]);
13957
14047
  var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
13958
14048
  function trimEvaluatorResult(result) {
@@ -14075,8 +14165,8 @@ var OtelTraceExporter = class {
14075
14165
  const api = this.api;
14076
14166
  const tracer = this.tracer;
14077
14167
  const captureContent = this.options.captureContent ?? false;
14078
- const startHr = toHrTime(result.trace?.startTime ?? result.timestamp);
14079
- const endHr = toHrTime(result.trace?.endTime ?? result.timestamp);
14168
+ const startHr = toHrTime(result.startTime ?? result.timestamp);
14169
+ const endHr = toHrTime(result.endTime ?? result.timestamp);
14080
14170
  let parentCtx = api.ROOT_CONTEXT;
14081
14171
  const traceparent = process.env.TRACEPARENT;
14082
14172
  if (traceparent && this.W3CPropagator) {
@@ -14105,12 +14195,13 @@ var OtelTraceExporter = class {
14105
14195
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
14106
14196
  rootSpan.setAttribute("agentv.score", result.score);
14107
14197
  if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
14198
+ if (result.durationMs != null)
14199
+ rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
14200
+ if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
14108
14201
  if (result.trace) {
14109
14202
  const t = result.trace;
14110
14203
  rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
14111
14204
  rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
14112
- if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
14113
- if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
14114
14205
  if (t.llmCallCount != null)
14115
14206
  rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
14116
14207
  }