@agentv/core 2.7.1-next.6 → 2.9.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-5SV2QC6V.js";
20
+ } from "./chunk-7Q4PH265.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -83,14 +83,16 @@ function computeTraceSummary(messages) {
83
83
  }
84
84
  const toolNames = Object.keys(toolCallCounts).sort();
85
85
  return {
86
- eventCount: totalToolCalls,
87
- toolNames,
88
- toolCallsByName: toolCallCounts,
89
- errorCount: 0,
86
+ trace: {
87
+ eventCount: totalToolCalls,
88
+ toolNames,
89
+ toolCallsByName: toolCallCounts,
90
+ errorCount: 0,
91
+ llmCallCount,
92
+ ...hasAnyDuration ? { toolDurations } : {}
93
+ },
90
94
  startTime: earliestStart?.toISOString(),
91
- endTime: latestEnd?.toISOString(),
92
- llmCallCount,
93
- ...hasAnyDuration ? { toolDurations } : {}
95
+ endTime: latestEnd?.toISOString()
94
96
  };
95
97
  }
96
98
  var DEFAULT_EXPLORATION_TOOLS = [
@@ -113,9 +115,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
113
115
  );
114
116
  return explorationCalls / summary.eventCount;
115
117
  }
116
- function tokensPerTool(summary) {
117
- if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
118
- const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
118
+ function tokensPerTool(summary, tokenUsage) {
119
+ if (!tokenUsage || summary.eventCount === 0) return void 0;
120
+ const totalTokens = tokenUsage.input + tokenUsage.output;
119
121
  return totalTokens / summary.eventCount;
120
122
  }
121
123
  function avgToolDurationMs(summary) {
@@ -131,16 +133,15 @@ function avgToolDurationMs(summary) {
131
133
  if (totalCalls === 0) return void 0;
132
134
  return totalDuration / totalCalls;
133
135
  }
134
- function mergeExecutionMetrics(summary, metrics) {
135
- if (!metrics) return summary;
136
+ function mergeExecutionMetrics(computed, metrics) {
137
+ if (!metrics) return computed;
136
138
  return {
137
- ...summary,
139
+ trace: computed.trace,
138
140
  tokenUsage: metrics.tokenUsage,
139
141
  costUsd: metrics.costUsd,
140
142
  durationMs: metrics.durationMs,
141
- // Provider-level timing takes precedence over span-derived timing
142
- startTime: metrics.startTime ?? summary.startTime,
143
- endTime: metrics.endTime ?? summary.endTime
143
+ startTime: metrics.startTime ?? computed.startTime,
144
+ endTime: metrics.endTime ?? computed.endTime
144
145
  };
145
146
  }
146
147
 
@@ -538,6 +539,24 @@ function extractCacheConfig(suite) {
538
539
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
539
540
  return { enabled: cache, cachePath: resolvedCachePath };
540
541
  }
542
+ function extractTotalBudgetUsd(suite) {
543
+ const execution = suite.execution;
544
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
545
+ return void 0;
546
+ }
547
+ const executionObj = execution;
548
+ const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
549
+ if (rawBudget === void 0 || rawBudget === null) {
550
+ return void 0;
551
+ }
552
+ if (typeof rawBudget === "number" && rawBudget > 0) {
553
+ return rawBudget;
554
+ }
555
+ logWarning(
556
+ `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
557
+ );
558
+ return void 0;
559
+ }
541
560
  function logWarning(message) {
542
561
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
543
562
  }
@@ -2595,6 +2614,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
2595
2614
  trials: extractTrialsConfig(parsed),
2596
2615
  targets: extractTargetsFromSuite(parsed),
2597
2616
  cacheConfig: extractCacheConfig(parsed),
2617
+ totalBudgetUsd: extractTotalBudgetUsd(parsed),
2598
2618
  ...metadata !== void 0 && { metadata }
2599
2619
  };
2600
2620
  }
@@ -3078,10 +3098,13 @@ async function invokeModel(options) {
3078
3098
  }
3079
3099
  function mapResponse(result) {
3080
3100
  const content = result.text ?? "";
3101
+ const rawUsage = result.totalUsage ?? result.usage;
3102
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
3081
3103
  return {
3082
3104
  raw: result,
3083
- usage: toJsonObject(result.totalUsage ?? result.usage),
3084
- output: [{ role: "assistant", content }]
3105
+ usage: toJsonObject(rawUsage),
3106
+ output: [{ role: "assistant", content }],
3107
+ tokenUsage
3085
3108
  };
3086
3109
  }
3087
3110
  function toJsonObject(value) {
@@ -8374,6 +8397,8 @@ async function createTargetProxy(options) {
8374
8397
  const token = randomBytes(32).toString("hex");
8375
8398
  let callCount = 0;
8376
8399
  let isShutdown = false;
8400
+ let totalInputTokens = 0;
8401
+ let totalOutputTokens = 0;
8377
8402
  const targetsList = availableTargets ?? [defaultProvider.targetName];
8378
8403
  function resolveProvider(targetName) {
8379
8404
  if (targetName === void 0 || targetName === defaultProvider.targetName) {
@@ -8452,11 +8477,16 @@ async function createTargetProxy(options) {
8452
8477
  evalCaseId: request.evalCaseId ?? "proxy",
8453
8478
  attempt: request.attempt ?? 1
8454
8479
  });
8480
+ if (response.tokenUsage) {
8481
+ totalInputTokens += response.tokenUsage.input;
8482
+ totalOutputTokens += response.tokenUsage.output;
8483
+ }
8455
8484
  const output = response.output ?? [];
8456
8485
  const rawText = extractLastAssistantContent2(output);
8457
8486
  const result = {
8458
8487
  output,
8459
- rawText
8488
+ rawText,
8489
+ tokenUsage: response.tokenUsage
8460
8490
  };
8461
8491
  sendJson(res, 200, result);
8462
8492
  } catch (error) {
@@ -8503,10 +8533,15 @@ async function createTargetProxy(options) {
8503
8533
  evalCaseId: request.evalCaseId ?? "proxy",
8504
8534
  attempt: request.attempt ?? 1
8505
8535
  });
8536
+ if (response.tokenUsage) {
8537
+ totalInputTokens += response.tokenUsage.input;
8538
+ totalOutputTokens += response.tokenUsage.output;
8539
+ }
8506
8540
  const output = response.output ?? [];
8507
8541
  responses.push({
8508
8542
  output,
8509
- rawText: extractLastAssistantContent2(output)
8543
+ rawText: extractLastAssistantContent2(output),
8544
+ tokenUsage: response.tokenUsage
8510
8545
  });
8511
8546
  } catch (error) {
8512
8547
  const message = error instanceof Error ? error.message : String(error);
@@ -8545,7 +8580,8 @@ async function createTargetProxy(options) {
8545
8580
  },
8546
8581
  getUsageMetadata: () => ({
8547
8582
  callCount,
8548
- maxCalls
8583
+ maxCalls,
8584
+ tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
8549
8585
  })
8550
8586
  };
8551
8587
  }
@@ -8670,6 +8706,11 @@ var CodeEvaluator = class {
8670
8706
  ),
8671
8707
  input: context.evalCase.input,
8672
8708
  trace: context.trace ?? null,
8709
+ tokenUsage: context.tokenUsage ?? null,
8710
+ costUsd: context.costUsd ?? null,
8711
+ durationMs: context.durationMs ?? null,
8712
+ startTime: context.startTime ?? null,
8713
+ endTime: context.endTime ?? null,
8673
8714
  fileChanges: context.fileChanges ?? null,
8674
8715
  workspacePath: context.workspacePath ?? null,
8675
8716
  config: this.config ?? null
@@ -8728,7 +8769,8 @@ var CodeEvaluator = class {
8728
8769
  expectedAspectCount: hits.length + misses.length || 1,
8729
8770
  reasoning,
8730
8771
  evaluatorRawRequest,
8731
- ...details ? { details } : {}
8772
+ ...details ? { details } : {},
8773
+ tokenUsage: proxyUsage?.tokenUsage
8732
8774
  };
8733
8775
  } catch (error) {
8734
8776
  const message = error instanceof Error ? error.message : String(error);
@@ -8750,7 +8792,8 @@ var CodeEvaluator = class {
8750
8792
  }
8751
8793
  } : {},
8752
8794
  error: message
8753
- }
8795
+ },
8796
+ tokenUsage: proxyUsage?.tokenUsage
8754
8797
  };
8755
8798
  } finally {
8756
8799
  if (proxyShutdown) {
@@ -8885,7 +8928,7 @@ ${context.fileChanges}`;
8885
8928
  target: judgeProvider.targetName
8886
8929
  };
8887
8930
  try {
8888
- const { data } = await this.runWithRetry({
8931
+ const { data, tokenUsage } = await this.runWithRetry({
8889
8932
  context,
8890
8933
  judgeProvider,
8891
8934
  systemPrompt,
@@ -8904,7 +8947,8 @@ ${context.fileChanges}`;
8904
8947
  misses,
8905
8948
  expectedAspectCount,
8906
8949
  reasoning,
8907
- evaluatorRawRequest
8950
+ evaluatorRawRequest,
8951
+ tokenUsage
8908
8952
  };
8909
8953
  } catch {
8910
8954
  return {
@@ -8934,7 +8978,7 @@ ${context.fileChanges}`;
8934
8978
  systemPrompt,
8935
8979
  target: judgeProvider.targetName
8936
8980
  };
8937
- const { data } = await this.runWithRetry({
8981
+ const { data, tokenUsage } = await this.runWithRetry({
8938
8982
  context,
8939
8983
  judgeProvider,
8940
8984
  systemPrompt,
@@ -8949,7 +8993,8 @@ ${context.fileChanges}`;
8949
8993
  misses,
8950
8994
  expectedAspectCount: rubrics.length,
8951
8995
  reasoning: data.overall_reasoning,
8952
- evaluatorRawRequest
8996
+ evaluatorRawRequest,
8997
+ tokenUsage
8953
8998
  };
8954
8999
  }
8955
9000
  /**
@@ -8964,7 +9009,7 @@ ${context.fileChanges}`;
8964
9009
  systemPrompt,
8965
9010
  target: judgeProvider.targetName
8966
9011
  };
8967
- const { data } = await this.runWithRetry({
9012
+ const { data, tokenUsage } = await this.runWithRetry({
8968
9013
  context,
8969
9014
  judgeProvider,
8970
9015
  systemPrompt,
@@ -8980,7 +9025,8 @@ ${context.fileChanges}`;
8980
9025
  expectedAspectCount: rubrics.length,
8981
9026
  reasoning: data.overall_reasoning,
8982
9027
  evaluatorRawRequest,
8983
- details
9028
+ details,
9029
+ tokenUsage
8984
9030
  };
8985
9031
  }
8986
9032
  /**
@@ -9064,15 +9110,17 @@ ${context.fileChanges}`;
9064
9110
  try {
9065
9111
  const model = judgeProvider.asLanguageModel?.();
9066
9112
  if (model) {
9067
- const { text } = await generateText2({
9113
+ const result = await generateText2({
9068
9114
  model,
9069
9115
  system: systemPrompt,
9070
9116
  prompt: userPrompt,
9071
9117
  ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
9072
9118
  ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
9073
9119
  });
9074
- const data2 = schema.parse(parseJsonFromText(text));
9075
- return { data: data2 };
9120
+ const data2 = schema.parse(parseJsonFromText(result.text));
9121
+ const rawUsage = result.usage;
9122
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
9123
+ return { data: data2, tokenUsage };
9076
9124
  }
9077
9125
  const response = await judgeProvider.invoke({
9078
9126
  question: userPrompt,
@@ -9083,7 +9131,7 @@ ${context.fileChanges}`;
9083
9131
  temperature: this.temperature
9084
9132
  });
9085
9133
  const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
9086
- return { data, providerResponse: response };
9134
+ return { data, providerResponse: response, tokenUsage: response.tokenUsage };
9087
9135
  } catch (e) {
9088
9136
  lastError = e instanceof Error ? e : new Error(String(e));
9089
9137
  }
@@ -9289,7 +9337,8 @@ var CompositeEvaluator = class {
9289
9337
  reasoning: member.result.reasoning,
9290
9338
  evaluatorRawRequest: member.result.evaluatorRawRequest,
9291
9339
  scores: member.result.scores,
9292
- details: member.result.details
9340
+ details: member.result.details,
9341
+ tokenUsage: member.result.tokenUsage
9293
9342
  });
9294
9343
  }
9295
9344
  const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
@@ -9337,7 +9386,8 @@ var CompositeEvaluator = class {
9337
9386
  reasoning: member.result.reasoning,
9338
9387
  evaluatorRawRequest: member.result.evaluatorRawRequest,
9339
9388
  scores: member.result.scores,
9340
- details: member.result.details
9389
+ details: member.result.details,
9390
+ tokenUsage: member.result.tokenUsage
9341
9391
  });
9342
9392
  }
9343
9393
  const totalCount = results.length;
@@ -9516,7 +9566,7 @@ var CostEvaluator = class {
9516
9566
  }
9517
9567
  evaluate(context) {
9518
9568
  const { budget } = this.config;
9519
- const costUsd = context.trace?.costUsd;
9569
+ const costUsd = context.costUsd;
9520
9570
  if (costUsd === void 0) {
9521
9571
  return {
9522
9572
  score: 0,
@@ -9559,7 +9609,7 @@ var ExecutionMetricsEvaluator = class {
9559
9609
  this.config = options.config;
9560
9610
  }
9561
9611
  evaluate(context) {
9562
- const { trace } = context;
9612
+ const { trace, tokenUsage, costUsd, durationMs } = context;
9563
9613
  const {
9564
9614
  max_tool_calls,
9565
9615
  max_llm_calls,
@@ -9569,7 +9619,8 @@ var ExecutionMetricsEvaluator = class {
9569
9619
  target_exploration_ratio,
9570
9620
  exploration_tolerance = 0.2
9571
9621
  } = this.config;
9572
- if (!trace) {
9622
+ const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
9623
+ if (needsTrace && !trace) {
9573
9624
  return {
9574
9625
  score: 0,
9575
9626
  verdict: "fail",
@@ -9584,11 +9635,12 @@ var ExecutionMetricsEvaluator = class {
9584
9635
  }
9585
9636
  };
9586
9637
  }
9638
+ const narrowedTrace = trace;
9587
9639
  const hits = [];
9588
9640
  const misses = [];
9589
9641
  const actualMetrics = {};
9590
- if (max_tool_calls !== void 0) {
9591
- const toolCalls = trace.eventCount;
9642
+ if (max_tool_calls !== void 0 && narrowedTrace) {
9643
+ const toolCalls = narrowedTrace.eventCount;
9592
9644
  actualMetrics.tool_calls = toolCalls;
9593
9645
  if (toolCalls <= max_tool_calls) {
9594
9646
  hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
@@ -9596,8 +9648,8 @@ var ExecutionMetricsEvaluator = class {
9596
9648
  misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
9597
9649
  }
9598
9650
  }
9599
- if (max_llm_calls !== void 0) {
9600
- const llmCalls = trace.llmCallCount;
9651
+ if (max_llm_calls !== void 0 && narrowedTrace) {
9652
+ const llmCalls = narrowedTrace.llmCallCount;
9601
9653
  if (llmCalls === void 0) {
9602
9654
  misses.push("LLM call count data not available");
9603
9655
  } else {
@@ -9610,7 +9662,6 @@ var ExecutionMetricsEvaluator = class {
9610
9662
  }
9611
9663
  }
9612
9664
  if (max_tokens !== void 0) {
9613
- const tokenUsage = trace.tokenUsage;
9614
9665
  if (!tokenUsage) {
9615
9666
  misses.push("Token usage data not available");
9616
9667
  } else {
@@ -9624,7 +9675,6 @@ var ExecutionMetricsEvaluator = class {
9624
9675
  }
9625
9676
  }
9626
9677
  if (max_cost_usd !== void 0) {
9627
- const costUsd = trace.costUsd;
9628
9678
  if (costUsd === void 0) {
9629
9679
  misses.push("Cost data not available");
9630
9680
  } else {
@@ -9638,7 +9688,6 @@ var ExecutionMetricsEvaluator = class {
9638
9688
  }
9639
9689
  }
9640
9690
  if (max_duration_ms !== void 0) {
9641
- const durationMs = trace.durationMs;
9642
9691
  if (durationMs === void 0) {
9643
9692
  misses.push("Duration data not available");
9644
9693
  } else {
@@ -9650,8 +9699,8 @@ var ExecutionMetricsEvaluator = class {
9650
9699
  }
9651
9700
  }
9652
9701
  }
9653
- if (target_exploration_ratio !== void 0) {
9654
- const ratio = explorationRatio(trace);
9702
+ if (target_exploration_ratio !== void 0 && narrowedTrace) {
9703
+ const ratio = explorationRatio(narrowedTrace);
9655
9704
  if (ratio === void 0) {
9656
9705
  misses.push("Exploration ratio not available (no tool calls)");
9657
9706
  } else {
@@ -10165,7 +10214,7 @@ var LatencyEvaluator = class {
10165
10214
  }
10166
10215
  evaluate(context) {
10167
10216
  const { threshold } = this.config;
10168
- const durationMs = context.trace?.durationMs;
10217
+ const durationMs = context.durationMs;
10169
10218
  if (durationMs === void 0) {
10170
10219
  return {
10171
10220
  score: 0,
@@ -10810,7 +10859,7 @@ var TokenUsageEvaluator = class {
10810
10859
  this.config = options.config;
10811
10860
  }
10812
10861
  evaluate(context) {
10813
- const usage = context.trace?.tokenUsage;
10862
+ const usage = context.tokenUsage;
10814
10863
  const maxTotal = this.config.max_total;
10815
10864
  const maxInput = this.config.max_input;
10816
10865
  const maxOutput = this.config.max_output;
@@ -12255,7 +12304,8 @@ async function runEvaluation(options) {
12255
12304
  keepWorkspaces,
12256
12305
  cleanupWorkspaces,
12257
12306
  trials,
12258
- streamCallbacks
12307
+ streamCallbacks,
12308
+ totalBudgetUsd
12259
12309
  } = options;
12260
12310
  let useCache = options.useCache;
12261
12311
  if (trials && trials.count > 1 && useCache) {
@@ -12428,10 +12478,39 @@ async function runEvaluation(options) {
12428
12478
  let nextWorkerId = 1;
12429
12479
  const workerIdByEvalId = /* @__PURE__ */ new Map();
12430
12480
  let beforeAllOutputAttached = false;
12481
+ let cumulativeBudgetCost = 0;
12482
+ let budgetExhausted = false;
12431
12483
  const promises = filteredEvalCases.map(
12432
12484
  (evalCase) => limit(async () => {
12433
12485
  const workerId = nextWorkerId++;
12434
12486
  workerIdByEvalId.set(evalCase.id, workerId);
12487
+ if (totalBudgetUsd !== void 0 && budgetExhausted) {
12488
+ const budgetResult = {
12489
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
12490
+ testId: evalCase.id,
12491
+ dataset: evalCase.dataset,
12492
+ score: 0,
12493
+ hits: [],
12494
+ misses: [],
12495
+ answer: "",
12496
+ target: target.name,
12497
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
12498
+ budgetExceeded: true
12499
+ };
12500
+ if (onProgress) {
12501
+ await onProgress({
12502
+ workerId,
12503
+ testId: evalCase.id,
12504
+ status: "failed",
12505
+ completedAt: Date.now(),
12506
+ error: budgetResult.error
12507
+ });
12508
+ }
12509
+ if (onResult) {
12510
+ await onResult(budgetResult);
12511
+ }
12512
+ return budgetResult;
12513
+ }
12435
12514
  if (onProgress) {
12436
12515
  await onProgress({
12437
12516
  workerId,
@@ -12465,6 +12544,23 @@ async function runEvaluation(options) {
12465
12544
  typeRegistry
12466
12545
  };
12467
12546
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
12547
+ if (totalBudgetUsd !== void 0) {
12548
+ let caseCost;
12549
+ if (result.trials && result.trials.length > 0) {
12550
+ const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
12551
+ if (trialCostSum > 0) {
12552
+ caseCost = trialCostSum;
12553
+ }
12554
+ } else {
12555
+ caseCost = result.costUsd;
12556
+ }
12557
+ if (caseCost !== void 0) {
12558
+ cumulativeBudgetCost += caseCost;
12559
+ if (cumulativeBudgetCost >= totalBudgetUsd) {
12560
+ budgetExhausted = true;
12561
+ }
12562
+ }
12563
+ }
12468
12564
  if (beforeAllOutput && !beforeAllOutputAttached) {
12469
12565
  result = { ...result, beforeAllOutput };
12470
12566
  beforeAllOutputAttached = true;
@@ -12617,17 +12713,18 @@ async function runBatchEvaluation(options) {
12617
12713
  const providerResponse = batchResponse[i];
12618
12714
  const output = providerResponse.output;
12619
12715
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
12620
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
12621
- eventCount: 0,
12622
- toolNames: [],
12623
- toolCallsByName: {},
12624
- errorCount: 0
12625
- } : void 0;
12626
- const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
12716
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
12717
+ const merged = computed ? mergeExecutionMetrics(computed, {
12627
12718
  tokenUsage: providerResponse.tokenUsage,
12628
12719
  costUsd: providerResponse.costUsd,
12629
12720
  durationMs: providerResponse.durationMs
12630
12721
  }) : void 0;
12722
+ const trace = merged?.trace;
12723
+ const costUsd = merged?.costUsd;
12724
+ const durationMs = merged?.durationMs;
12725
+ const tokenUsage = merged?.tokenUsage;
12726
+ const startTime = merged?.startTime;
12727
+ const endTime = merged?.endTime;
12631
12728
  const candidate = extractLastAssistantContent(output);
12632
12729
  const providerError = extractProviderError(providerResponse);
12633
12730
  let result;
@@ -12646,6 +12743,11 @@ async function runBatchEvaluation(options) {
12646
12743
  agentTimeoutMs,
12647
12744
  output,
12648
12745
  trace,
12746
+ costUsd,
12747
+ durationMs,
12748
+ tokenUsage,
12749
+ startTime,
12750
+ endTime,
12649
12751
  targetResolver,
12650
12752
  availableTargets
12651
12753
  });
@@ -12882,17 +12984,18 @@ async function runEvalCase(options) {
12882
12984
  }
12883
12985
  const output = providerResponse.output;
12884
12986
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
12885
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
12886
- eventCount: 0,
12887
- toolNames: [],
12888
- toolCallsByName: {},
12889
- errorCount: 0
12890
- } : void 0;
12891
- const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
12987
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
12988
+ const merged = computed ? mergeExecutionMetrics(computed, {
12892
12989
  tokenUsage: providerResponse.tokenUsage,
12893
12990
  costUsd: providerResponse.costUsd,
12894
12991
  durationMs: providerResponse.durationMs
12895
12992
  }) : void 0;
12993
+ const trace = merged?.trace;
12994
+ const costUsd = merged?.costUsd;
12995
+ const durationMs = merged?.durationMs;
12996
+ const tokenUsage = merged?.tokenUsage;
12997
+ const startTime = merged?.startTime;
12998
+ const endTime = merged?.endTime;
12896
12999
  const candidate = extractLastAssistantContent(output);
12897
13000
  let fileChanges;
12898
13001
  if (baselineCommit && workspacePath) {
@@ -12937,6 +13040,11 @@ async function runEvalCase(options) {
12937
13040
  agentTimeoutMs,
12938
13041
  output,
12939
13042
  trace,
13043
+ costUsd,
13044
+ durationMs,
13045
+ tokenUsage,
13046
+ startTime,
13047
+ endTime,
12940
13048
  targetResolver,
12941
13049
  availableTargets,
12942
13050
  fileChanges,
@@ -12993,7 +13101,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
12993
13101
  };
12994
13102
  const result = await runEvalCase(trialOptions);
12995
13103
  allResults.push(result);
12996
- const trialCost = result.trace?.costUsd;
13104
+ const trialCost = result.costUsd;
12997
13105
  const trialVerdict = scoreToVerdict(result.score);
12998
13106
  const trial = {
12999
13107
  attempt,
@@ -13049,6 +13157,11 @@ async function evaluateCandidate(options) {
13049
13157
  agentTimeoutMs,
13050
13158
  output,
13051
13159
  trace,
13160
+ costUsd,
13161
+ durationMs,
13162
+ tokenUsage,
13163
+ startTime,
13164
+ endTime,
13052
13165
  targetResolver,
13053
13166
  availableTargets,
13054
13167
  fileChanges,
@@ -13069,6 +13182,11 @@ async function evaluateCandidate(options) {
13069
13182
  agentTimeoutMs,
13070
13183
  output,
13071
13184
  trace,
13185
+ costUsd,
13186
+ durationMs,
13187
+ tokenUsage,
13188
+ startTime,
13189
+ endTime,
13072
13190
  targetResolver,
13073
13191
  availableTargets,
13074
13192
  fileChanges,
@@ -13112,6 +13230,11 @@ async function evaluateCandidate(options) {
13112
13230
  answer: candidate,
13113
13231
  target: target.name,
13114
13232
  reasoning: score.reasoning,
13233
+ tokenUsage,
13234
+ costUsd,
13235
+ durationMs,
13236
+ startTime,
13237
+ endTime,
13115
13238
  requests,
13116
13239
  input,
13117
13240
  scores,
@@ -13135,6 +13258,11 @@ async function runEvaluatorsForCase(options) {
13135
13258
  agentTimeoutMs,
13136
13259
  output,
13137
13260
  trace,
13261
+ costUsd,
13262
+ durationMs,
13263
+ tokenUsage,
13264
+ startTime,
13265
+ endTime,
13138
13266
  targetResolver,
13139
13267
  availableTargets,
13140
13268
  fileChanges,
@@ -13156,6 +13284,11 @@ async function runEvaluatorsForCase(options) {
13156
13284
  agentTimeoutMs,
13157
13285
  output,
13158
13286
  trace,
13287
+ costUsd,
13288
+ durationMs,
13289
+ tokenUsage,
13290
+ startTime,
13291
+ endTime,
13159
13292
  targetResolver,
13160
13293
  availableTargets,
13161
13294
  fileChanges,
@@ -13178,6 +13311,11 @@ async function runEvaluatorsForCase(options) {
13178
13311
  judgeProvider,
13179
13312
  output,
13180
13313
  trace,
13314
+ tokenUsage,
13315
+ costUsd,
13316
+ durationMs,
13317
+ startTime,
13318
+ endTime,
13181
13319
  targetResolver,
13182
13320
  availableTargets,
13183
13321
  fileChanges,
@@ -13201,6 +13339,11 @@ async function runEvaluatorList(options) {
13201
13339
  agentTimeoutMs,
13202
13340
  output,
13203
13341
  trace,
13342
+ costUsd,
13343
+ durationMs,
13344
+ tokenUsage,
13345
+ startTime,
13346
+ endTime,
13204
13347
  targetResolver,
13205
13348
  availableTargets,
13206
13349
  fileChanges,
@@ -13219,6 +13362,11 @@ async function runEvaluatorList(options) {
13219
13362
  judgeProvider,
13220
13363
  output,
13221
13364
  trace,
13365
+ tokenUsage,
13366
+ costUsd,
13367
+ durationMs,
13368
+ startTime,
13369
+ endTime,
13222
13370
  targetResolver,
13223
13371
  availableTargets,
13224
13372
  fileChanges,
@@ -13258,7 +13406,8 @@ async function runEvaluatorList(options) {
13258
13406
  reasoning: score2.reasoning,
13259
13407
  evaluatorProviderRequest: score2.evaluatorRawRequest,
13260
13408
  details: score2.details,
13261
- scores: mapChildResults(score2.scores)
13409
+ scores: mapChildResults(score2.scores),
13410
+ tokenUsage: score2.tokenUsage
13262
13411
  });
13263
13412
  } catch (error) {
13264
13413
  const message = error instanceof Error ? error.message : String(error);
@@ -13506,7 +13655,8 @@ function mapChildResults(children) {
13506
13655
  reasoning: child.reasoning,
13507
13656
  evaluatorProviderRequest: child.evaluatorRawRequest,
13508
13657
  scores: mapChildResults(child.scores),
13509
- details: child.details
13658
+ details: child.details,
13659
+ tokenUsage: child.tokenUsage
13510
13660
  }));
13511
13661
  }
13512
13662
  function computeWeightedMean(entries) {
@@ -13886,7 +14036,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
13886
14036
  "beforeEachOutput",
13887
14037
  "afterAllOutput",
13888
14038
  "afterEachOutput",
13889
- "fileChanges"
14039
+ "fileChanges",
14040
+ // Promoted execution metrics (debug, not needed for regression comparison)
14041
+ "tokenUsage",
14042
+ "costUsd",
14043
+ "durationMs",
14044
+ "startTime",
14045
+ "endTime"
13890
14046
  ]);
13891
14047
  var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
13892
14048
  function trimEvaluatorResult(result) {
@@ -14009,8 +14165,8 @@ var OtelTraceExporter = class {
14009
14165
  const api = this.api;
14010
14166
  const tracer = this.tracer;
14011
14167
  const captureContent = this.options.captureContent ?? false;
14012
- const startHr = toHrTime(result.trace?.startTime ?? result.timestamp);
14013
- const endHr = toHrTime(result.trace?.endTime ?? result.timestamp);
14168
+ const startHr = toHrTime(result.startTime ?? result.timestamp);
14169
+ const endHr = toHrTime(result.endTime ?? result.timestamp);
14014
14170
  let parentCtx = api.ROOT_CONTEXT;
14015
14171
  const traceparent = process.env.TRACEPARENT;
14016
14172
  if (traceparent && this.W3CPropagator) {
@@ -14039,12 +14195,13 @@ var OtelTraceExporter = class {
14039
14195
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
14040
14196
  rootSpan.setAttribute("agentv.score", result.score);
14041
14197
  if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
14198
+ if (result.durationMs != null)
14199
+ rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
14200
+ if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
14042
14201
  if (result.trace) {
14043
14202
  const t = result.trace;
14044
14203
  rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
14045
14204
  rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
14046
- if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
14047
- if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
14048
14205
  if (t.llmCallCount != null)
14049
14206
  rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
14050
14207
  }