npm - @agentv/core - Versions diffs - 2.7.1-next.6 → 2.9.0-next.1 - Mend

@agentv/core 2.7.1-next.6 → 2.9.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-5SV2QC6V.js → chunk-7Q4PH265.js} +6 -18
package/dist/chunk-7Q4PH265.js.map +1 -0
package/dist/evaluation/validation/index.cjs +4 -11
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +2 -2
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +234 -89
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +54 -22
package/dist/index.d.ts +54 -22
package/dist/index.js +230 -73
package/dist/index.js.map +1 -1
package/package.json +6 -4
package/dist/chunk-5SV2QC6V.js.map +0 -1

package/dist/index.js (changed)

@@ -17,7 +17,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-5SV2QC6V.js";
+} from "./chunk-7Q4PH265.js";
 import {
   OtlpJsonFileExporter
 } from "./chunk-HFSYZHGF.js";
@@ -83,14 +83,16 @@ function computeTraceSummary(messages) {
   }
   const toolNames = Object.keys(toolCallCounts).sort();
   return {
-    eventCount: totalToolCalls,
-    toolNames,
-    toolCallsByName: toolCallCounts,
-    errorCount: 0,
+    trace: {
+      eventCount: totalToolCalls,
+      toolNames,
+      toolCallsByName: toolCallCounts,
+      errorCount: 0,
+      llmCallCount,
+      ...hasAnyDuration ? { toolDurations } : {}
+    },
     startTime: earliestStart?.toISOString(),
-    endTime: latestEnd?.toISOString(),
-    llmCallCount,
-    ...hasAnyDuration ? { toolDurations } : {}
+    endTime: latestEnd?.toISOString()
   };
 }
 var DEFAULT_EXPLORATION_TOOLS = [
@@ -113,9 +115,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
   );
   return explorationCalls / summary.eventCount;
 }
-function tokensPerTool(summary) {
-  if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
-  const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
+function tokensPerTool(summary, tokenUsage) {
+  if (!tokenUsage || summary.eventCount === 0) return void 0;
+  const totalTokens = tokenUsage.input + tokenUsage.output;
   return totalTokens / summary.eventCount;
 }
 function avgToolDurationMs(summary) {
@@ -131,16 +133,15 @@ function avgToolDurationMs(summary) {
   if (totalCalls === 0) return void 0;
   return totalDuration / totalCalls;
 }
-function mergeExecutionMetrics(summary, metrics) {
-  if (!metrics) return summary;
+function mergeExecutionMetrics(computed, metrics) {
+  if (!metrics) return computed;
   return {
-    ...summary,
+    trace: computed.trace,
     tokenUsage: metrics.tokenUsage,
     costUsd: metrics.costUsd,
     durationMs: metrics.durationMs,
-    // Provider-level timing takes precedence over span-derived timing
-    startTime: metrics.startTime ?? summary.startTime,
-    endTime: metrics.endTime ?? summary.endTime
+    startTime: metrics.startTime ?? computed.startTime,
+    endTime: metrics.endTime ?? computed.endTime
   };
 }
@@ -538,6 +539,24 @@ function extractCacheConfig(suite) {
   const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
   return { enabled: cache, cachePath: resolvedCachePath };
 }
+function extractTotalBudgetUsd(suite) {
+  const execution = suite.execution;
+  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
+    return void 0;
+  }
+  const executionObj = execution;
+  const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
+  if (rawBudget === void 0 || rawBudget === null) {
+    return void 0;
+  }
+  if (typeof rawBudget === "number" && rawBudget > 0) {
+    return rawBudget;
+  }
+  logWarning(
+    `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
+  );
+  return void 0;
+}
 function logWarning(message) {
   console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
 }
@@ -2595,6 +2614,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
     trials: extractTrialsConfig(parsed),
     targets: extractTargetsFromSuite(parsed),
     cacheConfig: extractCacheConfig(parsed),
+    totalBudgetUsd: extractTotalBudgetUsd(parsed),
     ...metadata !== void 0 && { metadata }
   };
 }
@@ -3078,10 +3098,13 @@ async function invokeModel(options) {
 }
 function mapResponse(result) {
   const content = result.text ?? "";
+  const rawUsage = result.totalUsage ?? result.usage;
+  const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
   return {
     raw: result,
-    usage: toJsonObject(result.totalUsage ?? result.usage),
-    output: [{ role: "assistant", content }]
+    usage: toJsonObject(rawUsage),
+    output: [{ role: "assistant", content }],
+    tokenUsage
   };
 }
 function toJsonObject(value) {
@@ -8374,6 +8397,8 @@ async function createTargetProxy(options) {
   const token = randomBytes(32).toString("hex");
   let callCount = 0;
   let isShutdown = false;
+  let totalInputTokens = 0;
+  let totalOutputTokens = 0;
   const targetsList = availableTargets ?? [defaultProvider.targetName];
   function resolveProvider(targetName) {
     if (targetName === void 0 || targetName === defaultProvider.targetName) {
@@ -8452,11 +8477,16 @@ async function createTargetProxy(options) {
         evalCaseId: request.evalCaseId ?? "proxy",
         attempt: request.attempt ?? 1
       });
+      if (response.tokenUsage) {
+        totalInputTokens += response.tokenUsage.input;
+        totalOutputTokens += response.tokenUsage.output;
+      }
       const output = response.output ?? [];
       const rawText = extractLastAssistantContent2(output);
       const result = {
         output,
-        rawText
+        rawText,
+        tokenUsage: response.tokenUsage
       };
       sendJson(res, 200, result);
     } catch (error) {
@@ -8503,10 +8533,15 @@ async function createTargetProxy(options) {
             evalCaseId: request.evalCaseId ?? "proxy",
             attempt: request.attempt ?? 1
           });
+          if (response.tokenUsage) {
+            totalInputTokens += response.tokenUsage.input;
+            totalOutputTokens += response.tokenUsage.output;
+          }
           const output = response.output ?? [];
           responses.push({
             output,
-            rawText: extractLastAssistantContent2(output)
+            rawText: extractLastAssistantContent2(output),
+            tokenUsage: response.tokenUsage
           });
         } catch (error) {
           const message = error instanceof Error ? error.message : String(error);
@@ -8545,7 +8580,8 @@ async function createTargetProxy(options) {
     },
     getUsageMetadata: () => ({
       callCount,
-      maxCalls
+      maxCalls,
+      tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
     })
   };
 }
@@ -8670,6 +8706,11 @@ var CodeEvaluator = class {
       ),
       input: context.evalCase.input,
       trace: context.trace ?? null,
+      tokenUsage: context.tokenUsage ?? null,
+      costUsd: context.costUsd ?? null,
+      durationMs: context.durationMs ?? null,
+      startTime: context.startTime ?? null,
+      endTime: context.endTime ?? null,
       fileChanges: context.fileChanges ?? null,
       workspacePath: context.workspacePath ?? null,
       config: this.config ?? null
@@ -8728,7 +8769,8 @@ var CodeEvaluator = class {
         expectedAspectCount: hits.length + misses.length || 1,
         reasoning,
         evaluatorRawRequest,
-        ...details ? { details } : {}
+        ...details ? { details } : {},
+        tokenUsage: proxyUsage?.tokenUsage
       };
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
@@ -8750,7 +8792,8 @@ var CodeEvaluator = class {
             }
           } : {},
           error: message
-        }
+        },
+        tokenUsage: proxyUsage?.tokenUsage
       };
     } finally {
       if (proxyShutdown) {
@@ -8885,7 +8928,7 @@ ${context.fileChanges}`;
       target: judgeProvider.targetName
     };
     try {
-      const { data } = await this.runWithRetry({
+      const { data, tokenUsage } = await this.runWithRetry({
         context,
         judgeProvider,
         systemPrompt,
@@ -8904,7 +8947,8 @@ ${context.fileChanges}`;
         misses,
         expectedAspectCount,
         reasoning,
-        evaluatorRawRequest
+        evaluatorRawRequest,
+        tokenUsage
       };
     } catch {
       return {
@@ -8934,7 +8978,7 @@ ${context.fileChanges}`;
       systemPrompt,
       target: judgeProvider.targetName
     };
-    const { data } = await this.runWithRetry({
+    const { data, tokenUsage } = await this.runWithRetry({
       context,
       judgeProvider,
       systemPrompt,
@@ -8949,7 +8993,8 @@ ${context.fileChanges}`;
       misses,
       expectedAspectCount: rubrics.length,
       reasoning: data.overall_reasoning,
-      evaluatorRawRequest
+      evaluatorRawRequest,
+      tokenUsage
     };
   }
   /**
@@ -8964,7 +9009,7 @@ ${context.fileChanges}`;
       systemPrompt,
       target: judgeProvider.targetName
     };
-    const { data } = await this.runWithRetry({
+    const { data, tokenUsage } = await this.runWithRetry({
       context,
       judgeProvider,
       systemPrompt,
@@ -8980,7 +9025,8 @@ ${context.fileChanges}`;
       expectedAspectCount: rubrics.length,
       reasoning: data.overall_reasoning,
       evaluatorRawRequest,
-      details
+      details,
+      tokenUsage
     };
   }
   /**
@@ -9064,15 +9110,17 @@ ${context.fileChanges}`;
       try {
         const model = judgeProvider.asLanguageModel?.();
         if (model) {
-          const { text } = await generateText2({
+          const result = await generateText2({
             model,
             system: systemPrompt,
             prompt: userPrompt,
             ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
             ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
           });
-          const data2 = schema.parse(parseJsonFromText(text));
-          return { data: data2 };
+          const data2 = schema.parse(parseJsonFromText(result.text));
+          const rawUsage = result.usage;
+          const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
+          return { data: data2, tokenUsage };
         }
         const response = await judgeProvider.invoke({
           question: userPrompt,
@@ -9083,7 +9131,7 @@ ${context.fileChanges}`;
           temperature: this.temperature
         });
         const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
-        return { data, providerResponse: response };
+        return { data, providerResponse: response, tokenUsage: response.tokenUsage };
       } catch (e) {
         lastError = e instanceof Error ? e : new Error(String(e));
       }
@@ -9289,7 +9337,8 @@ var CompositeEvaluator = class {
         reasoning: member.result.reasoning,
         evaluatorRawRequest: member.result.evaluatorRawRequest,
         scores: member.result.scores,
-        details: member.result.details
+        details: member.result.details,
+        tokenUsage: member.result.tokenUsage
       });
     }
     const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
@@ -9337,7 +9386,8 @@ var CompositeEvaluator = class {
         reasoning: member.result.reasoning,
         evaluatorRawRequest: member.result.evaluatorRawRequest,
         scores: member.result.scores,
-        details: member.result.details
+        details: member.result.details,
+        tokenUsage: member.result.tokenUsage
       });
     }
     const totalCount = results.length;
@@ -9516,7 +9566,7 @@ var CostEvaluator = class {
   }
   evaluate(context) {
     const { budget } = this.config;
-    const costUsd = context.trace?.costUsd;
+    const costUsd = context.costUsd;
     if (costUsd === void 0) {
       return {
         score: 0,
@@ -9559,7 +9609,7 @@ var ExecutionMetricsEvaluator = class {
     this.config = options.config;
   }
   evaluate(context) {
-    const { trace } = context;
+    const { trace, tokenUsage, costUsd, durationMs } = context;
     const {
       max_tool_calls,
       max_llm_calls,
@@ -9569,7 +9619,8 @@ var ExecutionMetricsEvaluator = class {
       target_exploration_ratio,
       exploration_tolerance = 0.2
     } = this.config;
-    if (!trace) {
+    const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
+    if (needsTrace && !trace) {
       return {
         score: 0,
         verdict: "fail",
@@ -9584,11 +9635,12 @@ var ExecutionMetricsEvaluator = class {
         }
       };
     }
+    const narrowedTrace = trace;
     const hits = [];
     const misses = [];
     const actualMetrics = {};
-    if (max_tool_calls !== void 0) {
-      const toolCalls = trace.eventCount;
+    if (max_tool_calls !== void 0 && narrowedTrace) {
+      const toolCalls = narrowedTrace.eventCount;
       actualMetrics.tool_calls = toolCalls;
       if (toolCalls <= max_tool_calls) {
         hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
@@ -9596,8 +9648,8 @@ var ExecutionMetricsEvaluator = class {
         misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
       }
     }
-    if (max_llm_calls !== void 0) {
-      const llmCalls = trace.llmCallCount;
+    if (max_llm_calls !== void 0 && narrowedTrace) {
+      const llmCalls = narrowedTrace.llmCallCount;
       if (llmCalls === void 0) {
         misses.push("LLM call count data not available");
       } else {
@@ -9610,7 +9662,6 @@ var ExecutionMetricsEvaluator = class {
       }
     }
     if (max_tokens !== void 0) {
-      const tokenUsage = trace.tokenUsage;
       if (!tokenUsage) {
         misses.push("Token usage data not available");
       } else {
@@ -9624,7 +9675,6 @@ var ExecutionMetricsEvaluator = class {
       }
     }
     if (max_cost_usd !== void 0) {
-      const costUsd = trace.costUsd;
       if (costUsd === void 0) {
         misses.push("Cost data not available");
       } else {
@@ -9638,7 +9688,6 @@ var ExecutionMetricsEvaluator = class {
       }
     }
     if (max_duration_ms !== void 0) {
-      const durationMs = trace.durationMs;
       if (durationMs === void 0) {
         misses.push("Duration data not available");
       } else {
@@ -9650,8 +9699,8 @@ var ExecutionMetricsEvaluator = class {
         }
       }
     }
-    if (target_exploration_ratio !== void 0) {
-      const ratio = explorationRatio(trace);
+    if (target_exploration_ratio !== void 0 && narrowedTrace) {
+      const ratio = explorationRatio(narrowedTrace);
       if (ratio === void 0) {
         misses.push("Exploration ratio not available (no tool calls)");
       } else {
@@ -10165,7 +10214,7 @@ var LatencyEvaluator = class {
   }
   evaluate(context) {
     const { threshold } = this.config;
-    const durationMs = context.trace?.durationMs;
+    const durationMs = context.durationMs;
     if (durationMs === void 0) {
       return {
         score: 0,
@@ -10810,7 +10859,7 @@ var TokenUsageEvaluator = class {
     this.config = options.config;
   }
   evaluate(context) {
-    const usage = context.trace?.tokenUsage;
+    const usage = context.tokenUsage;
     const maxTotal = this.config.max_total;
     const maxInput = this.config.max_input;
     const maxOutput = this.config.max_output;
@@ -12255,7 +12304,8 @@ async function runEvaluation(options) {
     keepWorkspaces,
     cleanupWorkspaces,
     trials,
-    streamCallbacks
+    streamCallbacks,
+    totalBudgetUsd
   } = options;
   let useCache = options.useCache;
   if (trials && trials.count > 1 && useCache) {
@@ -12428,10 +12478,39 @@ async function runEvaluation(options) {
   let nextWorkerId = 1;
   const workerIdByEvalId = /* @__PURE__ */ new Map();
   let beforeAllOutputAttached = false;
+  let cumulativeBudgetCost = 0;
+  let budgetExhausted = false;
   const promises = filteredEvalCases.map(
     (evalCase) => limit(async () => {
       const workerId = nextWorkerId++;
       workerIdByEvalId.set(evalCase.id, workerId);
+      if (totalBudgetUsd !== void 0 && budgetExhausted) {
+        const budgetResult = {
+          timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
+          testId: evalCase.id,
+          dataset: evalCase.dataset,
+          score: 0,
+          hits: [],
+          misses: [],
+          answer: "",
+          target: target.name,
+          error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
+          budgetExceeded: true
+        };
+        if (onProgress) {
+          await onProgress({
+            workerId,
+            testId: evalCase.id,
+            status: "failed",
+            completedAt: Date.now(),
+            error: budgetResult.error
+          });
+        }
+        if (onResult) {
+          await onResult(budgetResult);
+        }
+        return budgetResult;
+      }
       if (onProgress) {
         await onProgress({
           workerId,
@@ -12465,6 +12544,23 @@ async function runEvaluation(options) {
           typeRegistry
         };
         let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
+        if (totalBudgetUsd !== void 0) {
+          let caseCost;
+          if (result.trials && result.trials.length > 0) {
+            const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
+            if (trialCostSum > 0) {
+              caseCost = trialCostSum;
+            }
+          } else {
+            caseCost = result.costUsd;
+          }
+          if (caseCost !== void 0) {
+            cumulativeBudgetCost += caseCost;
+            if (cumulativeBudgetCost >= totalBudgetUsd) {
+              budgetExhausted = true;
+            }
+          }
+        }
         if (beforeAllOutput && !beforeAllOutputAttached) {
           result = { ...result, beforeAllOutput };
           beforeAllOutputAttached = true;
@@ -12617,17 +12713,18 @@ async function runBatchEvaluation(options) {
     const providerResponse = batchResponse[i];
     const output = providerResponse.output;
     const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
-    const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
-      eventCount: 0,
-      toolNames: [],
-      toolCallsByName: {},
-      errorCount: 0
-    } : void 0;
-    const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
+    const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
+    const merged = computed ? mergeExecutionMetrics(computed, {
       tokenUsage: providerResponse.tokenUsage,
       costUsd: providerResponse.costUsd,
       durationMs: providerResponse.durationMs
     }) : void 0;
+    const trace = merged?.trace;
+    const costUsd = merged?.costUsd;
+    const durationMs = merged?.durationMs;
+    const tokenUsage = merged?.tokenUsage;
+    const startTime = merged?.startTime;
+    const endTime = merged?.endTime;
     const candidate = extractLastAssistantContent(output);
     const providerError = extractProviderError(providerResponse);
     let result;
@@ -12646,6 +12743,11 @@ async function runBatchEvaluation(options) {
         agentTimeoutMs,
         output,
         trace,
+        costUsd,
+        durationMs,
+        tokenUsage,
+        startTime,
+        endTime,
         targetResolver,
         availableTargets
       });
@@ -12882,17 +12984,18 @@ async function runEvalCase(options) {
   }
   const output = providerResponse.output;
   const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
-  const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
-    eventCount: 0,
-    toolNames: [],
-    toolCallsByName: {},
-    errorCount: 0
-  } : void 0;
-  const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
+  const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
+  const merged = computed ? mergeExecutionMetrics(computed, {
     tokenUsage: providerResponse.tokenUsage,
     costUsd: providerResponse.costUsd,
     durationMs: providerResponse.durationMs
   }) : void 0;
+  const trace = merged?.trace;
+  const costUsd = merged?.costUsd;
+  const durationMs = merged?.durationMs;
+  const tokenUsage = merged?.tokenUsage;
+  const startTime = merged?.startTime;
+  const endTime = merged?.endTime;
   const candidate = extractLastAssistantContent(output);
   let fileChanges;
   if (baselineCommit && workspacePath) {
@@ -12937,6 +13040,11 @@ async function runEvalCase(options) {
       agentTimeoutMs,
       output,
       trace,
+      costUsd,
+      durationMs,
+      tokenUsage,
+      startTime,
+      endTime,
       targetResolver,
       availableTargets,
       fileChanges,
@@ -12993,7 +13101,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
     };
     const result = await runEvalCase(trialOptions);
     allResults.push(result);
-    const trialCost = result.trace?.costUsd;
+    const trialCost = result.costUsd;
     const trialVerdict = scoreToVerdict(result.score);
     const trial = {
       attempt,
@@ -13049,6 +13157,11 @@ async function evaluateCandidate(options) {
     agentTimeoutMs,
     output,
     trace,
+    costUsd,
+    durationMs,
+    tokenUsage,
+    startTime,
+    endTime,
     targetResolver,
     availableTargets,
     fileChanges,
@@ -13069,6 +13182,11 @@ async function evaluateCandidate(options) {
     agentTimeoutMs,
     output,
     trace,
+    costUsd,
+    durationMs,
+    tokenUsage,
+    startTime,
+    endTime,
     targetResolver,
     availableTargets,
     fileChanges,
@@ -13112,6 +13230,11 @@ async function evaluateCandidate(options) {
     answer: candidate,
     target: target.name,
     reasoning: score.reasoning,
+    tokenUsage,
+    costUsd,
+    durationMs,
+    startTime,
+    endTime,
     requests,
     input,
     scores,
@@ -13135,6 +13258,11 @@ async function runEvaluatorsForCase(options) {
     agentTimeoutMs,
     output,
     trace,
+    costUsd,
+    durationMs,
+    tokenUsage,
+    startTime,
+    endTime,
     targetResolver,
     availableTargets,
     fileChanges,
@@ -13156,6 +13284,11 @@ async function runEvaluatorsForCase(options) {
       agentTimeoutMs,
       output,
       trace,
+      costUsd,
+      durationMs,
+      tokenUsage,
+      startTime,
+      endTime,
       targetResolver,
       availableTargets,
       fileChanges,
@@ -13178,6 +13311,11 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     output,
     trace,
+    tokenUsage,
+    costUsd,
+    durationMs,
+    startTime,
+    endTime,
     targetResolver,
     availableTargets,
     fileChanges,
@@ -13201,6 +13339,11 @@ async function runEvaluatorList(options) {
     agentTimeoutMs,
     output,
     trace,
+    costUsd,
+    durationMs,
+    tokenUsage,
+    startTime,
+    endTime,
     targetResolver,
     availableTargets,
     fileChanges,
@@ -13219,6 +13362,11 @@ async function runEvaluatorList(options) {
     judgeProvider,
     output,
     trace,
+    tokenUsage,
+    costUsd,
+    durationMs,
+    startTime,
+    endTime,
     targetResolver,
     availableTargets,
     fileChanges,
@@ -13258,7 +13406,8 @@ async function runEvaluatorList(options) {
         reasoning: score2.reasoning,
         evaluatorProviderRequest: score2.evaluatorRawRequest,
         details: score2.details,
-        scores: mapChildResults(score2.scores)
+        scores: mapChildResults(score2.scores),
+        tokenUsage: score2.tokenUsage
       });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
@@ -13506,7 +13655,8 @@ function mapChildResults(children) {
     reasoning: child.reasoning,
     evaluatorProviderRequest: child.evaluatorRawRequest,
     scores: mapChildResults(child.scores),
-    details: child.details
+    details: child.details,
+    tokenUsage: child.tokenUsage
   }));
 }
 function computeWeightedMean(entries) {
@@ -13886,7 +14036,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
   "beforeEachOutput",
   "afterAllOutput",
   "afterEachOutput",
-  "fileChanges"
+  "fileChanges",
+  // Promoted execution metrics (debug, not needed for regression comparison)
+  "tokenUsage",
+  "costUsd",
+  "durationMs",
+  "startTime",
+  "endTime"
 ]);
 var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
 function trimEvaluatorResult(result) {
@@ -14009,8 +14165,8 @@ var OtelTraceExporter = class {
     const api = this.api;
     const tracer = this.tracer;
     const captureContent = this.options.captureContent ?? false;
-    const startHr = toHrTime(result.trace?.startTime ?? result.timestamp);
-    const endHr = toHrTime(result.trace?.endTime ?? result.timestamp);
+    const startHr = toHrTime(result.startTime ?? result.timestamp);
+    const endHr = toHrTime(result.endTime ?? result.timestamp);
     let parentCtx = api.ROOT_CONTEXT;
     const traceparent = process.env.TRACEPARENT;
     if (traceparent && this.W3CPropagator) {
@@ -14039,12 +14195,13 @@ var OtelTraceExporter = class {
         if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
         rootSpan.setAttribute("agentv.score", result.score);
         if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
+        if (result.durationMs != null)
+          rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
+        if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
         if (result.trace) {
           const t = result.trace;
           rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
           rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
-          if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
-          if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
           if (t.llmCallCount != null)
             rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
         }