@agentv/core 2.7.1-next.6 → 2.9.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1686,14 +1686,16 @@ function computeTraceSummary(messages) {
1686
1686
  }
1687
1687
  const toolNames = Object.keys(toolCallCounts).sort();
1688
1688
  return {
1689
- eventCount: totalToolCalls,
1690
- toolNames,
1691
- toolCallsByName: toolCallCounts,
1692
- errorCount: 0,
1689
+ trace: {
1690
+ eventCount: totalToolCalls,
1691
+ toolNames,
1692
+ toolCallsByName: toolCallCounts,
1693
+ errorCount: 0,
1694
+ llmCallCount,
1695
+ ...hasAnyDuration ? { toolDurations } : {}
1696
+ },
1693
1697
  startTime: earliestStart?.toISOString(),
1694
- endTime: latestEnd?.toISOString(),
1695
- llmCallCount,
1696
- ...hasAnyDuration ? { toolDurations } : {}
1698
+ endTime: latestEnd?.toISOString()
1697
1699
  };
1698
1700
  }
1699
1701
  var DEFAULT_EXPLORATION_TOOLS = [
@@ -1716,9 +1718,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
1716
1718
  );
1717
1719
  return explorationCalls / summary.eventCount;
1718
1720
  }
1719
- function tokensPerTool(summary) {
1720
- if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
1721
- const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
1721
+ function tokensPerTool(summary, tokenUsage) {
1722
+ if (!tokenUsage || summary.eventCount === 0) return void 0;
1723
+ const totalTokens = tokenUsage.input + tokenUsage.output;
1722
1724
  return totalTokens / summary.eventCount;
1723
1725
  }
1724
1726
  function avgToolDurationMs(summary) {
@@ -1734,16 +1736,15 @@ function avgToolDurationMs(summary) {
1734
1736
  if (totalCalls === 0) return void 0;
1735
1737
  return totalDuration / totalCalls;
1736
1738
  }
1737
- function mergeExecutionMetrics(summary, metrics) {
1738
- if (!metrics) return summary;
1739
+ function mergeExecutionMetrics(computed, metrics) {
1740
+ if (!metrics) return computed;
1739
1741
  return {
1740
- ...summary,
1742
+ trace: computed.trace,
1741
1743
  tokenUsage: metrics.tokenUsage,
1742
1744
  costUsd: metrics.costUsd,
1743
1745
  durationMs: metrics.durationMs,
1744
- // Provider-level timing takes precedence over span-derived timing
1745
- startTime: metrics.startTime ?? summary.startTime,
1746
- endTime: metrics.endTime ?? summary.endTime
1746
+ startTime: metrics.startTime ?? computed.startTime,
1747
+ endTime: metrics.endTime ?? computed.endTime
1747
1748
  };
1748
1749
  }
1749
1750
 
@@ -2141,6 +2142,24 @@ function extractCacheConfig(suite) {
2141
2142
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
2142
2143
  return { enabled: cache, cachePath: resolvedCachePath };
2143
2144
  }
2145
+ function extractTotalBudgetUsd(suite) {
2146
+ const execution = suite.execution;
2147
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
2148
+ return void 0;
2149
+ }
2150
+ const executionObj = execution;
2151
+ const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
2152
+ if (rawBudget === void 0 || rawBudget === null) {
2153
+ return void 0;
2154
+ }
2155
+ if (typeof rawBudget === "number" && rawBudget > 0) {
2156
+ return rawBudget;
2157
+ }
2158
+ logWarning(
2159
+ `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
2160
+ );
2161
+ return void 0;
2162
+ }
2144
2163
  function logWarning(message) {
2145
2164
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
2146
2165
  }
@@ -4198,6 +4217,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4198
4217
  trials: extractTrialsConfig(parsed),
4199
4218
  targets: extractTargetsFromSuite(parsed),
4200
4219
  cacheConfig: extractCacheConfig(parsed),
4220
+ totalBudgetUsd: extractTotalBudgetUsd(parsed),
4201
4221
  ...metadata !== void 0 && { metadata }
4202
4222
  };
4203
4223
  }
@@ -4796,10 +4816,13 @@ async function invokeModel(options) {
4796
4816
  }
4797
4817
  function mapResponse(result) {
4798
4818
  const content = result.text ?? "";
4819
+ const rawUsage = result.totalUsage ?? result.usage;
4820
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
4799
4821
  return {
4800
4822
  raw: result,
4801
- usage: toJsonObject(result.totalUsage ?? result.usage),
4802
- output: [{ role: "assistant", content }]
4823
+ usage: toJsonObject(rawUsage),
4824
+ output: [{ role: "assistant", content }],
4825
+ tokenUsage
4803
4826
  };
4804
4827
  }
4805
4828
  function toJsonObject(value) {
@@ -8335,10 +8358,8 @@ var CliHealthcheckInputSchema = import_zod3.z.union([
8335
8358
  var CliTargetInputSchema = import_zod3.z.object({
8336
8359
  name: import_zod3.z.string().min(1, "target name is required"),
8337
8360
  provider: import_zod3.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
8338
- // Command - required (accept both naming conventions)
8339
- command: import_zod3.z.string().optional(),
8340
- command_template: import_zod3.z.string().optional(),
8341
- commandTemplate: import_zod3.z.string().optional(),
8361
+ // Command - required
8362
+ command: import_zod3.z.string(),
8342
8363
  // Files format - optional
8343
8364
  files_format: import_zod3.z.string().optional(),
8344
8365
  filesFormat: import_zod3.z.string().optional(),
@@ -8368,12 +8389,7 @@ var CliTargetInputSchema = import_zod3.z.object({
8368
8389
  workers: import_zod3.z.number().int().min(1).optional(),
8369
8390
  provider_batching: import_zod3.z.boolean().optional(),
8370
8391
  providerBatching: import_zod3.z.boolean().optional()
8371
- }).refine(
8372
- (data) => data.command !== void 0 || data.command_template !== void 0 || data.commandTemplate !== void 0,
8373
- {
8374
- message: "'command' is required"
8375
- }
8376
- );
8392
+ });
8377
8393
  var CliHealthcheckHttpSchema = import_zod3.z.object({
8378
8394
  url: import_zod3.z.string().min(1),
8379
8395
  timeoutMs: import_zod3.z.number().positive().optional()
@@ -8431,11 +8447,7 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
8431
8447
  }
8432
8448
  function normalizeCliTargetInput(input, env, evalFilePath) {
8433
8449
  const targetName = input.name;
8434
- const commandSource = input.command ?? input.command_template ?? input.commandTemplate;
8435
- if (commandSource === void 0) {
8436
- throw new Error(`${targetName}: 'command' is required`);
8437
- }
8438
- const command = resolveString(commandSource, env, `${targetName} CLI command`, true);
8450
+ const command = resolveString(input.command, env, `${targetName} CLI command`, true);
8439
8451
  const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
8440
8452
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
8441
8453
  const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
@@ -9228,8 +9240,7 @@ function resolveCliConfig(target, env, evalFilePath) {
9228
9240
  return normalized;
9229
9241
  }
9230
9242
  function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
9231
- const commandSource = target.command ?? target.command_template ?? target.commandTemplate;
9232
- const command = commandSource ? resolveString(commandSource, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
9243
+ const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
9233
9244
  const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
9234
9245
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
9235
9246
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
@@ -11201,6 +11212,8 @@ async function createTargetProxy(options) {
11201
11212
  const token = (0, import_node_crypto7.randomBytes)(32).toString("hex");
11202
11213
  let callCount = 0;
11203
11214
  let isShutdown = false;
11215
+ let totalInputTokens = 0;
11216
+ let totalOutputTokens = 0;
11204
11217
  const targetsList = availableTargets ?? [defaultProvider.targetName];
11205
11218
  function resolveProvider(targetName) {
11206
11219
  if (targetName === void 0 || targetName === defaultProvider.targetName) {
@@ -11279,11 +11292,16 @@ async function createTargetProxy(options) {
11279
11292
  evalCaseId: request.evalCaseId ?? "proxy",
11280
11293
  attempt: request.attempt ?? 1
11281
11294
  });
11295
+ if (response.tokenUsage) {
11296
+ totalInputTokens += response.tokenUsage.input;
11297
+ totalOutputTokens += response.tokenUsage.output;
11298
+ }
11282
11299
  const output = response.output ?? [];
11283
11300
  const rawText = extractLastAssistantContent(output);
11284
11301
  const result = {
11285
11302
  output,
11286
- rawText
11303
+ rawText,
11304
+ tokenUsage: response.tokenUsage
11287
11305
  };
11288
11306
  sendJson(res, 200, result);
11289
11307
  } catch (error) {
@@ -11330,10 +11348,15 @@ async function createTargetProxy(options) {
11330
11348
  evalCaseId: request.evalCaseId ?? "proxy",
11331
11349
  attempt: request.attempt ?? 1
11332
11350
  });
11351
+ if (response.tokenUsage) {
11352
+ totalInputTokens += response.tokenUsage.input;
11353
+ totalOutputTokens += response.tokenUsage.output;
11354
+ }
11333
11355
  const output = response.output ?? [];
11334
11356
  responses.push({
11335
11357
  output,
11336
- rawText: extractLastAssistantContent(output)
11358
+ rawText: extractLastAssistantContent(output),
11359
+ tokenUsage: response.tokenUsage
11337
11360
  });
11338
11361
  } catch (error) {
11339
11362
  const message = error instanceof Error ? error.message : String(error);
@@ -11372,7 +11395,8 @@ async function createTargetProxy(options) {
11372
11395
  },
11373
11396
  getUsageMetadata: () => ({
11374
11397
  callCount,
11375
- maxCalls
11398
+ maxCalls,
11399
+ tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
11376
11400
  })
11377
11401
  };
11378
11402
  }
@@ -11497,6 +11521,11 @@ var CodeEvaluator = class {
11497
11521
  ),
11498
11522
  input: context2.evalCase.input,
11499
11523
  trace: context2.trace ?? null,
11524
+ tokenUsage: context2.tokenUsage ?? null,
11525
+ costUsd: context2.costUsd ?? null,
11526
+ durationMs: context2.durationMs ?? null,
11527
+ startTime: context2.startTime ?? null,
11528
+ endTime: context2.endTime ?? null,
11500
11529
  fileChanges: context2.fileChanges ?? null,
11501
11530
  workspacePath: context2.workspacePath ?? null,
11502
11531
  config: this.config ?? null
@@ -11555,7 +11584,8 @@ var CodeEvaluator = class {
11555
11584
  expectedAspectCount: hits.length + misses.length || 1,
11556
11585
  reasoning,
11557
11586
  evaluatorRawRequest,
11558
- ...details ? { details } : {}
11587
+ ...details ? { details } : {},
11588
+ tokenUsage: proxyUsage?.tokenUsage
11559
11589
  };
11560
11590
  } catch (error) {
11561
11591
  const message = error instanceof Error ? error.message : String(error);
@@ -11577,7 +11607,8 @@ var CodeEvaluator = class {
11577
11607
  }
11578
11608
  } : {},
11579
11609
  error: message
11580
- }
11610
+ },
11611
+ tokenUsage: proxyUsage?.tokenUsage
11581
11612
  };
11582
11613
  } finally {
11583
11614
  if (proxyShutdown) {
@@ -11741,7 +11772,7 @@ ${context2.fileChanges}`;
11741
11772
  target: judgeProvider.targetName
11742
11773
  };
11743
11774
  try {
11744
- const { data } = await this.runWithRetry({
11775
+ const { data, tokenUsage } = await this.runWithRetry({
11745
11776
  context: context2,
11746
11777
  judgeProvider,
11747
11778
  systemPrompt,
@@ -11760,7 +11791,8 @@ ${context2.fileChanges}`;
11760
11791
  misses,
11761
11792
  expectedAspectCount,
11762
11793
  reasoning,
11763
- evaluatorRawRequest
11794
+ evaluatorRawRequest,
11795
+ tokenUsage
11764
11796
  };
11765
11797
  } catch {
11766
11798
  return {
@@ -11790,7 +11822,7 @@ ${context2.fileChanges}`;
11790
11822
  systemPrompt,
11791
11823
  target: judgeProvider.targetName
11792
11824
  };
11793
- const { data } = await this.runWithRetry({
11825
+ const { data, tokenUsage } = await this.runWithRetry({
11794
11826
  context: context2,
11795
11827
  judgeProvider,
11796
11828
  systemPrompt,
@@ -11805,7 +11837,8 @@ ${context2.fileChanges}`;
11805
11837
  misses,
11806
11838
  expectedAspectCount: rubrics.length,
11807
11839
  reasoning: data.overall_reasoning,
11808
- evaluatorRawRequest
11840
+ evaluatorRawRequest,
11841
+ tokenUsage
11809
11842
  };
11810
11843
  }
11811
11844
  /**
@@ -11820,7 +11853,7 @@ ${context2.fileChanges}`;
11820
11853
  systemPrompt,
11821
11854
  target: judgeProvider.targetName
11822
11855
  };
11823
- const { data } = await this.runWithRetry({
11856
+ const { data, tokenUsage } = await this.runWithRetry({
11824
11857
  context: context2,
11825
11858
  judgeProvider,
11826
11859
  systemPrompt,
@@ -11836,7 +11869,8 @@ ${context2.fileChanges}`;
11836
11869
  expectedAspectCount: rubrics.length,
11837
11870
  reasoning: data.overall_reasoning,
11838
11871
  evaluatorRawRequest,
11839
- details
11872
+ details,
11873
+ tokenUsage
11840
11874
  };
11841
11875
  }
11842
11876
  /**
@@ -11920,15 +11954,17 @@ ${context2.fileChanges}`;
11920
11954
  try {
11921
11955
  const model = judgeProvider.asLanguageModel?.();
11922
11956
  if (model) {
11923
- const { text } = await (0, import_ai2.generateText)({
11957
+ const result = await (0, import_ai2.generateText)({
11924
11958
  model,
11925
11959
  system: systemPrompt,
11926
11960
  prompt: userPrompt,
11927
11961
  ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11928
11962
  ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11929
11963
  });
11930
- const data2 = schema.parse(parseJsonFromText(text));
11931
- return { data: data2 };
11964
+ const data2 = schema.parse(parseJsonFromText(result.text));
11965
+ const rawUsage = result.usage;
11966
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
11967
+ return { data: data2, tokenUsage };
11932
11968
  }
11933
11969
  const response = await judgeProvider.invoke({
11934
11970
  question: userPrompt,
@@ -11939,7 +11975,7 @@ ${context2.fileChanges}`;
11939
11975
  temperature: this.temperature
11940
11976
  });
11941
11977
  const data = schema.parse(parseJsonFromText(extractLastAssistantContent2(response.output)));
11942
- return { data, providerResponse: response };
11978
+ return { data, providerResponse: response, tokenUsage: response.tokenUsage };
11943
11979
  } catch (e) {
11944
11980
  lastError = e instanceof Error ? e : new Error(String(e));
11945
11981
  }
@@ -12145,7 +12181,8 @@ var CompositeEvaluator = class {
12145
12181
  reasoning: member.result.reasoning,
12146
12182
  evaluatorRawRequest: member.result.evaluatorRawRequest,
12147
12183
  scores: member.result.scores,
12148
- details: member.result.details
12184
+ details: member.result.details,
12185
+ tokenUsage: member.result.tokenUsage
12149
12186
  });
12150
12187
  }
12151
12188
  const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
@@ -12193,7 +12230,8 @@ var CompositeEvaluator = class {
12193
12230
  reasoning: member.result.reasoning,
12194
12231
  evaluatorRawRequest: member.result.evaluatorRawRequest,
12195
12232
  scores: member.result.scores,
12196
- details: member.result.details
12233
+ details: member.result.details,
12234
+ tokenUsage: member.result.tokenUsage
12197
12235
  });
12198
12236
  }
12199
12237
  const totalCount = results.length;
@@ -12372,7 +12410,7 @@ var CostEvaluator = class {
12372
12410
  }
12373
12411
  evaluate(context2) {
12374
12412
  const { budget } = this.config;
12375
- const costUsd = context2.trace?.costUsd;
12413
+ const costUsd = context2.costUsd;
12376
12414
  if (costUsd === void 0) {
12377
12415
  return {
12378
12416
  score: 0,
@@ -12415,7 +12453,7 @@ var ExecutionMetricsEvaluator = class {
12415
12453
  this.config = options.config;
12416
12454
  }
12417
12455
  evaluate(context2) {
12418
- const { trace: trace2 } = context2;
12456
+ const { trace: trace2, tokenUsage, costUsd, durationMs } = context2;
12419
12457
  const {
12420
12458
  max_tool_calls,
12421
12459
  max_llm_calls,
@@ -12425,7 +12463,8 @@ var ExecutionMetricsEvaluator = class {
12425
12463
  target_exploration_ratio,
12426
12464
  exploration_tolerance = 0.2
12427
12465
  } = this.config;
12428
- if (!trace2) {
12466
+ const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
12467
+ if (needsTrace && !trace2) {
12429
12468
  return {
12430
12469
  score: 0,
12431
12470
  verdict: "fail",
@@ -12440,11 +12479,12 @@ var ExecutionMetricsEvaluator = class {
12440
12479
  }
12441
12480
  };
12442
12481
  }
12482
+ const narrowedTrace = trace2;
12443
12483
  const hits = [];
12444
12484
  const misses = [];
12445
12485
  const actualMetrics = {};
12446
- if (max_tool_calls !== void 0) {
12447
- const toolCalls = trace2.eventCount;
12486
+ if (max_tool_calls !== void 0 && narrowedTrace) {
12487
+ const toolCalls = narrowedTrace.eventCount;
12448
12488
  actualMetrics.tool_calls = toolCalls;
12449
12489
  if (toolCalls <= max_tool_calls) {
12450
12490
  hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
@@ -12452,8 +12492,8 @@ var ExecutionMetricsEvaluator = class {
12452
12492
  misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
12453
12493
  }
12454
12494
  }
12455
- if (max_llm_calls !== void 0) {
12456
- const llmCalls = trace2.llmCallCount;
12495
+ if (max_llm_calls !== void 0 && narrowedTrace) {
12496
+ const llmCalls = narrowedTrace.llmCallCount;
12457
12497
  if (llmCalls === void 0) {
12458
12498
  misses.push("LLM call count data not available");
12459
12499
  } else {
@@ -12466,7 +12506,6 @@ var ExecutionMetricsEvaluator = class {
12466
12506
  }
12467
12507
  }
12468
12508
  if (max_tokens !== void 0) {
12469
- const tokenUsage = trace2.tokenUsage;
12470
12509
  if (!tokenUsage) {
12471
12510
  misses.push("Token usage data not available");
12472
12511
  } else {
@@ -12480,7 +12519,6 @@ var ExecutionMetricsEvaluator = class {
12480
12519
  }
12481
12520
  }
12482
12521
  if (max_cost_usd !== void 0) {
12483
- const costUsd = trace2.costUsd;
12484
12522
  if (costUsd === void 0) {
12485
12523
  misses.push("Cost data not available");
12486
12524
  } else {
@@ -12494,7 +12532,6 @@ var ExecutionMetricsEvaluator = class {
12494
12532
  }
12495
12533
  }
12496
12534
  if (max_duration_ms !== void 0) {
12497
- const durationMs = trace2.durationMs;
12498
12535
  if (durationMs === void 0) {
12499
12536
  misses.push("Duration data not available");
12500
12537
  } else {
@@ -12506,8 +12543,8 @@ var ExecutionMetricsEvaluator = class {
12506
12543
  }
12507
12544
  }
12508
12545
  }
12509
- if (target_exploration_ratio !== void 0) {
12510
- const ratio = explorationRatio(trace2);
12546
+ if (target_exploration_ratio !== void 0 && narrowedTrace) {
12547
+ const ratio = explorationRatio(narrowedTrace);
12511
12548
  if (ratio === void 0) {
12512
12549
  misses.push("Exploration ratio not available (no tool calls)");
12513
12550
  } else {
@@ -13021,7 +13058,7 @@ var LatencyEvaluator = class {
13021
13058
  }
13022
13059
  evaluate(context2) {
13023
13060
  const { threshold } = this.config;
13024
- const durationMs = context2.trace?.durationMs;
13061
+ const durationMs = context2.durationMs;
13025
13062
  if (durationMs === void 0) {
13026
13063
  return {
13027
13064
  score: 0,
@@ -13666,7 +13703,7 @@ var TokenUsageEvaluator = class {
13666
13703
  this.config = options.config;
13667
13704
  }
13668
13705
  evaluate(context2) {
13669
- const usage = context2.trace?.tokenUsage;
13706
+ const usage = context2.tokenUsage;
13670
13707
  const maxTotal = this.config.max_total;
13671
13708
  const maxInput = this.config.max_input;
13672
13709
  const maxOutput = this.config.max_output;
@@ -15111,7 +15148,8 @@ async function runEvaluation(options) {
15111
15148
  keepWorkspaces,
15112
15149
  cleanupWorkspaces,
15113
15150
  trials,
15114
- streamCallbacks
15151
+ streamCallbacks,
15152
+ totalBudgetUsd
15115
15153
  } = options;
15116
15154
  let useCache = options.useCache;
15117
15155
  if (trials && trials.count > 1 && useCache) {
@@ -15284,10 +15322,39 @@ async function runEvaluation(options) {
15284
15322
  let nextWorkerId = 1;
15285
15323
  const workerIdByEvalId = /* @__PURE__ */ new Map();
15286
15324
  let beforeAllOutputAttached = false;
15325
+ let cumulativeBudgetCost = 0;
15326
+ let budgetExhausted = false;
15287
15327
  const promises = filteredEvalCases.map(
15288
15328
  (evalCase) => limit(async () => {
15289
15329
  const workerId = nextWorkerId++;
15290
15330
  workerIdByEvalId.set(evalCase.id, workerId);
15331
+ if (totalBudgetUsd !== void 0 && budgetExhausted) {
15332
+ const budgetResult = {
15333
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15334
+ testId: evalCase.id,
15335
+ dataset: evalCase.dataset,
15336
+ score: 0,
15337
+ hits: [],
15338
+ misses: [],
15339
+ answer: "",
15340
+ target: target.name,
15341
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15342
+ budgetExceeded: true
15343
+ };
15344
+ if (onProgress) {
15345
+ await onProgress({
15346
+ workerId,
15347
+ testId: evalCase.id,
15348
+ status: "failed",
15349
+ completedAt: Date.now(),
15350
+ error: budgetResult.error
15351
+ });
15352
+ }
15353
+ if (onResult) {
15354
+ await onResult(budgetResult);
15355
+ }
15356
+ return budgetResult;
15357
+ }
15291
15358
  if (onProgress) {
15292
15359
  await onProgress({
15293
15360
  workerId,
@@ -15321,6 +15388,23 @@ async function runEvaluation(options) {
15321
15388
  typeRegistry
15322
15389
  };
15323
15390
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
15391
+ if (totalBudgetUsd !== void 0) {
15392
+ let caseCost;
15393
+ if (result.trials && result.trials.length > 0) {
15394
+ const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
15395
+ if (trialCostSum > 0) {
15396
+ caseCost = trialCostSum;
15397
+ }
15398
+ } else {
15399
+ caseCost = result.costUsd;
15400
+ }
15401
+ if (caseCost !== void 0) {
15402
+ cumulativeBudgetCost += caseCost;
15403
+ if (cumulativeBudgetCost >= totalBudgetUsd) {
15404
+ budgetExhausted = true;
15405
+ }
15406
+ }
15407
+ }
15324
15408
  if (beforeAllOutput && !beforeAllOutputAttached) {
15325
15409
  result = { ...result, beforeAllOutput };
15326
15410
  beforeAllOutputAttached = true;
@@ -15473,17 +15557,18 @@ async function runBatchEvaluation(options) {
15473
15557
  const providerResponse = batchResponse[i];
15474
15558
  const output = providerResponse.output;
15475
15559
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
15476
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
15477
- eventCount: 0,
15478
- toolNames: [],
15479
- toolCallsByName: {},
15480
- errorCount: 0
15481
- } : void 0;
15482
- const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
15560
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
15561
+ const merged = computed ? mergeExecutionMetrics(computed, {
15483
15562
  tokenUsage: providerResponse.tokenUsage,
15484
15563
  costUsd: providerResponse.costUsd,
15485
15564
  durationMs: providerResponse.durationMs
15486
15565
  }) : void 0;
15566
+ const trace2 = merged?.trace;
15567
+ const costUsd = merged?.costUsd;
15568
+ const durationMs = merged?.durationMs;
15569
+ const tokenUsage = merged?.tokenUsage;
15570
+ const startTime = merged?.startTime;
15571
+ const endTime = merged?.endTime;
15487
15572
  const candidate = extractLastAssistantContent2(output);
15488
15573
  const providerError = extractProviderError(providerResponse);
15489
15574
  let result;
@@ -15502,6 +15587,11 @@ async function runBatchEvaluation(options) {
15502
15587
  agentTimeoutMs,
15503
15588
  output,
15504
15589
  trace: trace2,
15590
+ costUsd,
15591
+ durationMs,
15592
+ tokenUsage,
15593
+ startTime,
15594
+ endTime,
15505
15595
  targetResolver,
15506
15596
  availableTargets
15507
15597
  });
@@ -15738,17 +15828,18 @@ async function runEvalCase(options) {
15738
15828
  }
15739
15829
  const output = providerResponse.output;
15740
15830
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
15741
- const baseSummary = output ? computeTraceSummary(output) : hasExecutionMetrics ? {
15742
- eventCount: 0,
15743
- toolNames: [],
15744
- toolCallsByName: {},
15745
- errorCount: 0
15746
- } : void 0;
15747
- const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
15831
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
15832
+ const merged = computed ? mergeExecutionMetrics(computed, {
15748
15833
  tokenUsage: providerResponse.tokenUsage,
15749
15834
  costUsd: providerResponse.costUsd,
15750
15835
  durationMs: providerResponse.durationMs
15751
15836
  }) : void 0;
15837
+ const trace2 = merged?.trace;
15838
+ const costUsd = merged?.costUsd;
15839
+ const durationMs = merged?.durationMs;
15840
+ const tokenUsage = merged?.tokenUsage;
15841
+ const startTime = merged?.startTime;
15842
+ const endTime = merged?.endTime;
15752
15843
  const candidate = extractLastAssistantContent2(output);
15753
15844
  let fileChanges;
15754
15845
  if (baselineCommit && workspacePath) {
@@ -15793,6 +15884,11 @@ async function runEvalCase(options) {
15793
15884
  agentTimeoutMs,
15794
15885
  output,
15795
15886
  trace: trace2,
15887
+ costUsd,
15888
+ durationMs,
15889
+ tokenUsage,
15890
+ startTime,
15891
+ endTime,
15796
15892
  targetResolver,
15797
15893
  availableTargets,
15798
15894
  fileChanges,
@@ -15849,7 +15945,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
15849
15945
  };
15850
15946
  const result = await runEvalCase(trialOptions);
15851
15947
  allResults.push(result);
15852
- const trialCost = result.trace?.costUsd;
15948
+ const trialCost = result.costUsd;
15853
15949
  const trialVerdict = scoreToVerdict(result.score);
15854
15950
  const trial = {
15855
15951
  attempt,
@@ -15905,6 +16001,11 @@ async function evaluateCandidate(options) {
15905
16001
  agentTimeoutMs,
15906
16002
  output,
15907
16003
  trace: trace2,
16004
+ costUsd,
16005
+ durationMs,
16006
+ tokenUsage,
16007
+ startTime,
16008
+ endTime,
15908
16009
  targetResolver,
15909
16010
  availableTargets,
15910
16011
  fileChanges,
@@ -15925,6 +16026,11 @@ async function evaluateCandidate(options) {
15925
16026
  agentTimeoutMs,
15926
16027
  output,
15927
16028
  trace: trace2,
16029
+ costUsd,
16030
+ durationMs,
16031
+ tokenUsage,
16032
+ startTime,
16033
+ endTime,
15928
16034
  targetResolver,
15929
16035
  availableTargets,
15930
16036
  fileChanges,
@@ -15968,6 +16074,11 @@ async function evaluateCandidate(options) {
15968
16074
  answer: candidate,
15969
16075
  target: target.name,
15970
16076
  reasoning: score.reasoning,
16077
+ tokenUsage,
16078
+ costUsd,
16079
+ durationMs,
16080
+ startTime,
16081
+ endTime,
15971
16082
  requests,
15972
16083
  input,
15973
16084
  scores,
@@ -15991,6 +16102,11 @@ async function runEvaluatorsForCase(options) {
15991
16102
  agentTimeoutMs,
15992
16103
  output,
15993
16104
  trace: trace2,
16105
+ costUsd,
16106
+ durationMs,
16107
+ tokenUsage,
16108
+ startTime,
16109
+ endTime,
15994
16110
  targetResolver,
15995
16111
  availableTargets,
15996
16112
  fileChanges,
@@ -16012,6 +16128,11 @@ async function runEvaluatorsForCase(options) {
16012
16128
  agentTimeoutMs,
16013
16129
  output,
16014
16130
  trace: trace2,
16131
+ costUsd,
16132
+ durationMs,
16133
+ tokenUsage,
16134
+ startTime,
16135
+ endTime,
16015
16136
  targetResolver,
16016
16137
  availableTargets,
16017
16138
  fileChanges,
@@ -16034,6 +16155,11 @@ async function runEvaluatorsForCase(options) {
16034
16155
  judgeProvider,
16035
16156
  output,
16036
16157
  trace: trace2,
16158
+ tokenUsage,
16159
+ costUsd,
16160
+ durationMs,
16161
+ startTime,
16162
+ endTime,
16037
16163
  targetResolver,
16038
16164
  availableTargets,
16039
16165
  fileChanges,
@@ -16057,6 +16183,11 @@ async function runEvaluatorList(options) {
16057
16183
  agentTimeoutMs,
16058
16184
  output,
16059
16185
  trace: trace2,
16186
+ costUsd,
16187
+ durationMs,
16188
+ tokenUsage,
16189
+ startTime,
16190
+ endTime,
16060
16191
  targetResolver,
16061
16192
  availableTargets,
16062
16193
  fileChanges,
@@ -16075,6 +16206,11 @@ async function runEvaluatorList(options) {
16075
16206
  judgeProvider,
16076
16207
  output,
16077
16208
  trace: trace2,
16209
+ tokenUsage,
16210
+ costUsd,
16211
+ durationMs,
16212
+ startTime,
16213
+ endTime,
16078
16214
  targetResolver,
16079
16215
  availableTargets,
16080
16216
  fileChanges,
@@ -16114,7 +16250,8 @@ async function runEvaluatorList(options) {
16114
16250
  reasoning: score2.reasoning,
16115
16251
  evaluatorProviderRequest: score2.evaluatorRawRequest,
16116
16252
  details: score2.details,
16117
- scores: mapChildResults(score2.scores)
16253
+ scores: mapChildResults(score2.scores),
16254
+ tokenUsage: score2.tokenUsage
16118
16255
  });
16119
16256
  } catch (error) {
16120
16257
  const message = error instanceof Error ? error.message : String(error);
@@ -16362,7 +16499,8 @@ function mapChildResults(children) {
16362
16499
  reasoning: child.reasoning,
16363
16500
  evaluatorProviderRequest: child.evaluatorRawRequest,
16364
16501
  scores: mapChildResults(child.scores),
16365
- details: child.details
16502
+ details: child.details,
16503
+ tokenUsage: child.tokenUsage
16366
16504
  }));
16367
16505
  }
16368
16506
  function computeWeightedMean(entries) {
@@ -16742,7 +16880,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
16742
16880
  "beforeEachOutput",
16743
16881
  "afterAllOutput",
16744
16882
  "afterEachOutput",
16745
- "fileChanges"
16883
+ "fileChanges",
16884
+ // Promoted execution metrics (debug, not needed for regression comparison)
16885
+ "tokenUsage",
16886
+ "costUsd",
16887
+ "durationMs",
16888
+ "startTime",
16889
+ "endTime"
16746
16890
  ]);
16747
16891
  var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
16748
16892
  function trimEvaluatorResult(result) {
@@ -16865,8 +17009,8 @@ var OtelTraceExporter = class {
16865
17009
  const api = this.api;
16866
17010
  const tracer = this.tracer;
16867
17011
  const captureContent = this.options.captureContent ?? false;
16868
- const startHr = toHrTime(result.trace?.startTime ?? result.timestamp);
16869
- const endHr = toHrTime(result.trace?.endTime ?? result.timestamp);
17012
+ const startHr = toHrTime(result.startTime ?? result.timestamp);
17013
+ const endHr = toHrTime(result.endTime ?? result.timestamp);
16870
17014
  let parentCtx = api.ROOT_CONTEXT;
16871
17015
  const traceparent = process.env.TRACEPARENT;
16872
17016
  if (traceparent && this.W3CPropagator) {
@@ -16895,12 +17039,13 @@ var OtelTraceExporter = class {
16895
17039
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
16896
17040
  rootSpan.setAttribute("agentv.score", result.score);
16897
17041
  if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
17042
+ if (result.durationMs != null)
17043
+ rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
17044
+ if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
16898
17045
  if (result.trace) {
16899
17046
  const t = result.trace;
16900
17047
  rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
16901
17048
  rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
16902
- if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
16903
- if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
16904
17049
  if (t.llmCallCount != null)
16905
17050
  rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
16906
17051
  }