@gleanwork/mcp-server-tester 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2684,6 +2684,56 @@ interface CLIConfig {
2684
2684
  */
2685
2685
  timeout?: number;
2686
2686
  }
2687
+ /**
2688
+ * A cookie to inject into the browser context before running the script.
2689
+ * Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
2690
+ */
2691
+ interface BrowserCookie {
2692
+ name: string;
2693
+ value: string;
2694
+ url?: string;
2695
+ domain?: string;
2696
+ path?: string;
2697
+ expires?: number;
2698
+ httpOnly?: boolean;
2699
+ secure?: boolean;
2700
+ sameSite?: 'Strict' | 'Lax' | 'None';
2701
+ partitionKey?: string;
2702
+ }
2703
+ /**
2704
+ * Configuration for a browser-based host.
2705
+ *
2706
+ * Uses Playwright to launch a Chromium instance, inject auth state,
2707
+ * and execute a user-provided script that drives a web-based MCP host
2708
+ * (e.g., claude.ai).
2709
+ */
2710
+ interface BrowserConfig {
2711
+ /**
2712
+ * Path to the browser script (resolved relative to cwd).
2713
+ * The script must default-export an async function
2714
+ * `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
2715
+ */
2716
+ script: string;
2717
+ /**
2718
+ * Timeout in milliseconds for the browser script.
2719
+ * @default 120000 (2 minutes)
2720
+ */
2721
+ timeout?: number;
2722
+ /**
2723
+ * Whether to launch in headless mode.
2724
+ * @default true
2725
+ */
2726
+ headless?: boolean;
2727
+ /**
2728
+ * Path to a Playwright storage state JSON file (cookies + localStorage).
2729
+ * Resolved relative to cwd.
2730
+ */
2731
+ storageState?: string;
2732
+ /**
2733
+ * Extra cookies to inject into the browser context.
2734
+ */
2735
+ cookies?: BrowserCookie[];
2736
+ }
2687
2737
  /**
2688
2738
  * Configuration for MCP host simulation
2689
2739
  */
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
2729
2779
  * CLI host configuration (required for 'cli' host type).
2730
2780
  */
2731
2781
  cli?: CLIConfig;
2782
+ /**
2783
+ * Browser host configuration (required for 'browser' host type).
2784
+ */
2785
+ browser?: BrowserConfig;
2732
2786
  }
2733
2787
  /**
2734
2788
  * A tool call made by the LLM
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
2770
2824
  * (excludes LLM response time)
2771
2825
  */
2772
2826
  mcpDurationMs?: number;
2827
+ /**
2828
+ * Token usage from the LLM during simulation.
2829
+ * Populated by SDK-based hosts from the AI SDK response.
2830
+ */
2831
+ usage?: UsageMetrics;
2773
2832
  }
2774
2833
  /**
2775
2834
  * Interface for MCP host simulators.
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
3071
3130
  desktop: "desktop";
3072
3131
  }>>;
3073
3132
  provider: z.ZodOptional<z.ZodEnum<{
3074
- openai: "openai";
3075
3133
  anthropic: "anthropic";
3076
- azure: "azure";
3134
+ "vertex-anthropic": "vertex-anthropic";
3135
+ openai: "openai";
3077
3136
  google: "google";
3137
+ azure: "azure";
3078
3138
  mistral: "mistral";
3079
3139
  deepseek: "deepseek";
3080
3140
  openrouter: "openrouter";
3081
3141
  xai: "xai";
3082
- "vertex-anthropic": "vertex-anthropic";
3083
3142
  }>>;
3084
3143
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3085
3144
  model: z.ZodOptional<z.ZodString>;
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3136
3195
  threshold: z.ZodOptional<z.ZodNumber>;
3137
3196
  reps: z.ZodOptional<z.ZodNumber>;
3138
3197
  provider: z.ZodOptional<z.ZodEnum<{
3139
- openai: "openai";
3140
3198
  anthropic: "anthropic";
3141
- google: "google";
3142
3199
  "vertex-anthropic": "vertex-anthropic";
3143
3200
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3201
+ openai: "openai";
3202
+ google: "google";
3144
3203
  }>>;
3145
3204
  model: z.ZodOptional<z.ZodString>;
3146
3205
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3163
3222
  threshold: z.ZodOptional<z.ZodNumber>;
3164
3223
  reps: z.ZodOptional<z.ZodNumber>;
3165
3224
  provider: z.ZodOptional<z.ZodEnum<{
3166
- openai: "openai";
3167
3225
  anthropic: "anthropic";
3168
- google: "google";
3169
3226
  "vertex-anthropic": "vertex-anthropic";
3170
3227
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3228
+ openai: "openai";
3229
+ google: "google";
3171
3230
  }>>;
3172
3231
  model: z.ZodOptional<z.ZodString>;
3173
3232
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
3223
3282
  desktop: "desktop";
3224
3283
  }>>;
3225
3284
  provider: z.ZodOptional<z.ZodEnum<{
3226
- openai: "openai";
3227
3285
  anthropic: "anthropic";
3228
- azure: "azure";
3286
+ "vertex-anthropic": "vertex-anthropic";
3287
+ openai: "openai";
3229
3288
  google: "google";
3289
+ azure: "azure";
3230
3290
  mistral: "mistral";
3231
3291
  deepseek: "deepseek";
3232
3292
  openrouter: "openrouter";
3233
3293
  xai: "xai";
3234
- "vertex-anthropic": "vertex-anthropic";
3235
3294
  }>>;
3236
3295
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3237
3296
  model: z.ZodOptional<z.ZodString>;
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3288
3347
  threshold: z.ZodOptional<z.ZodNumber>;
3289
3348
  reps: z.ZodOptional<z.ZodNumber>;
3290
3349
  provider: z.ZodOptional<z.ZodEnum<{
3291
- openai: "openai";
3292
3350
  anthropic: "anthropic";
3293
- google: "google";
3294
3351
  "vertex-anthropic": "vertex-anthropic";
3295
3352
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3353
+ openai: "openai";
3354
+ google: "google";
3296
3355
  }>>;
3297
3356
  model: z.ZodOptional<z.ZodString>;
3298
3357
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3315
3374
  threshold: z.ZodOptional<z.ZodNumber>;
3316
3375
  reps: z.ZodOptional<z.ZodNumber>;
3317
3376
  provider: z.ZodOptional<z.ZodEnum<{
3318
- openai: "openai";
3319
3377
  anthropic: "anthropic";
3320
- google: "google";
3321
3378
  "vertex-anthropic": "vertex-anthropic";
3322
3379
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3380
+ openai: "openai";
3381
+ google: "google";
3323
3382
  }>>;
3324
3383
  model: z.ZodOptional<z.ZodString>;
3325
3384
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3562,6 +3621,8 @@ interface IterationResult {
3562
3621
  name: string;
3563
3622
  }>;
3564
3623
  };
3624
+ /** Token usage from mcp_host LLM simulation in this iteration */
3625
+ hostUsage?: UsageMetrics;
3565
3626
  }
3566
3627
  /**
3567
3628
  * Request data captured from the eval case input.
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
3710
3771
  name: string;
3711
3772
  }>;
3712
3773
  };
3774
+ /**
3775
+ * Aggregate token usage from mcp_host LLM simulation for this case.
3776
+ * Summed across all iterations. Only populated for mcp_host mode cases.
3777
+ */
3778
+ hostUsage?: UsageMetrics;
3713
3779
  }
3714
3780
  /**
3715
3781
  * Aggregated MCP eval run data
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
3759
3825
  * Expectation type breakdown
3760
3826
  */
3761
3827
  expectationBreakdown: ExpectationBreakdown;
3828
+ /**
3829
+ * Aggregate token usage from all mcp_host LLM simulations in this run.
3830
+ */
3831
+ totalHostUsage?: UsageMetrics;
3762
3832
  };
3763
3833
  /**
3764
3834
  * All eval results from this run
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
3873
3943
  * Experiment tracking metadata captured at run time.
3874
3944
  */
3875
3945
  metadata?: EvalRunMetadata;
3946
+ /**
3947
+ * Aggregate token usage from all mcp_host LLM simulations across all cases.
3948
+ */
3949
+ totalHostUsage?: UsageMetrics;
3876
3950
  }
3877
3951
  /**
3878
3952
  * Options for running eval dataset
package/dist/index.d.ts CHANGED
@@ -2684,6 +2684,56 @@ interface CLIConfig {
2684
2684
  */
2685
2685
  timeout?: number;
2686
2686
  }
2687
+ /**
2688
+ * A cookie to inject into the browser context before running the script.
2689
+ * Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
2690
+ */
2691
+ interface BrowserCookie {
2692
+ name: string;
2693
+ value: string;
2694
+ url?: string;
2695
+ domain?: string;
2696
+ path?: string;
2697
+ expires?: number;
2698
+ httpOnly?: boolean;
2699
+ secure?: boolean;
2700
+ sameSite?: 'Strict' | 'Lax' | 'None';
2701
+ partitionKey?: string;
2702
+ }
2703
+ /**
2704
+ * Configuration for a browser-based host.
2705
+ *
2706
+ * Uses Playwright to launch a Chromium instance, inject auth state,
2707
+ * and execute a user-provided script that drives a web-based MCP host
2708
+ * (e.g., claude.ai).
2709
+ */
2710
+ interface BrowserConfig {
2711
+ /**
2712
+ * Path to the browser script (resolved relative to cwd).
2713
+ * The script must default-export an async function
2714
+ * `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
2715
+ */
2716
+ script: string;
2717
+ /**
2718
+ * Timeout in milliseconds for the browser script.
2719
+ * @default 120000 (2 minutes)
2720
+ */
2721
+ timeout?: number;
2722
+ /**
2723
+ * Whether to launch in headless mode.
2724
+ * @default true
2725
+ */
2726
+ headless?: boolean;
2727
+ /**
2728
+ * Path to a Playwright storage state JSON file (cookies + localStorage).
2729
+ * Resolved relative to cwd.
2730
+ */
2731
+ storageState?: string;
2732
+ /**
2733
+ * Extra cookies to inject into the browser context.
2734
+ */
2735
+ cookies?: BrowserCookie[];
2736
+ }
2687
2737
  /**
2688
2738
  * Configuration for MCP host simulation
2689
2739
  */
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
2729
2779
  * CLI host configuration (required for 'cli' host type).
2730
2780
  */
2731
2781
  cli?: CLIConfig;
2782
+ /**
2783
+ * Browser host configuration (required for 'browser' host type).
2784
+ */
2785
+ browser?: BrowserConfig;
2732
2786
  }
2733
2787
  /**
2734
2788
  * A tool call made by the LLM
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
2770
2824
  * (excludes LLM response time)
2771
2825
  */
2772
2826
  mcpDurationMs?: number;
2827
+ /**
2828
+ * Token usage from the LLM during simulation.
2829
+ * Populated by SDK-based hosts from the AI SDK response.
2830
+ */
2831
+ usage?: UsageMetrics;
2773
2832
  }
2774
2833
  /**
2775
2834
  * Interface for MCP host simulators.
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
3071
3130
  desktop: "desktop";
3072
3131
  }>>;
3073
3132
  provider: z.ZodOptional<z.ZodEnum<{
3074
- openai: "openai";
3075
3133
  anthropic: "anthropic";
3076
- azure: "azure";
3134
+ "vertex-anthropic": "vertex-anthropic";
3135
+ openai: "openai";
3077
3136
  google: "google";
3137
+ azure: "azure";
3078
3138
  mistral: "mistral";
3079
3139
  deepseek: "deepseek";
3080
3140
  openrouter: "openrouter";
3081
3141
  xai: "xai";
3082
- "vertex-anthropic": "vertex-anthropic";
3083
3142
  }>>;
3084
3143
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3085
3144
  model: z.ZodOptional<z.ZodString>;
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3136
3195
  threshold: z.ZodOptional<z.ZodNumber>;
3137
3196
  reps: z.ZodOptional<z.ZodNumber>;
3138
3197
  provider: z.ZodOptional<z.ZodEnum<{
3139
- openai: "openai";
3140
3198
  anthropic: "anthropic";
3141
- google: "google";
3142
3199
  "vertex-anthropic": "vertex-anthropic";
3143
3200
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3201
+ openai: "openai";
3202
+ google: "google";
3144
3203
  }>>;
3145
3204
  model: z.ZodOptional<z.ZodString>;
3146
3205
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3163
3222
  threshold: z.ZodOptional<z.ZodNumber>;
3164
3223
  reps: z.ZodOptional<z.ZodNumber>;
3165
3224
  provider: z.ZodOptional<z.ZodEnum<{
3166
- openai: "openai";
3167
3225
  anthropic: "anthropic";
3168
- google: "google";
3169
3226
  "vertex-anthropic": "vertex-anthropic";
3170
3227
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3228
+ openai: "openai";
3229
+ google: "google";
3171
3230
  }>>;
3172
3231
  model: z.ZodOptional<z.ZodString>;
3173
3232
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
3223
3282
  desktop: "desktop";
3224
3283
  }>>;
3225
3284
  provider: z.ZodOptional<z.ZodEnum<{
3226
- openai: "openai";
3227
3285
  anthropic: "anthropic";
3228
- azure: "azure";
3286
+ "vertex-anthropic": "vertex-anthropic";
3287
+ openai: "openai";
3229
3288
  google: "google";
3289
+ azure: "azure";
3230
3290
  mistral: "mistral";
3231
3291
  deepseek: "deepseek";
3232
3292
  openrouter: "openrouter";
3233
3293
  xai: "xai";
3234
- "vertex-anthropic": "vertex-anthropic";
3235
3294
  }>>;
3236
3295
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3237
3296
  model: z.ZodOptional<z.ZodString>;
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3288
3347
  threshold: z.ZodOptional<z.ZodNumber>;
3289
3348
  reps: z.ZodOptional<z.ZodNumber>;
3290
3349
  provider: z.ZodOptional<z.ZodEnum<{
3291
- openai: "openai";
3292
3350
  anthropic: "anthropic";
3293
- google: "google";
3294
3351
  "vertex-anthropic": "vertex-anthropic";
3295
3352
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3353
+ openai: "openai";
3354
+ google: "google";
3296
3355
  }>>;
3297
3356
  model: z.ZodOptional<z.ZodString>;
3298
3357
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3315
3374
  threshold: z.ZodOptional<z.ZodNumber>;
3316
3375
  reps: z.ZodOptional<z.ZodNumber>;
3317
3376
  provider: z.ZodOptional<z.ZodEnum<{
3318
- openai: "openai";
3319
3377
  anthropic: "anthropic";
3320
- google: "google";
3321
3378
  "vertex-anthropic": "vertex-anthropic";
3322
3379
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3380
+ openai: "openai";
3381
+ google: "google";
3323
3382
  }>>;
3324
3383
  model: z.ZodOptional<z.ZodString>;
3325
3384
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3562,6 +3621,8 @@ interface IterationResult {
3562
3621
  name: string;
3563
3622
  }>;
3564
3623
  };
3624
+ /** Token usage from mcp_host LLM simulation in this iteration */
3625
+ hostUsage?: UsageMetrics;
3565
3626
  }
3566
3627
  /**
3567
3628
  * Request data captured from the eval case input.
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
3710
3771
  name: string;
3711
3772
  }>;
3712
3773
  };
3774
+ /**
3775
+ * Aggregate token usage from mcp_host LLM simulation for this case.
3776
+ * Summed across all iterations. Only populated for mcp_host mode cases.
3777
+ */
3778
+ hostUsage?: UsageMetrics;
3713
3779
  }
3714
3780
  /**
3715
3781
  * Aggregated MCP eval run data
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
3759
3825
  * Expectation type breakdown
3760
3826
  */
3761
3827
  expectationBreakdown: ExpectationBreakdown;
3828
+ /**
3829
+ * Aggregate token usage from all mcp_host LLM simulations in this run.
3830
+ */
3831
+ totalHostUsage?: UsageMetrics;
3762
3832
  };
3763
3833
  /**
3764
3834
  * All eval results from this run
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
3873
3943
  * Experiment tracking metadata captured at run time.
3874
3944
  */
3875
3945
  metadata?: EvalRunMetadata;
3946
+ /**
3947
+ * Aggregate token usage from all mcp_host LLM simulations across all cases.
3948
+ */
3949
+ totalHostUsage?: UsageMetrics;
3876
3950
  }
3877
3951
  /**
3878
3952
  * Options for running eval dataset
package/dist/index.js CHANGED
@@ -4384,7 +4384,7 @@ function escapeHtml(text) {
4384
4384
 
4385
4385
  // package.json
4386
4386
  var package_default = {
4387
- version: "1.0.0"};
4387
+ version: "1.0.1"};
4388
4388
 
4389
4389
  // src/mcp/clientFactory.ts
4390
4390
  function getRetryAfterDelayMs(err) {
@@ -6931,6 +6931,12 @@ function createVercelOrchestrator() {
6931
6931
  });
6932
6932
  const totalDurationMs = Date.now() - llmStart;
6933
6933
  const llmDurationMs = totalDurationMs - mcpDurationMs;
6934
+ const hostUsage = result.usage ? {
6935
+ inputTokens: result.usage.promptTokens ?? 0,
6936
+ outputTokens: result.usage.completionTokens ?? 0,
6937
+ totalCostUsd: 0,
6938
+ durationMs: llmDurationMs
6939
+ } : void 0;
6934
6940
  const conversationHistory = (result.steps ?? []).map((step) => ({
6935
6941
  role: step.toolCalls?.length > 0 ? "tool" : "assistant",
6936
6942
  content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6942,7 +6948,8 @@ function createVercelOrchestrator() {
6942
6948
  scenario,
6943
6949
  llmDurationMs,
6944
6950
  mcpDurationMs,
6945
- conversationHistory
6951
+ conversationHistory,
6952
+ usage: hostUsage
6946
6953
  };
6947
6954
  } catch (err) {
6948
6955
  return {
@@ -7278,6 +7285,32 @@ async function execFileNoThrow(file, args) {
7278
7285
  }
7279
7286
  }
7280
7287
 
7288
+ // src/utils/usageUtils.ts
7289
+ function optionalSum(a, b) {
7290
+ if (a === void 0 && b === void 0) return void 0;
7291
+ return (a ?? 0) + (b ?? 0);
7292
+ }
7293
+ function sumUsage(a, b) {
7294
+ if (!a && !b) return void 0;
7295
+ if (!a) return b ? { ...b } : void 0;
7296
+ if (!b) return { ...a };
7297
+ return {
7298
+ inputTokens: a.inputTokens + b.inputTokens,
7299
+ outputTokens: a.outputTokens + b.outputTokens,
7300
+ totalCostUsd: a.totalCostUsd + b.totalCostUsd,
7301
+ durationMs: a.durationMs + b.durationMs,
7302
+ durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
7303
+ cacheReadInputTokens: optionalSum(
7304
+ a.cacheReadInputTokens,
7305
+ b.cacheReadInputTokens
7306
+ ),
7307
+ cacheCreationInputTokens: optionalSum(
7308
+ a.cacheCreationInputTokens,
7309
+ b.cacheCreationInputTokens
7310
+ )
7311
+ };
7312
+ }
7313
+
7281
7314
  // src/evals/evalRunner.ts
7282
7315
  async function executeToolCall(evalCase, mcp) {
7283
7316
  const mode = evalCase.mode || "direct";
@@ -7523,6 +7556,7 @@ async function runSingleIteration(evalCase, context, options) {
7523
7556
  };
7524
7557
  }
7525
7558
  }
7559
+ const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
7526
7560
  return {
7527
7561
  id: evalCase.id,
7528
7562
  datasetName: options.datasetName ?? "single-case",
@@ -7539,7 +7573,8 @@ async function runSingleIteration(evalCase, context, options) {
7539
7573
  tags: evalCase.tags,
7540
7574
  toolPrecision,
7541
7575
  toolRecall,
7542
- mcpHostTrace
7576
+ mcpHostTrace,
7577
+ hostUsage
7543
7578
  };
7544
7579
  }
7545
7580
  function isInfrastructureError(err) {
@@ -7575,7 +7610,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7575
7610
  durationMs: result.durationMs,
7576
7611
  error: result.error,
7577
7612
  isInfrastructureError: infraError,
7578
- mcpHostTrace: result.mcpHostTrace
7613
+ mcpHostTrace: result.mcpHostTrace,
7614
+ hostUsage: result.hostUsage
7579
7615
  });
7580
7616
  } catch (err) {
7581
7617
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7608,6 +7644,10 @@ async function runEvalCase(evalCase, context, options = {}) {
7608
7644
  durationMs: 0,
7609
7645
  tags: evalCase.tags
7610
7646
  };
7647
+ const totalHostUsage = iterationResults.reduce(
7648
+ (acc, r) => sumUsage(acc, r.hostUsage),
7649
+ void 0
7650
+ );
7611
7651
  return {
7612
7652
  ...baseResult,
7613
7653
  pass: assertionPassRate >= threshold,
@@ -7616,7 +7656,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7616
7656
  infrastructureErrorRate,
7617
7657
  iterationResults,
7618
7658
  infrastructureErrorCount: infraErrors.length,
7619
- durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7659
+ durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
7660
+ hostUsage: totalHostUsage
7620
7661
  };
7621
7662
  }
7622
7663
  function wilsonCI(k, n) {
@@ -7726,13 +7767,18 @@ async function runEvalDataset(options, context) {
7726
7767
  ...mcpHostModel !== void 0 && { mcpHostModel },
7727
7768
  ...judgeModel !== void 0 && { judgeModel }
7728
7769
  };
7770
+ const runHostUsage = caseResults.reduce(
7771
+ (acc, r) => sumUsage(acc, r.hostUsage),
7772
+ void 0
7773
+ );
7729
7774
  const result = {
7730
7775
  total,
7731
7776
  passed,
7732
7777
  failed: total - passed,
7733
7778
  caseResults,
7734
7779
  durationMs: Date.now() - startTime,
7735
- metadata
7780
+ metadata,
7781
+ totalHostUsage: runHostUsage
7736
7782
  };
7737
7783
  if (baselineResultsFrom) {
7738
7784
  try {