@gleanwork/mcp-server-tester 1.0.0 → 1.0.1-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2684,6 +2684,56 @@ interface CLIConfig {
2684
2684
  */
2685
2685
  timeout?: number;
2686
2686
  }
2687
+ /**
2688
+ * A cookie to inject into the browser context before running the script.
2689
+ * Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
2690
+ */
2691
+ interface BrowserCookie {
2692
+ name: string;
2693
+ value: string;
2694
+ url?: string;
2695
+ domain?: string;
2696
+ path?: string;
2697
+ expires?: number;
2698
+ httpOnly?: boolean;
2699
+ secure?: boolean;
2700
+ sameSite?: 'Strict' | 'Lax' | 'None';
2701
+ partitionKey?: string;
2702
+ }
2703
+ /**
2704
+ * Configuration for a browser-based host.
2705
+ *
2706
+ * Uses Playwright to launch a Chromium instance, inject auth state,
2707
+ * and execute a user-provided script that drives a web-based MCP host
2708
+ * (e.g., claude.ai).
2709
+ */
2710
+ interface BrowserConfig {
2711
+ /**
2712
+ * Path to the browser script (resolved relative to cwd).
2713
+ * The script must default-export an async function
2714
+ * `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
2715
+ */
2716
+ script: string;
2717
+ /**
2718
+ * Timeout in milliseconds for the browser script.
2719
+ * @default 120000 (2 minutes)
2720
+ */
2721
+ timeout?: number;
2722
+ /**
2723
+ * Whether to launch in headless mode.
2724
+ * @default true
2725
+ */
2726
+ headless?: boolean;
2727
+ /**
2728
+ * Path to a Playwright storage state JSON file (cookies + localStorage).
2729
+ * Resolved relative to cwd.
2730
+ */
2731
+ storageState?: string;
2732
+ /**
2733
+ * Extra cookies to inject into the browser context.
2734
+ */
2735
+ cookies?: BrowserCookie[];
2736
+ }
2687
2737
  /**
2688
2738
  * Configuration for MCP host simulation
2689
2739
  */
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
2729
2779
  * CLI host configuration (required for 'cli' host type).
2730
2780
  */
2731
2781
  cli?: CLIConfig;
2782
+ /**
2783
+ * Browser host configuration (required for 'browser' host type).
2784
+ */
2785
+ browser?: BrowserConfig;
2732
2786
  }
2733
2787
  /**
2734
2788
  * A tool call made by the LLM
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
2770
2824
  * (excludes LLM response time)
2771
2825
  */
2772
2826
  mcpDurationMs?: number;
2827
+ /**
2828
+ * Token usage from the LLM during simulation.
2829
+ * Populated by SDK-based hosts from the AI SDK response.
2830
+ */
2831
+ usage?: UsageMetrics;
2773
2832
  }
2774
2833
  /**
2775
2834
  * Interface for MCP host simulators.
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
3071
3130
  desktop: "desktop";
3072
3131
  }>>;
3073
3132
  provider: z.ZodOptional<z.ZodEnum<{
3074
- openai: "openai";
3075
3133
  anthropic: "anthropic";
3076
- azure: "azure";
3134
+ "vertex-anthropic": "vertex-anthropic";
3135
+ openai: "openai";
3077
3136
  google: "google";
3137
+ azure: "azure";
3078
3138
  mistral: "mistral";
3079
3139
  deepseek: "deepseek";
3080
3140
  openrouter: "openrouter";
3081
3141
  xai: "xai";
3082
- "vertex-anthropic": "vertex-anthropic";
3083
3142
  }>>;
3084
3143
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3085
3144
  model: z.ZodOptional<z.ZodString>;
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3136
3195
  threshold: z.ZodOptional<z.ZodNumber>;
3137
3196
  reps: z.ZodOptional<z.ZodNumber>;
3138
3197
  provider: z.ZodOptional<z.ZodEnum<{
3139
- openai: "openai";
3140
3198
  anthropic: "anthropic";
3141
- google: "google";
3142
3199
  "vertex-anthropic": "vertex-anthropic";
3143
3200
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3201
+ openai: "openai";
3202
+ google: "google";
3144
3203
  }>>;
3145
3204
  model: z.ZodOptional<z.ZodString>;
3146
3205
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3163
3222
  threshold: z.ZodOptional<z.ZodNumber>;
3164
3223
  reps: z.ZodOptional<z.ZodNumber>;
3165
3224
  provider: z.ZodOptional<z.ZodEnum<{
3166
- openai: "openai";
3167
3225
  anthropic: "anthropic";
3168
- google: "google";
3169
3226
  "vertex-anthropic": "vertex-anthropic";
3170
3227
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3228
+ openai: "openai";
3229
+ google: "google";
3171
3230
  }>>;
3172
3231
  model: z.ZodOptional<z.ZodString>;
3173
3232
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
3223
3282
  desktop: "desktop";
3224
3283
  }>>;
3225
3284
  provider: z.ZodOptional<z.ZodEnum<{
3226
- openai: "openai";
3227
3285
  anthropic: "anthropic";
3228
- azure: "azure";
3286
+ "vertex-anthropic": "vertex-anthropic";
3287
+ openai: "openai";
3229
3288
  google: "google";
3289
+ azure: "azure";
3230
3290
  mistral: "mistral";
3231
3291
  deepseek: "deepseek";
3232
3292
  openrouter: "openrouter";
3233
3293
  xai: "xai";
3234
- "vertex-anthropic": "vertex-anthropic";
3235
3294
  }>>;
3236
3295
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3237
3296
  model: z.ZodOptional<z.ZodString>;
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3288
3347
  threshold: z.ZodOptional<z.ZodNumber>;
3289
3348
  reps: z.ZodOptional<z.ZodNumber>;
3290
3349
  provider: z.ZodOptional<z.ZodEnum<{
3291
- openai: "openai";
3292
3350
  anthropic: "anthropic";
3293
- google: "google";
3294
3351
  "vertex-anthropic": "vertex-anthropic";
3295
3352
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3353
+ openai: "openai";
3354
+ google: "google";
3296
3355
  }>>;
3297
3356
  model: z.ZodOptional<z.ZodString>;
3298
3357
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3315
3374
  threshold: z.ZodOptional<z.ZodNumber>;
3316
3375
  reps: z.ZodOptional<z.ZodNumber>;
3317
3376
  provider: z.ZodOptional<z.ZodEnum<{
3318
- openai: "openai";
3319
3377
  anthropic: "anthropic";
3320
- google: "google";
3321
3378
  "vertex-anthropic": "vertex-anthropic";
3322
3379
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3380
+ openai: "openai";
3381
+ google: "google";
3323
3382
  }>>;
3324
3383
  model: z.ZodOptional<z.ZodString>;
3325
3384
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3562,6 +3621,8 @@ interface IterationResult {
3562
3621
  name: string;
3563
3622
  }>;
3564
3623
  };
3624
+ /** Token usage from mcp_host LLM simulation in this iteration */
3625
+ hostUsage?: UsageMetrics;
3565
3626
  }
3566
3627
  /**
3567
3628
  * Request data captured from the eval case input.
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
3710
3771
  name: string;
3711
3772
  }>;
3712
3773
  };
3774
+ /**
3775
+ * Aggregate token usage from mcp_host LLM simulation for this case.
3776
+ * Summed across all iterations. Only populated for mcp_host mode cases.
3777
+ */
3778
+ hostUsage?: UsageMetrics;
3713
3779
  }
3714
3780
  /**
3715
3781
  * Aggregated MCP eval run data
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
3759
3825
  * Expectation type breakdown
3760
3826
  */
3761
3827
  expectationBreakdown: ExpectationBreakdown;
3828
+ /**
3829
+ * Aggregate token usage from all mcp_host LLM simulations in this run.
3830
+ */
3831
+ totalHostUsage?: UsageMetrics;
3762
3832
  };
3763
3833
  /**
3764
3834
  * All eval results from this run
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
3873
3943
  * Experiment tracking metadata captured at run time.
3874
3944
  */
3875
3945
  metadata?: EvalRunMetadata;
3946
+ /**
3947
+ * Aggregate token usage from all mcp_host LLM simulations across all cases.
3948
+ */
3949
+ totalHostUsage?: UsageMetrics;
3876
3950
  }
3877
3951
  /**
3878
3952
  * Options for running eval dataset
package/dist/index.d.ts CHANGED
@@ -2684,6 +2684,56 @@ interface CLIConfig {
2684
2684
  */
2685
2685
  timeout?: number;
2686
2686
  }
2687
+ /**
2688
+ * A cookie to inject into the browser context before running the script.
2689
+ * Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
2690
+ */
2691
+ interface BrowserCookie {
2692
+ name: string;
2693
+ value: string;
2694
+ url?: string;
2695
+ domain?: string;
2696
+ path?: string;
2697
+ expires?: number;
2698
+ httpOnly?: boolean;
2699
+ secure?: boolean;
2700
+ sameSite?: 'Strict' | 'Lax' | 'None';
2701
+ partitionKey?: string;
2702
+ }
2703
+ /**
2704
+ * Configuration for a browser-based host.
2705
+ *
2706
+ * Uses Playwright to launch a Chromium instance, inject auth state,
2707
+ * and execute a user-provided script that drives a web-based MCP host
2708
+ * (e.g., claude.ai).
2709
+ */
2710
+ interface BrowserConfig {
2711
+ /**
2712
+ * Path to the browser script (resolved relative to cwd).
2713
+ * The script must default-export an async function
2714
+ * `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
2715
+ */
2716
+ script: string;
2717
+ /**
2718
+ * Timeout in milliseconds for the browser script.
2719
+ * @default 120000 (2 minutes)
2720
+ */
2721
+ timeout?: number;
2722
+ /**
2723
+ * Whether to launch in headless mode.
2724
+ * @default true
2725
+ */
2726
+ headless?: boolean;
2727
+ /**
2728
+ * Path to a Playwright storage state JSON file (cookies + localStorage).
2729
+ * Resolved relative to cwd.
2730
+ */
2731
+ storageState?: string;
2732
+ /**
2733
+ * Extra cookies to inject into the browser context.
2734
+ */
2735
+ cookies?: BrowserCookie[];
2736
+ }
2687
2737
  /**
2688
2738
  * Configuration for MCP host simulation
2689
2739
  */
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
2729
2779
  * CLI host configuration (required for 'cli' host type).
2730
2780
  */
2731
2781
  cli?: CLIConfig;
2782
+ /**
2783
+ * Browser host configuration (required for 'browser' host type).
2784
+ */
2785
+ browser?: BrowserConfig;
2732
2786
  }
2733
2787
  /**
2734
2788
  * A tool call made by the LLM
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
2770
2824
  * (excludes LLM response time)
2771
2825
  */
2772
2826
  mcpDurationMs?: number;
2827
+ /**
2828
+ * Token usage from the LLM during simulation.
2829
+ * Populated by SDK-based hosts from the AI SDK response.
2830
+ */
2831
+ usage?: UsageMetrics;
2773
2832
  }
2774
2833
  /**
2775
2834
  * Interface for MCP host simulators.
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
3071
3130
  desktop: "desktop";
3072
3131
  }>>;
3073
3132
  provider: z.ZodOptional<z.ZodEnum<{
3074
- openai: "openai";
3075
3133
  anthropic: "anthropic";
3076
- azure: "azure";
3134
+ "vertex-anthropic": "vertex-anthropic";
3135
+ openai: "openai";
3077
3136
  google: "google";
3137
+ azure: "azure";
3078
3138
  mistral: "mistral";
3079
3139
  deepseek: "deepseek";
3080
3140
  openrouter: "openrouter";
3081
3141
  xai: "xai";
3082
- "vertex-anthropic": "vertex-anthropic";
3083
3142
  }>>;
3084
3143
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3085
3144
  model: z.ZodOptional<z.ZodString>;
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3136
3195
  threshold: z.ZodOptional<z.ZodNumber>;
3137
3196
  reps: z.ZodOptional<z.ZodNumber>;
3138
3197
  provider: z.ZodOptional<z.ZodEnum<{
3139
- openai: "openai";
3140
3198
  anthropic: "anthropic";
3141
- google: "google";
3142
3199
  "vertex-anthropic": "vertex-anthropic";
3143
3200
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3201
+ openai: "openai";
3202
+ google: "google";
3144
3203
  }>>;
3145
3204
  model: z.ZodOptional<z.ZodString>;
3146
3205
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
3163
3222
  threshold: z.ZodOptional<z.ZodNumber>;
3164
3223
  reps: z.ZodOptional<z.ZodNumber>;
3165
3224
  provider: z.ZodOptional<z.ZodEnum<{
3166
- openai: "openai";
3167
3225
  anthropic: "anthropic";
3168
- google: "google";
3169
3226
  "vertex-anthropic": "vertex-anthropic";
3170
3227
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3228
+ openai: "openai";
3229
+ google: "google";
3171
3230
  }>>;
3172
3231
  model: z.ZodOptional<z.ZodString>;
3173
3232
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
3223
3282
  desktop: "desktop";
3224
3283
  }>>;
3225
3284
  provider: z.ZodOptional<z.ZodEnum<{
3226
- openai: "openai";
3227
3285
  anthropic: "anthropic";
3228
- azure: "azure";
3286
+ "vertex-anthropic": "vertex-anthropic";
3287
+ openai: "openai";
3229
3288
  google: "google";
3289
+ azure: "azure";
3230
3290
  mistral: "mistral";
3231
3291
  deepseek: "deepseek";
3232
3292
  openrouter: "openrouter";
3233
3293
  xai: "xai";
3234
- "vertex-anthropic": "vertex-anthropic";
3235
3294
  }>>;
3236
3295
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3237
3296
  model: z.ZodOptional<z.ZodString>;
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3288
3347
  threshold: z.ZodOptional<z.ZodNumber>;
3289
3348
  reps: z.ZodOptional<z.ZodNumber>;
3290
3349
  provider: z.ZodOptional<z.ZodEnum<{
3291
- openai: "openai";
3292
3350
  anthropic: "anthropic";
3293
- google: "google";
3294
3351
  "vertex-anthropic": "vertex-anthropic";
3295
3352
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3353
+ openai: "openai";
3354
+ google: "google";
3296
3355
  }>>;
3297
3356
  model: z.ZodOptional<z.ZodString>;
3298
3357
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
3315
3374
  threshold: z.ZodOptional<z.ZodNumber>;
3316
3375
  reps: z.ZodOptional<z.ZodNumber>;
3317
3376
  provider: z.ZodOptional<z.ZodEnum<{
3318
- openai: "openai";
3319
3377
  anthropic: "anthropic";
3320
- google: "google";
3321
3378
  "vertex-anthropic": "vertex-anthropic";
3322
3379
  "anthropic-agent-sdk": "anthropic-agent-sdk";
3380
+ openai: "openai";
3381
+ google: "google";
3323
3382
  }>>;
3324
3383
  model: z.ZodOptional<z.ZodString>;
3325
3384
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3562,6 +3621,8 @@ interface IterationResult {
3562
3621
  name: string;
3563
3622
  }>;
3564
3623
  };
3624
+ /** Token usage from mcp_host LLM simulation in this iteration */
3625
+ hostUsage?: UsageMetrics;
3565
3626
  }
3566
3627
  /**
3567
3628
  * Request data captured from the eval case input.
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
3710
3771
  name: string;
3711
3772
  }>;
3712
3773
  };
3774
+ /**
3775
+ * Aggregate token usage from mcp_host LLM simulation for this case.
3776
+ * Summed across all iterations. Only populated for mcp_host mode cases.
3777
+ */
3778
+ hostUsage?: UsageMetrics;
3713
3779
  }
3714
3780
  /**
3715
3781
  * Aggregated MCP eval run data
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
3759
3825
  * Expectation type breakdown
3760
3826
  */
3761
3827
  expectationBreakdown: ExpectationBreakdown;
3828
+ /**
3829
+ * Aggregate token usage from all mcp_host LLM simulations in this run.
3830
+ */
3831
+ totalHostUsage?: UsageMetrics;
3762
3832
  };
3763
3833
  /**
3764
3834
  * All eval results from this run
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
3873
3943
  * Experiment tracking metadata captured at run time.
3874
3944
  */
3875
3945
  metadata?: EvalRunMetadata;
3946
+ /**
3947
+ * Aggregate token usage from all mcp_host LLM simulations across all cases.
3948
+ */
3949
+ totalHostUsage?: UsageMetrics;
3876
3950
  }
3877
3951
  /**
3878
3952
  * Options for running eval dataset
package/dist/index.js CHANGED
@@ -4384,7 +4384,7 @@ function escapeHtml(text) {
4384
4384
 
4385
4385
  // package.json
4386
4386
  var package_default = {
4387
- version: "1.0.0"};
4387
+ version: "1.0.1-beta.0"};
4388
4388
 
4389
4389
  // src/mcp/clientFactory.ts
4390
4390
  function getRetryAfterDelayMs(err) {
@@ -6931,6 +6931,12 @@ function createVercelOrchestrator() {
6931
6931
  });
6932
6932
  const totalDurationMs = Date.now() - llmStart;
6933
6933
  const llmDurationMs = totalDurationMs - mcpDurationMs;
6934
+ const hostUsage = result.usage ? {
6935
+ inputTokens: result.usage.promptTokens ?? 0,
6936
+ outputTokens: result.usage.completionTokens ?? 0,
6937
+ totalCostUsd: 0,
6938
+ durationMs: llmDurationMs
6939
+ } : void 0;
6934
6940
  const conversationHistory = (result.steps ?? []).map((step) => ({
6935
6941
  role: step.toolCalls?.length > 0 ? "tool" : "assistant",
6936
6942
  content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6942,7 +6948,8 @@ function createVercelOrchestrator() {
6942
6948
  scenario,
6943
6949
  llmDurationMs,
6944
6950
  mcpDurationMs,
6945
- conversationHistory
6951
+ conversationHistory,
6952
+ usage: hostUsage
6946
6953
  };
6947
6954
  } catch (err) {
6948
6955
  return {
@@ -6960,6 +6967,7 @@ function parseStreamJson(stdout) {
6960
6967
  const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
6961
6968
  const toolCalls = [];
6962
6969
  const textParts = [];
6970
+ let usage;
6963
6971
  const conversationHistory = [];
6964
6972
  for (const line of lines) {
6965
6973
  let event;
@@ -6992,16 +7000,28 @@ function parseStreamJson(stdout) {
6992
7000
  }
6993
7001
  }
6994
7002
  }
6995
- if (event.type === "result" && typeof event.result === "string") {
6996
- if (textParts.length === 0) {
7003
+ if (event.type === "result") {
7004
+ if (typeof event.result === "string" && textParts.length === 0) {
6997
7005
  textParts.push(event.result);
6998
7006
  }
7007
+ if (event.usage) {
7008
+ usage = {
7009
+ inputTokens: event.usage.input_tokens ?? 0,
7010
+ outputTokens: event.usage.output_tokens ?? 0,
7011
+ totalCostUsd: event.total_cost_usd ?? 0,
7012
+ durationMs: event.duration_ms ?? 0,
7013
+ durationApiMs: event.duration_api_ms,
7014
+ cacheReadInputTokens: event.usage.cache_read_input_tokens,
7015
+ cacheCreationInputTokens: event.usage.cache_creation_input_tokens
7016
+ };
7017
+ }
6999
7018
  }
7000
7019
  if (event.type === "result" && event.is_error === true) {
7001
7020
  return {
7002
7021
  success: false,
7003
7022
  toolCalls,
7004
- error: typeof event.result === "string" ? event.result : "CLI host reported an error"
7023
+ error: typeof event.result === "string" ? event.result : "CLI host reported an error",
7024
+ usage
7005
7025
  };
7006
7026
  }
7007
7027
  }
@@ -7013,7 +7033,8 @@ function parseStreamJson(stdout) {
7013
7033
  success: true,
7014
7034
  toolCalls,
7015
7035
  response: response || void 0,
7016
- conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
7036
+ conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
7037
+ usage
7017
7038
  };
7018
7039
  }
7019
7040
  function createJsonParser(paths) {
@@ -7278,6 +7299,32 @@ async function execFileNoThrow(file, args) {
7278
7299
  }
7279
7300
  }
7280
7301
 
7302
+ // src/utils/usageUtils.ts
7303
+ function optionalSum(a, b) {
7304
+ if (a === void 0 && b === void 0) return void 0;
7305
+ return (a ?? 0) + (b ?? 0);
7306
+ }
7307
+ function sumUsage(a, b) {
7308
+ if (!a && !b) return void 0;
7309
+ if (!a) return b ? { ...b } : void 0;
7310
+ if (!b) return { ...a };
7311
+ return {
7312
+ inputTokens: a.inputTokens + b.inputTokens,
7313
+ outputTokens: a.outputTokens + b.outputTokens,
7314
+ totalCostUsd: a.totalCostUsd + b.totalCostUsd,
7315
+ durationMs: a.durationMs + b.durationMs,
7316
+ durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
7317
+ cacheReadInputTokens: optionalSum(
7318
+ a.cacheReadInputTokens,
7319
+ b.cacheReadInputTokens
7320
+ ),
7321
+ cacheCreationInputTokens: optionalSum(
7322
+ a.cacheCreationInputTokens,
7323
+ b.cacheCreationInputTokens
7324
+ )
7325
+ };
7326
+ }
7327
+
7281
7328
  // src/evals/evalRunner.ts
7282
7329
  async function executeToolCall(evalCase, mcp) {
7283
7330
  const mode = evalCase.mode || "direct";
@@ -7523,6 +7570,7 @@ async function runSingleIteration(evalCase, context, options) {
7523
7570
  };
7524
7571
  }
7525
7572
  }
7573
+ const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
7526
7574
  return {
7527
7575
  id: evalCase.id,
7528
7576
  datasetName: options.datasetName ?? "single-case",
@@ -7539,7 +7587,8 @@ async function runSingleIteration(evalCase, context, options) {
7539
7587
  tags: evalCase.tags,
7540
7588
  toolPrecision,
7541
7589
  toolRecall,
7542
- mcpHostTrace
7590
+ mcpHostTrace,
7591
+ hostUsage
7543
7592
  };
7544
7593
  }
7545
7594
  function isInfrastructureError(err) {
@@ -7575,7 +7624,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7575
7624
  durationMs: result.durationMs,
7576
7625
  error: result.error,
7577
7626
  isInfrastructureError: infraError,
7578
- mcpHostTrace: result.mcpHostTrace
7627
+ mcpHostTrace: result.mcpHostTrace,
7628
+ hostUsage: result.hostUsage
7579
7629
  });
7580
7630
  } catch (err) {
7581
7631
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7608,6 +7658,10 @@ async function runEvalCase(evalCase, context, options = {}) {
7608
7658
  durationMs: 0,
7609
7659
  tags: evalCase.tags
7610
7660
  };
7661
+ const totalHostUsage = iterationResults.reduce(
7662
+ (acc, r) => sumUsage(acc, r.hostUsage),
7663
+ void 0
7664
+ );
7611
7665
  return {
7612
7666
  ...baseResult,
7613
7667
  pass: assertionPassRate >= threshold,
@@ -7616,7 +7670,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7616
7670
  infrastructureErrorRate,
7617
7671
  iterationResults,
7618
7672
  infrastructureErrorCount: infraErrors.length,
7619
- durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7673
+ durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
7674
+ hostUsage: totalHostUsage
7620
7675
  };
7621
7676
  }
7622
7677
  function wilsonCI(k, n) {
@@ -7726,13 +7781,18 @@ async function runEvalDataset(options, context) {
7726
7781
  ...mcpHostModel !== void 0 && { mcpHostModel },
7727
7782
  ...judgeModel !== void 0 && { judgeModel }
7728
7783
  };
7784
+ const runHostUsage = caseResults.reduce(
7785
+ (acc, r) => sumUsage(acc, r.hostUsage),
7786
+ void 0
7787
+ );
7729
7788
  const result = {
7730
7789
  total,
7731
7790
  passed,
7732
7791
  failed: total - passed,
7733
7792
  caseResults,
7734
7793
  durationMs: Date.now() - startTime,
7735
- metadata
7794
+ metadata,
7795
+ totalHostUsage: runHostUsage
7736
7796
  };
7737
7797
  if (baselineResultsFrom) {
7738
7798
  try {