@gleanwork/mcp-server-tester 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.js +1 -1
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +52 -6
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +88 -14
- package/dist/index.d.ts +88 -14
- package/dist/index.js +52 -6
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +34 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +90 -0
- package/dist/reporters/mcpReporter.d.ts +90 -0
- package/dist/reporters/mcpReporter.js +34 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -2684,6 +2684,56 @@ interface CLIConfig {
|
|
|
2684
2684
|
*/
|
|
2685
2685
|
timeout?: number;
|
|
2686
2686
|
}
|
|
2687
|
+
/**
|
|
2688
|
+
* A cookie to inject into the browser context before running the script.
|
|
2689
|
+
* Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
|
|
2690
|
+
*/
|
|
2691
|
+
interface BrowserCookie {
|
|
2692
|
+
name: string;
|
|
2693
|
+
value: string;
|
|
2694
|
+
url?: string;
|
|
2695
|
+
domain?: string;
|
|
2696
|
+
path?: string;
|
|
2697
|
+
expires?: number;
|
|
2698
|
+
httpOnly?: boolean;
|
|
2699
|
+
secure?: boolean;
|
|
2700
|
+
sameSite?: 'Strict' | 'Lax' | 'None';
|
|
2701
|
+
partitionKey?: string;
|
|
2702
|
+
}
|
|
2703
|
+
/**
|
|
2704
|
+
* Configuration for a browser-based host.
|
|
2705
|
+
*
|
|
2706
|
+
* Uses Playwright to launch a Chromium instance, inject auth state,
|
|
2707
|
+
* and execute a user-provided script that drives a web-based MCP host
|
|
2708
|
+
* (e.g., claude.ai).
|
|
2709
|
+
*/
|
|
2710
|
+
interface BrowserConfig {
|
|
2711
|
+
/**
|
|
2712
|
+
* Path to the browser script (resolved relative to cwd).
|
|
2713
|
+
* The script must default-export an async function
|
|
2714
|
+
* `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
|
|
2715
|
+
*/
|
|
2716
|
+
script: string;
|
|
2717
|
+
/**
|
|
2718
|
+
* Timeout in milliseconds for the browser script.
|
|
2719
|
+
* @default 120000 (2 minutes)
|
|
2720
|
+
*/
|
|
2721
|
+
timeout?: number;
|
|
2722
|
+
/**
|
|
2723
|
+
* Whether to launch in headless mode.
|
|
2724
|
+
* @default true
|
|
2725
|
+
*/
|
|
2726
|
+
headless?: boolean;
|
|
2727
|
+
/**
|
|
2728
|
+
* Path to a Playwright storage state JSON file (cookies + localStorage).
|
|
2729
|
+
* Resolved relative to cwd.
|
|
2730
|
+
*/
|
|
2731
|
+
storageState?: string;
|
|
2732
|
+
/**
|
|
2733
|
+
* Extra cookies to inject into the browser context.
|
|
2734
|
+
*/
|
|
2735
|
+
cookies?: BrowserCookie[];
|
|
2736
|
+
}
|
|
2687
2737
|
/**
|
|
2688
2738
|
* Configuration for MCP host simulation
|
|
2689
2739
|
*/
|
|
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
|
|
|
2729
2779
|
* CLI host configuration (required for 'cli' host type).
|
|
2730
2780
|
*/
|
|
2731
2781
|
cli?: CLIConfig;
|
|
2782
|
+
/**
|
|
2783
|
+
* Browser host configuration (required for 'browser' host type).
|
|
2784
|
+
*/
|
|
2785
|
+
browser?: BrowserConfig;
|
|
2732
2786
|
}
|
|
2733
2787
|
/**
|
|
2734
2788
|
* A tool call made by the LLM
|
|
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
|
|
|
2770
2824
|
* (excludes LLM response time)
|
|
2771
2825
|
*/
|
|
2772
2826
|
mcpDurationMs?: number;
|
|
2827
|
+
/**
|
|
2828
|
+
* Token usage from the LLM during simulation.
|
|
2829
|
+
* Populated by SDK-based hosts from the AI SDK response.
|
|
2830
|
+
*/
|
|
2831
|
+
usage?: UsageMetrics;
|
|
2773
2832
|
}
|
|
2774
2833
|
/**
|
|
2775
2834
|
* Interface for MCP host simulators.
|
|
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3071
3130
|
desktop: "desktop";
|
|
3072
3131
|
}>>;
|
|
3073
3132
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3074
|
-
openai: "openai";
|
|
3075
3133
|
anthropic: "anthropic";
|
|
3076
|
-
azure: "azure";
|
|
3134
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3135
|
+
openai: "openai";
|
|
3077
3136
|
google: "google";
|
|
3137
|
+
azure: "azure";
|
|
3078
3138
|
mistral: "mistral";
|
|
3079
3139
|
deepseek: "deepseek";
|
|
3080
3140
|
openrouter: "openrouter";
|
|
3081
3141
|
xai: "xai";
|
|
3082
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3083
3142
|
}>>;
|
|
3084
3143
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3085
3144
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3136
3195
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3137
3196
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3138
3197
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3139
|
-
openai: "openai";
|
|
3140
3198
|
anthropic: "anthropic";
|
|
3141
|
-
google: "google";
|
|
3142
3199
|
"vertex-anthropic": "vertex-anthropic";
|
|
3143
3200
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3201
|
+
openai: "openai";
|
|
3202
|
+
google: "google";
|
|
3144
3203
|
}>>;
|
|
3145
3204
|
model: z.ZodOptional<z.ZodString>;
|
|
3146
3205
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3163
3222
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3164
3223
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3165
3224
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3166
|
-
openai: "openai";
|
|
3167
3225
|
anthropic: "anthropic";
|
|
3168
|
-
google: "google";
|
|
3169
3226
|
"vertex-anthropic": "vertex-anthropic";
|
|
3170
3227
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3228
|
+
openai: "openai";
|
|
3229
|
+
google: "google";
|
|
3171
3230
|
}>>;
|
|
3172
3231
|
model: z.ZodOptional<z.ZodString>;
|
|
3173
3232
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3223
3282
|
desktop: "desktop";
|
|
3224
3283
|
}>>;
|
|
3225
3284
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3226
|
-
openai: "openai";
|
|
3227
3285
|
anthropic: "anthropic";
|
|
3228
|
-
azure: "azure";
|
|
3286
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3287
|
+
openai: "openai";
|
|
3229
3288
|
google: "google";
|
|
3289
|
+
azure: "azure";
|
|
3230
3290
|
mistral: "mistral";
|
|
3231
3291
|
deepseek: "deepseek";
|
|
3232
3292
|
openrouter: "openrouter";
|
|
3233
3293
|
xai: "xai";
|
|
3234
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3235
3294
|
}>>;
|
|
3236
3295
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3237
3296
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3288
3347
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3289
3348
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3290
3349
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3291
|
-
openai: "openai";
|
|
3292
3350
|
anthropic: "anthropic";
|
|
3293
|
-
google: "google";
|
|
3294
3351
|
"vertex-anthropic": "vertex-anthropic";
|
|
3295
3352
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3353
|
+
openai: "openai";
|
|
3354
|
+
google: "google";
|
|
3296
3355
|
}>>;
|
|
3297
3356
|
model: z.ZodOptional<z.ZodString>;
|
|
3298
3357
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3315
3374
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3316
3375
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3317
3376
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3318
|
-
openai: "openai";
|
|
3319
3377
|
anthropic: "anthropic";
|
|
3320
|
-
google: "google";
|
|
3321
3378
|
"vertex-anthropic": "vertex-anthropic";
|
|
3322
3379
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3380
|
+
openai: "openai";
|
|
3381
|
+
google: "google";
|
|
3323
3382
|
}>>;
|
|
3324
3383
|
model: z.ZodOptional<z.ZodString>;
|
|
3325
3384
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3562,6 +3621,8 @@ interface IterationResult {
|
|
|
3562
3621
|
name: string;
|
|
3563
3622
|
}>;
|
|
3564
3623
|
};
|
|
3624
|
+
/** Token usage from mcp_host LLM simulation in this iteration */
|
|
3625
|
+
hostUsage?: UsageMetrics;
|
|
3565
3626
|
}
|
|
3566
3627
|
/**
|
|
3567
3628
|
* Request data captured from the eval case input.
|
|
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
|
|
|
3710
3771
|
name: string;
|
|
3711
3772
|
}>;
|
|
3712
3773
|
};
|
|
3774
|
+
/**
|
|
3775
|
+
* Aggregate token usage from mcp_host LLM simulation for this case.
|
|
3776
|
+
* Summed across all iterations. Only populated for mcp_host mode cases.
|
|
3777
|
+
*/
|
|
3778
|
+
hostUsage?: UsageMetrics;
|
|
3713
3779
|
}
|
|
3714
3780
|
/**
|
|
3715
3781
|
* Aggregated MCP eval run data
|
|
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
|
|
|
3759
3825
|
* Expectation type breakdown
|
|
3760
3826
|
*/
|
|
3761
3827
|
expectationBreakdown: ExpectationBreakdown;
|
|
3828
|
+
/**
|
|
3829
|
+
* Aggregate token usage from all mcp_host LLM simulations in this run.
|
|
3830
|
+
*/
|
|
3831
|
+
totalHostUsage?: UsageMetrics;
|
|
3762
3832
|
};
|
|
3763
3833
|
/**
|
|
3764
3834
|
* All eval results from this run
|
|
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
|
|
|
3873
3943
|
* Experiment tracking metadata captured at run time.
|
|
3874
3944
|
*/
|
|
3875
3945
|
metadata?: EvalRunMetadata;
|
|
3946
|
+
/**
|
|
3947
|
+
* Aggregate token usage from all mcp_host LLM simulations across all cases.
|
|
3948
|
+
*/
|
|
3949
|
+
totalHostUsage?: UsageMetrics;
|
|
3876
3950
|
}
|
|
3877
3951
|
/**
|
|
3878
3952
|
* Options for running eval dataset
|
package/dist/index.d.ts
CHANGED
|
@@ -2684,6 +2684,56 @@ interface CLIConfig {
|
|
|
2684
2684
|
*/
|
|
2685
2685
|
timeout?: number;
|
|
2686
2686
|
}
|
|
2687
|
+
/**
|
|
2688
|
+
* A cookie to inject into the browser context before running the script.
|
|
2689
|
+
* Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
|
|
2690
|
+
*/
|
|
2691
|
+
interface BrowserCookie {
|
|
2692
|
+
name: string;
|
|
2693
|
+
value: string;
|
|
2694
|
+
url?: string;
|
|
2695
|
+
domain?: string;
|
|
2696
|
+
path?: string;
|
|
2697
|
+
expires?: number;
|
|
2698
|
+
httpOnly?: boolean;
|
|
2699
|
+
secure?: boolean;
|
|
2700
|
+
sameSite?: 'Strict' | 'Lax' | 'None';
|
|
2701
|
+
partitionKey?: string;
|
|
2702
|
+
}
|
|
2703
|
+
/**
|
|
2704
|
+
* Configuration for a browser-based host.
|
|
2705
|
+
*
|
|
2706
|
+
* Uses Playwright to launch a Chromium instance, inject auth state,
|
|
2707
|
+
* and execute a user-provided script that drives a web-based MCP host
|
|
2708
|
+
* (e.g., claude.ai).
|
|
2709
|
+
*/
|
|
2710
|
+
interface BrowserConfig {
|
|
2711
|
+
/**
|
|
2712
|
+
* Path to the browser script (resolved relative to cwd).
|
|
2713
|
+
* The script must default-export an async function
|
|
2714
|
+
* `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
|
|
2715
|
+
*/
|
|
2716
|
+
script: string;
|
|
2717
|
+
/**
|
|
2718
|
+
* Timeout in milliseconds for the browser script.
|
|
2719
|
+
* @default 120000 (2 minutes)
|
|
2720
|
+
*/
|
|
2721
|
+
timeout?: number;
|
|
2722
|
+
/**
|
|
2723
|
+
* Whether to launch in headless mode.
|
|
2724
|
+
* @default true
|
|
2725
|
+
*/
|
|
2726
|
+
headless?: boolean;
|
|
2727
|
+
/**
|
|
2728
|
+
* Path to a Playwright storage state JSON file (cookies + localStorage).
|
|
2729
|
+
* Resolved relative to cwd.
|
|
2730
|
+
*/
|
|
2731
|
+
storageState?: string;
|
|
2732
|
+
/**
|
|
2733
|
+
* Extra cookies to inject into the browser context.
|
|
2734
|
+
*/
|
|
2735
|
+
cookies?: BrowserCookie[];
|
|
2736
|
+
}
|
|
2687
2737
|
/**
|
|
2688
2738
|
* Configuration for MCP host simulation
|
|
2689
2739
|
*/
|
|
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
|
|
|
2729
2779
|
* CLI host configuration (required for 'cli' host type).
|
|
2730
2780
|
*/
|
|
2731
2781
|
cli?: CLIConfig;
|
|
2782
|
+
/**
|
|
2783
|
+
* Browser host configuration (required for 'browser' host type).
|
|
2784
|
+
*/
|
|
2785
|
+
browser?: BrowserConfig;
|
|
2732
2786
|
}
|
|
2733
2787
|
/**
|
|
2734
2788
|
* A tool call made by the LLM
|
|
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
|
|
|
2770
2824
|
* (excludes LLM response time)
|
|
2771
2825
|
*/
|
|
2772
2826
|
mcpDurationMs?: number;
|
|
2827
|
+
/**
|
|
2828
|
+
* Token usage from the LLM during simulation.
|
|
2829
|
+
* Populated by SDK-based hosts from the AI SDK response.
|
|
2830
|
+
*/
|
|
2831
|
+
usage?: UsageMetrics;
|
|
2773
2832
|
}
|
|
2774
2833
|
/**
|
|
2775
2834
|
* Interface for MCP host simulators.
|
|
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3071
3130
|
desktop: "desktop";
|
|
3072
3131
|
}>>;
|
|
3073
3132
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3074
|
-
openai: "openai";
|
|
3075
3133
|
anthropic: "anthropic";
|
|
3076
|
-
azure: "azure";
|
|
3134
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3135
|
+
openai: "openai";
|
|
3077
3136
|
google: "google";
|
|
3137
|
+
azure: "azure";
|
|
3078
3138
|
mistral: "mistral";
|
|
3079
3139
|
deepseek: "deepseek";
|
|
3080
3140
|
openrouter: "openrouter";
|
|
3081
3141
|
xai: "xai";
|
|
3082
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3083
3142
|
}>>;
|
|
3084
3143
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3085
3144
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3136
3195
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3137
3196
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3138
3197
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3139
|
-
openai: "openai";
|
|
3140
3198
|
anthropic: "anthropic";
|
|
3141
|
-
google: "google";
|
|
3142
3199
|
"vertex-anthropic": "vertex-anthropic";
|
|
3143
3200
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3201
|
+
openai: "openai";
|
|
3202
|
+
google: "google";
|
|
3144
3203
|
}>>;
|
|
3145
3204
|
model: z.ZodOptional<z.ZodString>;
|
|
3146
3205
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3163
3222
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3164
3223
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3165
3224
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3166
|
-
openai: "openai";
|
|
3167
3225
|
anthropic: "anthropic";
|
|
3168
|
-
google: "google";
|
|
3169
3226
|
"vertex-anthropic": "vertex-anthropic";
|
|
3170
3227
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3228
|
+
openai: "openai";
|
|
3229
|
+
google: "google";
|
|
3171
3230
|
}>>;
|
|
3172
3231
|
model: z.ZodOptional<z.ZodString>;
|
|
3173
3232
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3223
3282
|
desktop: "desktop";
|
|
3224
3283
|
}>>;
|
|
3225
3284
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3226
|
-
openai: "openai";
|
|
3227
3285
|
anthropic: "anthropic";
|
|
3228
|
-
azure: "azure";
|
|
3286
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3287
|
+
openai: "openai";
|
|
3229
3288
|
google: "google";
|
|
3289
|
+
azure: "azure";
|
|
3230
3290
|
mistral: "mistral";
|
|
3231
3291
|
deepseek: "deepseek";
|
|
3232
3292
|
openrouter: "openrouter";
|
|
3233
3293
|
xai: "xai";
|
|
3234
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3235
3294
|
}>>;
|
|
3236
3295
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3237
3296
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3288
3347
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3289
3348
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3290
3349
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3291
|
-
openai: "openai";
|
|
3292
3350
|
anthropic: "anthropic";
|
|
3293
|
-
google: "google";
|
|
3294
3351
|
"vertex-anthropic": "vertex-anthropic";
|
|
3295
3352
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3353
|
+
openai: "openai";
|
|
3354
|
+
google: "google";
|
|
3296
3355
|
}>>;
|
|
3297
3356
|
model: z.ZodOptional<z.ZodString>;
|
|
3298
3357
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3315
3374
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3316
3375
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3317
3376
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3318
|
-
openai: "openai";
|
|
3319
3377
|
anthropic: "anthropic";
|
|
3320
|
-
google: "google";
|
|
3321
3378
|
"vertex-anthropic": "vertex-anthropic";
|
|
3322
3379
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3380
|
+
openai: "openai";
|
|
3381
|
+
google: "google";
|
|
3323
3382
|
}>>;
|
|
3324
3383
|
model: z.ZodOptional<z.ZodString>;
|
|
3325
3384
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3562,6 +3621,8 @@ interface IterationResult {
|
|
|
3562
3621
|
name: string;
|
|
3563
3622
|
}>;
|
|
3564
3623
|
};
|
|
3624
|
+
/** Token usage from mcp_host LLM simulation in this iteration */
|
|
3625
|
+
hostUsage?: UsageMetrics;
|
|
3565
3626
|
}
|
|
3566
3627
|
/**
|
|
3567
3628
|
* Request data captured from the eval case input.
|
|
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
|
|
|
3710
3771
|
name: string;
|
|
3711
3772
|
}>;
|
|
3712
3773
|
};
|
|
3774
|
+
/**
|
|
3775
|
+
* Aggregate token usage from mcp_host LLM simulation for this case.
|
|
3776
|
+
* Summed across all iterations. Only populated for mcp_host mode cases.
|
|
3777
|
+
*/
|
|
3778
|
+
hostUsage?: UsageMetrics;
|
|
3713
3779
|
}
|
|
3714
3780
|
/**
|
|
3715
3781
|
* Aggregated MCP eval run data
|
|
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
|
|
|
3759
3825
|
* Expectation type breakdown
|
|
3760
3826
|
*/
|
|
3761
3827
|
expectationBreakdown: ExpectationBreakdown;
|
|
3828
|
+
/**
|
|
3829
|
+
* Aggregate token usage from all mcp_host LLM simulations in this run.
|
|
3830
|
+
*/
|
|
3831
|
+
totalHostUsage?: UsageMetrics;
|
|
3762
3832
|
};
|
|
3763
3833
|
/**
|
|
3764
3834
|
* All eval results from this run
|
|
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
|
|
|
3873
3943
|
* Experiment tracking metadata captured at run time.
|
|
3874
3944
|
*/
|
|
3875
3945
|
metadata?: EvalRunMetadata;
|
|
3946
|
+
/**
|
|
3947
|
+
* Aggregate token usage from all mcp_host LLM simulations across all cases.
|
|
3948
|
+
*/
|
|
3949
|
+
totalHostUsage?: UsageMetrics;
|
|
3876
3950
|
}
|
|
3877
3951
|
/**
|
|
3878
3952
|
* Options for running eval dataset
|
package/dist/index.js
CHANGED
|
@@ -4384,7 +4384,7 @@ function escapeHtml(text) {
|
|
|
4384
4384
|
|
|
4385
4385
|
// package.json
|
|
4386
4386
|
var package_default = {
|
|
4387
|
-
version: "1.0.0"};
|
|
4387
|
+
version: "1.0.1"};
|
|
4388
4388
|
|
|
4389
4389
|
// src/mcp/clientFactory.ts
|
|
4390
4390
|
function getRetryAfterDelayMs(err) {
|
|
@@ -6931,6 +6931,12 @@ function createVercelOrchestrator() {
|
|
|
6931
6931
|
});
|
|
6932
6932
|
const totalDurationMs = Date.now() - llmStart;
|
|
6933
6933
|
const llmDurationMs = totalDurationMs - mcpDurationMs;
|
|
6934
|
+
const hostUsage = result.usage ? {
|
|
6935
|
+
inputTokens: result.usage.promptTokens ?? 0,
|
|
6936
|
+
outputTokens: result.usage.completionTokens ?? 0,
|
|
6937
|
+
totalCostUsd: 0,
|
|
6938
|
+
durationMs: llmDurationMs
|
|
6939
|
+
} : void 0;
|
|
6934
6940
|
const conversationHistory = (result.steps ?? []).map((step) => ({
|
|
6935
6941
|
role: step.toolCalls?.length > 0 ? "tool" : "assistant",
|
|
6936
6942
|
content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
|
|
@@ -6942,7 +6948,8 @@ function createVercelOrchestrator() {
|
|
|
6942
6948
|
scenario,
|
|
6943
6949
|
llmDurationMs,
|
|
6944
6950
|
mcpDurationMs,
|
|
6945
|
-
conversationHistory
|
|
6951
|
+
conversationHistory,
|
|
6952
|
+
usage: hostUsage
|
|
6946
6953
|
};
|
|
6947
6954
|
} catch (err) {
|
|
6948
6955
|
return {
|
|
@@ -7278,6 +7285,32 @@ async function execFileNoThrow(file, args) {
|
|
|
7278
7285
|
}
|
|
7279
7286
|
}
|
|
7280
7287
|
|
|
7288
|
+
// src/utils/usageUtils.ts
|
|
7289
|
+
function optionalSum(a, b) {
|
|
7290
|
+
if (a === void 0 && b === void 0) return void 0;
|
|
7291
|
+
return (a ?? 0) + (b ?? 0);
|
|
7292
|
+
}
|
|
7293
|
+
function sumUsage(a, b) {
|
|
7294
|
+
if (!a && !b) return void 0;
|
|
7295
|
+
if (!a) return b ? { ...b } : void 0;
|
|
7296
|
+
if (!b) return { ...a };
|
|
7297
|
+
return {
|
|
7298
|
+
inputTokens: a.inputTokens + b.inputTokens,
|
|
7299
|
+
outputTokens: a.outputTokens + b.outputTokens,
|
|
7300
|
+
totalCostUsd: a.totalCostUsd + b.totalCostUsd,
|
|
7301
|
+
durationMs: a.durationMs + b.durationMs,
|
|
7302
|
+
durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
|
|
7303
|
+
cacheReadInputTokens: optionalSum(
|
|
7304
|
+
a.cacheReadInputTokens,
|
|
7305
|
+
b.cacheReadInputTokens
|
|
7306
|
+
),
|
|
7307
|
+
cacheCreationInputTokens: optionalSum(
|
|
7308
|
+
a.cacheCreationInputTokens,
|
|
7309
|
+
b.cacheCreationInputTokens
|
|
7310
|
+
)
|
|
7311
|
+
};
|
|
7312
|
+
}
|
|
7313
|
+
|
|
7281
7314
|
// src/evals/evalRunner.ts
|
|
7282
7315
|
async function executeToolCall(evalCase, mcp) {
|
|
7283
7316
|
const mode = evalCase.mode || "direct";
|
|
@@ -7523,6 +7556,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7523
7556
|
};
|
|
7524
7557
|
}
|
|
7525
7558
|
}
|
|
7559
|
+
const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
|
|
7526
7560
|
return {
|
|
7527
7561
|
id: evalCase.id,
|
|
7528
7562
|
datasetName: options.datasetName ?? "single-case",
|
|
@@ -7539,7 +7573,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7539
7573
|
tags: evalCase.tags,
|
|
7540
7574
|
toolPrecision,
|
|
7541
7575
|
toolRecall,
|
|
7542
|
-
mcpHostTrace
|
|
7576
|
+
mcpHostTrace,
|
|
7577
|
+
hostUsage
|
|
7543
7578
|
};
|
|
7544
7579
|
}
|
|
7545
7580
|
function isInfrastructureError(err) {
|
|
@@ -7575,7 +7610,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7575
7610
|
durationMs: result.durationMs,
|
|
7576
7611
|
error: result.error,
|
|
7577
7612
|
isInfrastructureError: infraError,
|
|
7578
|
-
mcpHostTrace: result.mcpHostTrace
|
|
7613
|
+
mcpHostTrace: result.mcpHostTrace,
|
|
7614
|
+
hostUsage: result.hostUsage
|
|
7579
7615
|
});
|
|
7580
7616
|
} catch (err) {
|
|
7581
7617
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -7608,6 +7644,10 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7608
7644
|
durationMs: 0,
|
|
7609
7645
|
tags: evalCase.tags
|
|
7610
7646
|
};
|
|
7647
|
+
const totalHostUsage = iterationResults.reduce(
|
|
7648
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7649
|
+
void 0
|
|
7650
|
+
);
|
|
7611
7651
|
return {
|
|
7612
7652
|
...baseResult,
|
|
7613
7653
|
pass: assertionPassRate >= threshold,
|
|
@@ -7616,7 +7656,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7616
7656
|
infrastructureErrorRate,
|
|
7617
7657
|
iterationResults,
|
|
7618
7658
|
infrastructureErrorCount: infraErrors.length,
|
|
7619
|
-
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7659
|
+
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
|
|
7660
|
+
hostUsage: totalHostUsage
|
|
7620
7661
|
};
|
|
7621
7662
|
}
|
|
7622
7663
|
function wilsonCI(k, n) {
|
|
@@ -7726,13 +7767,18 @@ async function runEvalDataset(options, context) {
|
|
|
7726
7767
|
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7727
7768
|
...judgeModel !== void 0 && { judgeModel }
|
|
7728
7769
|
};
|
|
7770
|
+
const runHostUsage = caseResults.reduce(
|
|
7771
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7772
|
+
void 0
|
|
7773
|
+
);
|
|
7729
7774
|
const result = {
|
|
7730
7775
|
total,
|
|
7731
7776
|
passed,
|
|
7732
7777
|
failed: total - passed,
|
|
7733
7778
|
caseResults,
|
|
7734
7779
|
durationMs: Date.now() - startTime,
|
|
7735
|
-
metadata
|
|
7780
|
+
metadata,
|
|
7781
|
+
totalHostUsage: runHostUsage
|
|
7736
7782
|
};
|
|
7737
7783
|
if (baselineResultsFrom) {
|
|
7738
7784
|
try {
|