@gleanwork/mcp-server-tester 1.0.0 → 1.0.1-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.js +1 -1
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +70 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +88 -14
- package/dist/index.d.ts +88 -14
- package/dist/index.js +70 -10
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +34 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +90 -0
- package/dist/reporters/mcpReporter.d.ts +90 -0
- package/dist/reporters/mcpReporter.js +34 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -2684,6 +2684,56 @@ interface CLIConfig {
|
|
|
2684
2684
|
*/
|
|
2685
2685
|
timeout?: number;
|
|
2686
2686
|
}
|
|
2687
|
+
/**
|
|
2688
|
+
* A cookie to inject into the browser context before running the script.
|
|
2689
|
+
* Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
|
|
2690
|
+
*/
|
|
2691
|
+
interface BrowserCookie {
|
|
2692
|
+
name: string;
|
|
2693
|
+
value: string;
|
|
2694
|
+
url?: string;
|
|
2695
|
+
domain?: string;
|
|
2696
|
+
path?: string;
|
|
2697
|
+
expires?: number;
|
|
2698
|
+
httpOnly?: boolean;
|
|
2699
|
+
secure?: boolean;
|
|
2700
|
+
sameSite?: 'Strict' | 'Lax' | 'None';
|
|
2701
|
+
partitionKey?: string;
|
|
2702
|
+
}
|
|
2703
|
+
/**
|
|
2704
|
+
* Configuration for a browser-based host.
|
|
2705
|
+
*
|
|
2706
|
+
* Uses Playwright to launch a Chromium instance, inject auth state,
|
|
2707
|
+
* and execute a user-provided script that drives a web-based MCP host
|
|
2708
|
+
* (e.g., claude.ai).
|
|
2709
|
+
*/
|
|
2710
|
+
interface BrowserConfig {
|
|
2711
|
+
/**
|
|
2712
|
+
* Path to the browser script (resolved relative to cwd).
|
|
2713
|
+
* The script must default-export an async function
|
|
2714
|
+
* `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
|
|
2715
|
+
*/
|
|
2716
|
+
script: string;
|
|
2717
|
+
/**
|
|
2718
|
+
* Timeout in milliseconds for the browser script.
|
|
2719
|
+
* @default 120000 (2 minutes)
|
|
2720
|
+
*/
|
|
2721
|
+
timeout?: number;
|
|
2722
|
+
/**
|
|
2723
|
+
* Whether to launch in headless mode.
|
|
2724
|
+
* @default true
|
|
2725
|
+
*/
|
|
2726
|
+
headless?: boolean;
|
|
2727
|
+
/**
|
|
2728
|
+
* Path to a Playwright storage state JSON file (cookies + localStorage).
|
|
2729
|
+
* Resolved relative to cwd.
|
|
2730
|
+
*/
|
|
2731
|
+
storageState?: string;
|
|
2732
|
+
/**
|
|
2733
|
+
* Extra cookies to inject into the browser context.
|
|
2734
|
+
*/
|
|
2735
|
+
cookies?: BrowserCookie[];
|
|
2736
|
+
}
|
|
2687
2737
|
/**
|
|
2688
2738
|
* Configuration for MCP host simulation
|
|
2689
2739
|
*/
|
|
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
|
|
|
2729
2779
|
* CLI host configuration (required for 'cli' host type).
|
|
2730
2780
|
*/
|
|
2731
2781
|
cli?: CLIConfig;
|
|
2782
|
+
/**
|
|
2783
|
+
* Browser host configuration (required for 'browser' host type).
|
|
2784
|
+
*/
|
|
2785
|
+
browser?: BrowserConfig;
|
|
2732
2786
|
}
|
|
2733
2787
|
/**
|
|
2734
2788
|
* A tool call made by the LLM
|
|
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
|
|
|
2770
2824
|
* (excludes LLM response time)
|
|
2771
2825
|
*/
|
|
2772
2826
|
mcpDurationMs?: number;
|
|
2827
|
+
/**
|
|
2828
|
+
* Token usage from the LLM during simulation.
|
|
2829
|
+
* Populated by SDK-based hosts from the AI SDK response.
|
|
2830
|
+
*/
|
|
2831
|
+
usage?: UsageMetrics;
|
|
2773
2832
|
}
|
|
2774
2833
|
/**
|
|
2775
2834
|
* Interface for MCP host simulators.
|
|
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3071
3130
|
desktop: "desktop";
|
|
3072
3131
|
}>>;
|
|
3073
3132
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3074
|
-
openai: "openai";
|
|
3075
3133
|
anthropic: "anthropic";
|
|
3076
|
-
|
|
3134
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3135
|
+
openai: "openai";
|
|
3077
3136
|
google: "google";
|
|
3137
|
+
azure: "azure";
|
|
3078
3138
|
mistral: "mistral";
|
|
3079
3139
|
deepseek: "deepseek";
|
|
3080
3140
|
openrouter: "openrouter";
|
|
3081
3141
|
xai: "xai";
|
|
3082
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3083
3142
|
}>>;
|
|
3084
3143
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3085
3144
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3136
3195
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3137
3196
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3138
3197
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3139
|
-
openai: "openai";
|
|
3140
3198
|
anthropic: "anthropic";
|
|
3141
|
-
google: "google";
|
|
3142
3199
|
"vertex-anthropic": "vertex-anthropic";
|
|
3143
3200
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3201
|
+
openai: "openai";
|
|
3202
|
+
google: "google";
|
|
3144
3203
|
}>>;
|
|
3145
3204
|
model: z.ZodOptional<z.ZodString>;
|
|
3146
3205
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3163
3222
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3164
3223
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3165
3224
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3166
|
-
openai: "openai";
|
|
3167
3225
|
anthropic: "anthropic";
|
|
3168
|
-
google: "google";
|
|
3169
3226
|
"vertex-anthropic": "vertex-anthropic";
|
|
3170
3227
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3228
|
+
openai: "openai";
|
|
3229
|
+
google: "google";
|
|
3171
3230
|
}>>;
|
|
3172
3231
|
model: z.ZodOptional<z.ZodString>;
|
|
3173
3232
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3223
3282
|
desktop: "desktop";
|
|
3224
3283
|
}>>;
|
|
3225
3284
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3226
|
-
openai: "openai";
|
|
3227
3285
|
anthropic: "anthropic";
|
|
3228
|
-
|
|
3286
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3287
|
+
openai: "openai";
|
|
3229
3288
|
google: "google";
|
|
3289
|
+
azure: "azure";
|
|
3230
3290
|
mistral: "mistral";
|
|
3231
3291
|
deepseek: "deepseek";
|
|
3232
3292
|
openrouter: "openrouter";
|
|
3233
3293
|
xai: "xai";
|
|
3234
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3235
3294
|
}>>;
|
|
3236
3295
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3237
3296
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3288
3347
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3289
3348
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3290
3349
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3291
|
-
openai: "openai";
|
|
3292
3350
|
anthropic: "anthropic";
|
|
3293
|
-
google: "google";
|
|
3294
3351
|
"vertex-anthropic": "vertex-anthropic";
|
|
3295
3352
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3353
|
+
openai: "openai";
|
|
3354
|
+
google: "google";
|
|
3296
3355
|
}>>;
|
|
3297
3356
|
model: z.ZodOptional<z.ZodString>;
|
|
3298
3357
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3315
3374
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3316
3375
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3317
3376
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3318
|
-
openai: "openai";
|
|
3319
3377
|
anthropic: "anthropic";
|
|
3320
|
-
google: "google";
|
|
3321
3378
|
"vertex-anthropic": "vertex-anthropic";
|
|
3322
3379
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3380
|
+
openai: "openai";
|
|
3381
|
+
google: "google";
|
|
3323
3382
|
}>>;
|
|
3324
3383
|
model: z.ZodOptional<z.ZodString>;
|
|
3325
3384
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3562,6 +3621,8 @@ interface IterationResult {
|
|
|
3562
3621
|
name: string;
|
|
3563
3622
|
}>;
|
|
3564
3623
|
};
|
|
3624
|
+
/** Token usage from mcp_host LLM simulation in this iteration */
|
|
3625
|
+
hostUsage?: UsageMetrics;
|
|
3565
3626
|
}
|
|
3566
3627
|
/**
|
|
3567
3628
|
* Request data captured from the eval case input.
|
|
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
|
|
|
3710
3771
|
name: string;
|
|
3711
3772
|
}>;
|
|
3712
3773
|
};
|
|
3774
|
+
/**
|
|
3775
|
+
* Aggregate token usage from mcp_host LLM simulation for this case.
|
|
3776
|
+
* Summed across all iterations. Only populated for mcp_host mode cases.
|
|
3777
|
+
*/
|
|
3778
|
+
hostUsage?: UsageMetrics;
|
|
3713
3779
|
}
|
|
3714
3780
|
/**
|
|
3715
3781
|
* Aggregated MCP eval run data
|
|
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
|
|
|
3759
3825
|
* Expectation type breakdown
|
|
3760
3826
|
*/
|
|
3761
3827
|
expectationBreakdown: ExpectationBreakdown;
|
|
3828
|
+
/**
|
|
3829
|
+
* Aggregate token usage from all mcp_host LLM simulations in this run.
|
|
3830
|
+
*/
|
|
3831
|
+
totalHostUsage?: UsageMetrics;
|
|
3762
3832
|
};
|
|
3763
3833
|
/**
|
|
3764
3834
|
* All eval results from this run
|
|
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
|
|
|
3873
3943
|
* Experiment tracking metadata captured at run time.
|
|
3874
3944
|
*/
|
|
3875
3945
|
metadata?: EvalRunMetadata;
|
|
3946
|
+
/**
|
|
3947
|
+
* Aggregate token usage from all mcp_host LLM simulations across all cases.
|
|
3948
|
+
*/
|
|
3949
|
+
totalHostUsage?: UsageMetrics;
|
|
3876
3950
|
}
|
|
3877
3951
|
/**
|
|
3878
3952
|
* Options for running eval dataset
|
package/dist/index.d.ts
CHANGED
|
@@ -2684,6 +2684,56 @@ interface CLIConfig {
|
|
|
2684
2684
|
*/
|
|
2685
2685
|
timeout?: number;
|
|
2686
2686
|
}
|
|
2687
|
+
/**
|
|
2688
|
+
* A cookie to inject into the browser context before running the script.
|
|
2689
|
+
* Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
|
|
2690
|
+
*/
|
|
2691
|
+
interface BrowserCookie {
|
|
2692
|
+
name: string;
|
|
2693
|
+
value: string;
|
|
2694
|
+
url?: string;
|
|
2695
|
+
domain?: string;
|
|
2696
|
+
path?: string;
|
|
2697
|
+
expires?: number;
|
|
2698
|
+
httpOnly?: boolean;
|
|
2699
|
+
secure?: boolean;
|
|
2700
|
+
sameSite?: 'Strict' | 'Lax' | 'None';
|
|
2701
|
+
partitionKey?: string;
|
|
2702
|
+
}
|
|
2703
|
+
/**
|
|
2704
|
+
* Configuration for a browser-based host.
|
|
2705
|
+
*
|
|
2706
|
+
* Uses Playwright to launch a Chromium instance, inject auth state,
|
|
2707
|
+
* and execute a user-provided script that drives a web-based MCP host
|
|
2708
|
+
* (e.g., claude.ai).
|
|
2709
|
+
*/
|
|
2710
|
+
interface BrowserConfig {
|
|
2711
|
+
/**
|
|
2712
|
+
* Path to the browser script (resolved relative to cwd).
|
|
2713
|
+
* The script must default-export an async function
|
|
2714
|
+
* `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
|
|
2715
|
+
*/
|
|
2716
|
+
script: string;
|
|
2717
|
+
/**
|
|
2718
|
+
* Timeout in milliseconds for the browser script.
|
|
2719
|
+
* @default 120000 (2 minutes)
|
|
2720
|
+
*/
|
|
2721
|
+
timeout?: number;
|
|
2722
|
+
/**
|
|
2723
|
+
* Whether to launch in headless mode.
|
|
2724
|
+
* @default true
|
|
2725
|
+
*/
|
|
2726
|
+
headless?: boolean;
|
|
2727
|
+
/**
|
|
2728
|
+
* Path to a Playwright storage state JSON file (cookies + localStorage).
|
|
2729
|
+
* Resolved relative to cwd.
|
|
2730
|
+
*/
|
|
2731
|
+
storageState?: string;
|
|
2732
|
+
/**
|
|
2733
|
+
* Extra cookies to inject into the browser context.
|
|
2734
|
+
*/
|
|
2735
|
+
cookies?: BrowserCookie[];
|
|
2736
|
+
}
|
|
2687
2737
|
/**
|
|
2688
2738
|
* Configuration for MCP host simulation
|
|
2689
2739
|
*/
|
|
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
|
|
|
2729
2779
|
* CLI host configuration (required for 'cli' host type).
|
|
2730
2780
|
*/
|
|
2731
2781
|
cli?: CLIConfig;
|
|
2782
|
+
/**
|
|
2783
|
+
* Browser host configuration (required for 'browser' host type).
|
|
2784
|
+
*/
|
|
2785
|
+
browser?: BrowserConfig;
|
|
2732
2786
|
}
|
|
2733
2787
|
/**
|
|
2734
2788
|
* A tool call made by the LLM
|
|
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
|
|
|
2770
2824
|
* (excludes LLM response time)
|
|
2771
2825
|
*/
|
|
2772
2826
|
mcpDurationMs?: number;
|
|
2827
|
+
/**
|
|
2828
|
+
* Token usage from the LLM during simulation.
|
|
2829
|
+
* Populated by SDK-based hosts from the AI SDK response.
|
|
2830
|
+
*/
|
|
2831
|
+
usage?: UsageMetrics;
|
|
2773
2832
|
}
|
|
2774
2833
|
/**
|
|
2775
2834
|
* Interface for MCP host simulators.
|
|
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3071
3130
|
desktop: "desktop";
|
|
3072
3131
|
}>>;
|
|
3073
3132
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3074
|
-
openai: "openai";
|
|
3075
3133
|
anthropic: "anthropic";
|
|
3076
|
-
|
|
3134
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3135
|
+
openai: "openai";
|
|
3077
3136
|
google: "google";
|
|
3137
|
+
azure: "azure";
|
|
3078
3138
|
mistral: "mistral";
|
|
3079
3139
|
deepseek: "deepseek";
|
|
3080
3140
|
openrouter: "openrouter";
|
|
3081
3141
|
xai: "xai";
|
|
3082
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3083
3142
|
}>>;
|
|
3084
3143
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3085
3144
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3136
3195
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3137
3196
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3138
3197
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3139
|
-
openai: "openai";
|
|
3140
3198
|
anthropic: "anthropic";
|
|
3141
|
-
google: "google";
|
|
3142
3199
|
"vertex-anthropic": "vertex-anthropic";
|
|
3143
3200
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3201
|
+
openai: "openai";
|
|
3202
|
+
google: "google";
|
|
3144
3203
|
}>>;
|
|
3145
3204
|
model: z.ZodOptional<z.ZodString>;
|
|
3146
3205
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3163
3222
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3164
3223
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3165
3224
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3166
|
-
openai: "openai";
|
|
3167
3225
|
anthropic: "anthropic";
|
|
3168
|
-
google: "google";
|
|
3169
3226
|
"vertex-anthropic": "vertex-anthropic";
|
|
3170
3227
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3228
|
+
openai: "openai";
|
|
3229
|
+
google: "google";
|
|
3171
3230
|
}>>;
|
|
3172
3231
|
model: z.ZodOptional<z.ZodString>;
|
|
3173
3232
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3223
3282
|
desktop: "desktop";
|
|
3224
3283
|
}>>;
|
|
3225
3284
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3226
|
-
openai: "openai";
|
|
3227
3285
|
anthropic: "anthropic";
|
|
3228
|
-
|
|
3286
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3287
|
+
openai: "openai";
|
|
3229
3288
|
google: "google";
|
|
3289
|
+
azure: "azure";
|
|
3230
3290
|
mistral: "mistral";
|
|
3231
3291
|
deepseek: "deepseek";
|
|
3232
3292
|
openrouter: "openrouter";
|
|
3233
3293
|
xai: "xai";
|
|
3234
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3235
3294
|
}>>;
|
|
3236
3295
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3237
3296
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3288
3347
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3289
3348
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3290
3349
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3291
|
-
openai: "openai";
|
|
3292
3350
|
anthropic: "anthropic";
|
|
3293
|
-
google: "google";
|
|
3294
3351
|
"vertex-anthropic": "vertex-anthropic";
|
|
3295
3352
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3353
|
+
openai: "openai";
|
|
3354
|
+
google: "google";
|
|
3296
3355
|
}>>;
|
|
3297
3356
|
model: z.ZodOptional<z.ZodString>;
|
|
3298
3357
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3315
3374
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3316
3375
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3317
3376
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3318
|
-
openai: "openai";
|
|
3319
3377
|
anthropic: "anthropic";
|
|
3320
|
-
google: "google";
|
|
3321
3378
|
"vertex-anthropic": "vertex-anthropic";
|
|
3322
3379
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3380
|
+
openai: "openai";
|
|
3381
|
+
google: "google";
|
|
3323
3382
|
}>>;
|
|
3324
3383
|
model: z.ZodOptional<z.ZodString>;
|
|
3325
3384
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3562,6 +3621,8 @@ interface IterationResult {
|
|
|
3562
3621
|
name: string;
|
|
3563
3622
|
}>;
|
|
3564
3623
|
};
|
|
3624
|
+
/** Token usage from mcp_host LLM simulation in this iteration */
|
|
3625
|
+
hostUsage?: UsageMetrics;
|
|
3565
3626
|
}
|
|
3566
3627
|
/**
|
|
3567
3628
|
* Request data captured from the eval case input.
|
|
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
|
|
|
3710
3771
|
name: string;
|
|
3711
3772
|
}>;
|
|
3712
3773
|
};
|
|
3774
|
+
/**
|
|
3775
|
+
* Aggregate token usage from mcp_host LLM simulation for this case.
|
|
3776
|
+
* Summed across all iterations. Only populated for mcp_host mode cases.
|
|
3777
|
+
*/
|
|
3778
|
+
hostUsage?: UsageMetrics;
|
|
3713
3779
|
}
|
|
3714
3780
|
/**
|
|
3715
3781
|
* Aggregated MCP eval run data
|
|
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
|
|
|
3759
3825
|
* Expectation type breakdown
|
|
3760
3826
|
*/
|
|
3761
3827
|
expectationBreakdown: ExpectationBreakdown;
|
|
3828
|
+
/**
|
|
3829
|
+
* Aggregate token usage from all mcp_host LLM simulations in this run.
|
|
3830
|
+
*/
|
|
3831
|
+
totalHostUsage?: UsageMetrics;
|
|
3762
3832
|
};
|
|
3763
3833
|
/**
|
|
3764
3834
|
* All eval results from this run
|
|
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
|
|
|
3873
3943
|
* Experiment tracking metadata captured at run time.
|
|
3874
3944
|
*/
|
|
3875
3945
|
metadata?: EvalRunMetadata;
|
|
3946
|
+
/**
|
|
3947
|
+
* Aggregate token usage from all mcp_host LLM simulations across all cases.
|
|
3948
|
+
*/
|
|
3949
|
+
totalHostUsage?: UsageMetrics;
|
|
3876
3950
|
}
|
|
3877
3951
|
/**
|
|
3878
3952
|
* Options for running eval dataset
|
package/dist/index.js
CHANGED
|
@@ -4384,7 +4384,7 @@ function escapeHtml(text) {
|
|
|
4384
4384
|
|
|
4385
4385
|
// package.json
|
|
4386
4386
|
var package_default = {
|
|
4387
|
-
version: "1.0.0"};
|
|
4387
|
+
version: "1.0.1-beta.0"};
|
|
4388
4388
|
|
|
4389
4389
|
// src/mcp/clientFactory.ts
|
|
4390
4390
|
function getRetryAfterDelayMs(err) {
|
|
@@ -6931,6 +6931,12 @@ function createVercelOrchestrator() {
|
|
|
6931
6931
|
});
|
|
6932
6932
|
const totalDurationMs = Date.now() - llmStart;
|
|
6933
6933
|
const llmDurationMs = totalDurationMs - mcpDurationMs;
|
|
6934
|
+
const hostUsage = result.usage ? {
|
|
6935
|
+
inputTokens: result.usage.promptTokens ?? 0,
|
|
6936
|
+
outputTokens: result.usage.completionTokens ?? 0,
|
|
6937
|
+
totalCostUsd: 0,
|
|
6938
|
+
durationMs: llmDurationMs
|
|
6939
|
+
} : void 0;
|
|
6934
6940
|
const conversationHistory = (result.steps ?? []).map((step) => ({
|
|
6935
6941
|
role: step.toolCalls?.length > 0 ? "tool" : "assistant",
|
|
6936
6942
|
content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
|
|
@@ -6942,7 +6948,8 @@ function createVercelOrchestrator() {
|
|
|
6942
6948
|
scenario,
|
|
6943
6949
|
llmDurationMs,
|
|
6944
6950
|
mcpDurationMs,
|
|
6945
|
-
conversationHistory
|
|
6951
|
+
conversationHistory,
|
|
6952
|
+
usage: hostUsage
|
|
6946
6953
|
};
|
|
6947
6954
|
} catch (err) {
|
|
6948
6955
|
return {
|
|
@@ -6960,6 +6967,7 @@ function parseStreamJson(stdout) {
|
|
|
6960
6967
|
const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
|
|
6961
6968
|
const toolCalls = [];
|
|
6962
6969
|
const textParts = [];
|
|
6970
|
+
let usage;
|
|
6963
6971
|
const conversationHistory = [];
|
|
6964
6972
|
for (const line of lines) {
|
|
6965
6973
|
let event;
|
|
@@ -6992,16 +7000,28 @@ function parseStreamJson(stdout) {
|
|
|
6992
7000
|
}
|
|
6993
7001
|
}
|
|
6994
7002
|
}
|
|
6995
|
-
if (event.type === "result"
|
|
6996
|
-
if (textParts.length === 0) {
|
|
7003
|
+
if (event.type === "result") {
|
|
7004
|
+
if (typeof event.result === "string" && textParts.length === 0) {
|
|
6997
7005
|
textParts.push(event.result);
|
|
6998
7006
|
}
|
|
7007
|
+
if (event.usage) {
|
|
7008
|
+
usage = {
|
|
7009
|
+
inputTokens: event.usage.input_tokens ?? 0,
|
|
7010
|
+
outputTokens: event.usage.output_tokens ?? 0,
|
|
7011
|
+
totalCostUsd: event.total_cost_usd ?? 0,
|
|
7012
|
+
durationMs: event.duration_ms ?? 0,
|
|
7013
|
+
durationApiMs: event.duration_api_ms,
|
|
7014
|
+
cacheReadInputTokens: event.usage.cache_read_input_tokens,
|
|
7015
|
+
cacheCreationInputTokens: event.usage.cache_creation_input_tokens
|
|
7016
|
+
};
|
|
7017
|
+
}
|
|
6999
7018
|
}
|
|
7000
7019
|
if (event.type === "result" && event.is_error === true) {
|
|
7001
7020
|
return {
|
|
7002
7021
|
success: false,
|
|
7003
7022
|
toolCalls,
|
|
7004
|
-
error: typeof event.result === "string" ? event.result : "CLI host reported an error"
|
|
7023
|
+
error: typeof event.result === "string" ? event.result : "CLI host reported an error",
|
|
7024
|
+
usage
|
|
7005
7025
|
};
|
|
7006
7026
|
}
|
|
7007
7027
|
}
|
|
@@ -7013,7 +7033,8 @@ function parseStreamJson(stdout) {
|
|
|
7013
7033
|
success: true,
|
|
7014
7034
|
toolCalls,
|
|
7015
7035
|
response: response || void 0,
|
|
7016
|
-
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
|
|
7036
|
+
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
|
|
7037
|
+
usage
|
|
7017
7038
|
};
|
|
7018
7039
|
}
|
|
7019
7040
|
function createJsonParser(paths) {
|
|
@@ -7278,6 +7299,32 @@ async function execFileNoThrow(file, args) {
|
|
|
7278
7299
|
}
|
|
7279
7300
|
}
|
|
7280
7301
|
|
|
7302
|
+
// src/utils/usageUtils.ts
|
|
7303
|
+
function optionalSum(a, b) {
|
|
7304
|
+
if (a === void 0 && b === void 0) return void 0;
|
|
7305
|
+
return (a ?? 0) + (b ?? 0);
|
|
7306
|
+
}
|
|
7307
|
+
function sumUsage(a, b) {
|
|
7308
|
+
if (!a && !b) return void 0;
|
|
7309
|
+
if (!a) return b ? { ...b } : void 0;
|
|
7310
|
+
if (!b) return { ...a };
|
|
7311
|
+
return {
|
|
7312
|
+
inputTokens: a.inputTokens + b.inputTokens,
|
|
7313
|
+
outputTokens: a.outputTokens + b.outputTokens,
|
|
7314
|
+
totalCostUsd: a.totalCostUsd + b.totalCostUsd,
|
|
7315
|
+
durationMs: a.durationMs + b.durationMs,
|
|
7316
|
+
durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
|
|
7317
|
+
cacheReadInputTokens: optionalSum(
|
|
7318
|
+
a.cacheReadInputTokens,
|
|
7319
|
+
b.cacheReadInputTokens
|
|
7320
|
+
),
|
|
7321
|
+
cacheCreationInputTokens: optionalSum(
|
|
7322
|
+
a.cacheCreationInputTokens,
|
|
7323
|
+
b.cacheCreationInputTokens
|
|
7324
|
+
)
|
|
7325
|
+
};
|
|
7326
|
+
}
|
|
7327
|
+
|
|
7281
7328
|
// src/evals/evalRunner.ts
|
|
7282
7329
|
async function executeToolCall(evalCase, mcp) {
|
|
7283
7330
|
const mode = evalCase.mode || "direct";
|
|
@@ -7523,6 +7570,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7523
7570
|
};
|
|
7524
7571
|
}
|
|
7525
7572
|
}
|
|
7573
|
+
const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
|
|
7526
7574
|
return {
|
|
7527
7575
|
id: evalCase.id,
|
|
7528
7576
|
datasetName: options.datasetName ?? "single-case",
|
|
@@ -7539,7 +7587,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7539
7587
|
tags: evalCase.tags,
|
|
7540
7588
|
toolPrecision,
|
|
7541
7589
|
toolRecall,
|
|
7542
|
-
mcpHostTrace
|
|
7590
|
+
mcpHostTrace,
|
|
7591
|
+
hostUsage
|
|
7543
7592
|
};
|
|
7544
7593
|
}
|
|
7545
7594
|
function isInfrastructureError(err) {
|
|
@@ -7575,7 +7624,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7575
7624
|
durationMs: result.durationMs,
|
|
7576
7625
|
error: result.error,
|
|
7577
7626
|
isInfrastructureError: infraError,
|
|
7578
|
-
mcpHostTrace: result.mcpHostTrace
|
|
7627
|
+
mcpHostTrace: result.mcpHostTrace,
|
|
7628
|
+
hostUsage: result.hostUsage
|
|
7579
7629
|
});
|
|
7580
7630
|
} catch (err) {
|
|
7581
7631
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -7608,6 +7658,10 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7608
7658
|
durationMs: 0,
|
|
7609
7659
|
tags: evalCase.tags
|
|
7610
7660
|
};
|
|
7661
|
+
const totalHostUsage = iterationResults.reduce(
|
|
7662
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7663
|
+
void 0
|
|
7664
|
+
);
|
|
7611
7665
|
return {
|
|
7612
7666
|
...baseResult,
|
|
7613
7667
|
pass: assertionPassRate >= threshold,
|
|
@@ -7616,7 +7670,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7616
7670
|
infrastructureErrorRate,
|
|
7617
7671
|
iterationResults,
|
|
7618
7672
|
infrastructureErrorCount: infraErrors.length,
|
|
7619
|
-
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7673
|
+
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
|
|
7674
|
+
hostUsage: totalHostUsage
|
|
7620
7675
|
};
|
|
7621
7676
|
}
|
|
7622
7677
|
function wilsonCI(k, n) {
|
|
@@ -7726,13 +7781,18 @@ async function runEvalDataset(options, context) {
|
|
|
7726
7781
|
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7727
7782
|
...judgeModel !== void 0 && { judgeModel }
|
|
7728
7783
|
};
|
|
7784
|
+
const runHostUsage = caseResults.reduce(
|
|
7785
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7786
|
+
void 0
|
|
7787
|
+
);
|
|
7729
7788
|
const result = {
|
|
7730
7789
|
total,
|
|
7731
7790
|
passed,
|
|
7732
7791
|
failed: total - passed,
|
|
7733
7792
|
caseResults,
|
|
7734
7793
|
durationMs: Date.now() - startTime,
|
|
7735
|
-
metadata
|
|
7794
|
+
metadata,
|
|
7795
|
+
totalHostUsage: runHostUsage
|
|
7736
7796
|
};
|
|
7737
7797
|
if (baselineResultsFrom) {
|
|
7738
7798
|
try {
|