@gleanwork/mcp-server-tester 1.0.0-beta.8 → 1.0.1-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.js +71 -14
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +142 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +118 -16
- package/dist/index.d.ts +118 -16
- package/dist/index.js +142 -25
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +34 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +90 -0
- package/dist/reporters/mcpReporter.d.ts +90 -0
- package/dist/reporters/mcpReporter.js +34 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -735,6 +735,34 @@ interface AuthServerMetadata {
|
|
|
735
735
|
*/
|
|
736
736
|
issuer: string;
|
|
737
737
|
}
|
|
738
|
+
/**
|
|
739
|
+
* Configuration for token refresh
|
|
740
|
+
*/
|
|
741
|
+
interface TokenRefreshConfig {
|
|
742
|
+
/**
|
|
743
|
+
* Authorization server metadata
|
|
744
|
+
*/
|
|
745
|
+
authServer: AuthServerMetadata;
|
|
746
|
+
/**
|
|
747
|
+
* Client ID
|
|
748
|
+
*/
|
|
749
|
+
clientId: string;
|
|
750
|
+
/**
|
|
751
|
+
* Client secret (for confidential clients)
|
|
752
|
+
*/
|
|
753
|
+
clientSecret?: string;
|
|
754
|
+
/**
|
|
755
|
+
* Refresh token
|
|
756
|
+
*/
|
|
757
|
+
refreshToken: string;
|
|
758
|
+
}
|
|
759
|
+
/**
|
|
760
|
+
* Refreshes an access token using a refresh token
|
|
761
|
+
*
|
|
762
|
+
* @param config - Token refresh configuration
|
|
763
|
+
* @returns New token result
|
|
764
|
+
*/
|
|
765
|
+
declare function refreshAccessToken(config: TokenRefreshConfig): Promise<TokenResult>;
|
|
738
766
|
/**
|
|
739
767
|
* Configuration for client credentials grant
|
|
740
768
|
*/
|
|
@@ -2131,7 +2159,7 @@ type ExpectationResultMap = Partial<Record<ExpectationType, EvalExpectationResul
|
|
|
2131
2159
|
/**
|
|
2132
2160
|
* Breakdown of expectation types used in a run
|
|
2133
2161
|
*/
|
|
2134
|
-
type ExpectationBreakdown = Record<ExpectationType, number>;
|
|
2162
|
+
type ExpectationBreakdown = Partial<Record<ExpectationType, number>>;
|
|
2135
2163
|
|
|
2136
2164
|
/**
|
|
2137
2165
|
* Options for creating an MCP fixture
|
|
@@ -2656,6 +2684,56 @@ interface CLIConfig {
|
|
|
2656
2684
|
*/
|
|
2657
2685
|
timeout?: number;
|
|
2658
2686
|
}
|
|
2687
|
+
/**
|
|
2688
|
+
* A cookie to inject into the browser context before running the script.
|
|
2689
|
+
* Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
|
|
2690
|
+
*/
|
|
2691
|
+
interface BrowserCookie {
|
|
2692
|
+
name: string;
|
|
2693
|
+
value: string;
|
|
2694
|
+
url?: string;
|
|
2695
|
+
domain?: string;
|
|
2696
|
+
path?: string;
|
|
2697
|
+
expires?: number;
|
|
2698
|
+
httpOnly?: boolean;
|
|
2699
|
+
secure?: boolean;
|
|
2700
|
+
sameSite?: 'Strict' | 'Lax' | 'None';
|
|
2701
|
+
partitionKey?: string;
|
|
2702
|
+
}
|
|
2703
|
+
/**
|
|
2704
|
+
* Configuration for a browser-based host.
|
|
2705
|
+
*
|
|
2706
|
+
* Uses Playwright to launch a Chromium instance, inject auth state,
|
|
2707
|
+
* and execute a user-provided script that drives a web-based MCP host
|
|
2708
|
+
* (e.g., claude.ai).
|
|
2709
|
+
*/
|
|
2710
|
+
interface BrowserConfig {
|
|
2711
|
+
/**
|
|
2712
|
+
* Path to the browser script (resolved relative to cwd).
|
|
2713
|
+
* The script must default-export an async function
|
|
2714
|
+
* `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
|
|
2715
|
+
*/
|
|
2716
|
+
script: string;
|
|
2717
|
+
/**
|
|
2718
|
+
* Timeout in milliseconds for the browser script.
|
|
2719
|
+
* @default 120000 (2 minutes)
|
|
2720
|
+
*/
|
|
2721
|
+
timeout?: number;
|
|
2722
|
+
/**
|
|
2723
|
+
* Whether to launch in headless mode.
|
|
2724
|
+
* @default true
|
|
2725
|
+
*/
|
|
2726
|
+
headless?: boolean;
|
|
2727
|
+
/**
|
|
2728
|
+
* Path to a Playwright storage state JSON file (cookies + localStorage).
|
|
2729
|
+
* Resolved relative to cwd.
|
|
2730
|
+
*/
|
|
2731
|
+
storageState?: string;
|
|
2732
|
+
/**
|
|
2733
|
+
* Extra cookies to inject into the browser context.
|
|
2734
|
+
*/
|
|
2735
|
+
cookies?: BrowserCookie[];
|
|
2736
|
+
}
|
|
2659
2737
|
/**
|
|
2660
2738
|
* Configuration for MCP host simulation
|
|
2661
2739
|
*/
|
|
@@ -2701,6 +2779,10 @@ interface MCPHostConfig {
|
|
|
2701
2779
|
* CLI host configuration (required for 'cli' host type).
|
|
2702
2780
|
*/
|
|
2703
2781
|
cli?: CLIConfig;
|
|
2782
|
+
/**
|
|
2783
|
+
* Browser host configuration (required for 'browser' host type).
|
|
2784
|
+
*/
|
|
2785
|
+
browser?: BrowserConfig;
|
|
2704
2786
|
}
|
|
2705
2787
|
/**
|
|
2706
2788
|
* A tool call made by the LLM
|
|
@@ -2742,6 +2824,11 @@ interface MCPHostSimulationResult {
|
|
|
2742
2824
|
* (excludes LLM response time)
|
|
2743
2825
|
*/
|
|
2744
2826
|
mcpDurationMs?: number;
|
|
2827
|
+
/**
|
|
2828
|
+
* Token usage from the LLM during simulation.
|
|
2829
|
+
* Populated by SDK-based hosts from the AI SDK response.
|
|
2830
|
+
*/
|
|
2831
|
+
usage?: UsageMetrics;
|
|
2745
2832
|
}
|
|
2746
2833
|
/**
|
|
2747
2834
|
* Interface for MCP host simulators.
|
|
@@ -3043,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3043
3130
|
desktop: "desktop";
|
|
3044
3131
|
}>>;
|
|
3045
3132
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3046
|
-
openai: "openai";
|
|
3047
3133
|
anthropic: "anthropic";
|
|
3048
|
-
|
|
3134
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3135
|
+
openai: "openai";
|
|
3049
3136
|
google: "google";
|
|
3137
|
+
azure: "azure";
|
|
3050
3138
|
mistral: "mistral";
|
|
3051
3139
|
deepseek: "deepseek";
|
|
3052
3140
|
openrouter: "openrouter";
|
|
3053
3141
|
xai: "xai";
|
|
3054
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3055
3142
|
}>>;
|
|
3056
3143
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3057
3144
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3108,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3108
3195
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3109
3196
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3110
3197
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3111
|
-
openai: "openai";
|
|
3112
3198
|
anthropic: "anthropic";
|
|
3113
|
-
google: "google";
|
|
3114
3199
|
"vertex-anthropic": "vertex-anthropic";
|
|
3115
3200
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3201
|
+
openai: "openai";
|
|
3202
|
+
google: "google";
|
|
3116
3203
|
}>>;
|
|
3117
3204
|
model: z.ZodOptional<z.ZodString>;
|
|
3118
3205
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3135,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3135
3222
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3136
3223
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3137
3224
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3138
|
-
openai: "openai";
|
|
3139
3225
|
anthropic: "anthropic";
|
|
3140
|
-
google: "google";
|
|
3141
3226
|
"vertex-anthropic": "vertex-anthropic";
|
|
3142
3227
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3228
|
+
openai: "openai";
|
|
3229
|
+
google: "google";
|
|
3143
3230
|
}>>;
|
|
3144
3231
|
model: z.ZodOptional<z.ZodString>;
|
|
3145
3232
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3195,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3195
3282
|
desktop: "desktop";
|
|
3196
3283
|
}>>;
|
|
3197
3284
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3198
|
-
openai: "openai";
|
|
3199
3285
|
anthropic: "anthropic";
|
|
3200
|
-
|
|
3286
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3287
|
+
openai: "openai";
|
|
3201
3288
|
google: "google";
|
|
3289
|
+
azure: "azure";
|
|
3202
3290
|
mistral: "mistral";
|
|
3203
3291
|
deepseek: "deepseek";
|
|
3204
3292
|
openrouter: "openrouter";
|
|
3205
3293
|
xai: "xai";
|
|
3206
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3207
3294
|
}>>;
|
|
3208
3295
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3209
3296
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3260,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3260
3347
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3261
3348
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3262
3349
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3263
|
-
openai: "openai";
|
|
3264
3350
|
anthropic: "anthropic";
|
|
3265
|
-
google: "google";
|
|
3266
3351
|
"vertex-anthropic": "vertex-anthropic";
|
|
3267
3352
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3353
|
+
openai: "openai";
|
|
3354
|
+
google: "google";
|
|
3268
3355
|
}>>;
|
|
3269
3356
|
model: z.ZodOptional<z.ZodString>;
|
|
3270
3357
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3287,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3287
3374
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3288
3375
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3289
3376
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3290
|
-
openai: "openai";
|
|
3291
3377
|
anthropic: "anthropic";
|
|
3292
|
-
google: "google";
|
|
3293
3378
|
"vertex-anthropic": "vertex-anthropic";
|
|
3294
3379
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3380
|
+
openai: "openai";
|
|
3381
|
+
google: "google";
|
|
3295
3382
|
}>>;
|
|
3296
3383
|
model: z.ZodOptional<z.ZodString>;
|
|
3297
3384
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3534,6 +3621,8 @@ interface IterationResult {
|
|
|
3534
3621
|
name: string;
|
|
3535
3622
|
}>;
|
|
3536
3623
|
};
|
|
3624
|
+
/** Token usage from mcp_host LLM simulation in this iteration */
|
|
3625
|
+
hostUsage?: UsageMetrics;
|
|
3537
3626
|
}
|
|
3538
3627
|
/**
|
|
3539
3628
|
* Request data captured from the eval case input.
|
|
@@ -3682,6 +3771,11 @@ interface EvalCaseResult {
|
|
|
3682
3771
|
name: string;
|
|
3683
3772
|
}>;
|
|
3684
3773
|
};
|
|
3774
|
+
/**
|
|
3775
|
+
* Aggregate token usage from mcp_host LLM simulation for this case.
|
|
3776
|
+
* Summed across all iterations. Only populated for mcp_host mode cases.
|
|
3777
|
+
*/
|
|
3778
|
+
hostUsage?: UsageMetrics;
|
|
3685
3779
|
}
|
|
3686
3780
|
/**
|
|
3687
3781
|
* Aggregated MCP eval run data
|
|
@@ -3731,6 +3825,10 @@ interface MCPEvalRunData {
|
|
|
3731
3825
|
* Expectation type breakdown
|
|
3732
3826
|
*/
|
|
3733
3827
|
expectationBreakdown: ExpectationBreakdown;
|
|
3828
|
+
/**
|
|
3829
|
+
* Aggregate token usage from all mcp_host LLM simulations in this run.
|
|
3830
|
+
*/
|
|
3831
|
+
totalHostUsage?: UsageMetrics;
|
|
3734
3832
|
};
|
|
3735
3833
|
/**
|
|
3736
3834
|
* All eval results from this run
|
|
@@ -3845,6 +3943,10 @@ interface EvalRunnerResult {
|
|
|
3845
3943
|
* Experiment tracking metadata captured at run time.
|
|
3846
3944
|
*/
|
|
3847
3945
|
metadata?: EvalRunMetadata;
|
|
3946
|
+
/**
|
|
3947
|
+
* Aggregate token usage from all mcp_host LLM simulations across all cases.
|
|
3948
|
+
*/
|
|
3949
|
+
totalHostUsage?: UsageMetrics;
|
|
3848
3950
|
}
|
|
3849
3951
|
/**
|
|
3850
3952
|
* Options for running eval dataset
|
|
@@ -4455,4 +4557,4 @@ interface MCPEvalReporterConfig {
|
|
|
4455
4557
|
includeAutoTracking?: boolean;
|
|
4456
4558
|
}
|
|
4457
4559
|
|
|
4458
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type 
ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
4560
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseRequest, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunMetadata, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type 
ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
package/dist/index.d.ts
CHANGED
|
@@ -735,6 +735,34 @@ interface AuthServerMetadata {
|
|
|
735
735
|
*/
|
|
736
736
|
issuer: string;
|
|
737
737
|
}
|
|
738
|
+
/**
|
|
739
|
+
* Configuration for token refresh
|
|
740
|
+
*/
|
|
741
|
+
interface TokenRefreshConfig {
|
|
742
|
+
/**
|
|
743
|
+
* Authorization server metadata
|
|
744
|
+
*/
|
|
745
|
+
authServer: AuthServerMetadata;
|
|
746
|
+
/**
|
|
747
|
+
* Client ID
|
|
748
|
+
*/
|
|
749
|
+
clientId: string;
|
|
750
|
+
/**
|
|
751
|
+
* Client secret (for confidential clients)
|
|
752
|
+
*/
|
|
753
|
+
clientSecret?: string;
|
|
754
|
+
/**
|
|
755
|
+
* Refresh token
|
|
756
|
+
*/
|
|
757
|
+
refreshToken: string;
|
|
758
|
+
}
|
|
759
|
+
/**
|
|
760
|
+
* Refreshes an access token using a refresh token
|
|
761
|
+
*
|
|
762
|
+
* @param config - Token refresh configuration
|
|
763
|
+
* @returns New token result
|
|
764
|
+
*/
|
|
765
|
+
declare function refreshAccessToken(config: TokenRefreshConfig): Promise<TokenResult>;
|
|
738
766
|
/**
|
|
739
767
|
* Configuration for client credentials grant
|
|
740
768
|
*/
|
|
@@ -2131,7 +2159,7 @@ type ExpectationResultMap = Partial<Record<ExpectationType, EvalExpectationResul
|
|
|
2131
2159
|
/**
|
|
2132
2160
|
* Breakdown of expectation types used in a run
|
|
2133
2161
|
*/
|
|
2134
|
-
type ExpectationBreakdown = Record<ExpectationType, number>;
|
|
2162
|
+
type ExpectationBreakdown = Partial<Record<ExpectationType, number>>;
|
|
2135
2163
|
|
|
2136
2164
|
/**
|
|
2137
2165
|
* Options for creating an MCP fixture
|
|
@@ -2656,6 +2684,56 @@ interface CLIConfig {
|
|
|
2656
2684
|
*/
|
|
2657
2685
|
timeout?: number;
|
|
2658
2686
|
}
|
|
2687
|
+
/**
|
|
2688
|
+
* A cookie to inject into the browser context before running the script.
|
|
2689
|
+
* Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
|
|
2690
|
+
*/
|
|
2691
|
+
interface BrowserCookie {
|
|
2692
|
+
name: string;
|
|
2693
|
+
value: string;
|
|
2694
|
+
url?: string;
|
|
2695
|
+
domain?: string;
|
|
2696
|
+
path?: string;
|
|
2697
|
+
expires?: number;
|
|
2698
|
+
httpOnly?: boolean;
|
|
2699
|
+
secure?: boolean;
|
|
2700
|
+
sameSite?: 'Strict' | 'Lax' | 'None';
|
|
2701
|
+
partitionKey?: string;
|
|
2702
|
+
}
|
|
2703
|
+
/**
|
|
2704
|
+
* Configuration for a browser-based host.
|
|
2705
|
+
*
|
|
2706
|
+
* Uses Playwright to launch a Chromium instance, inject auth state,
|
|
2707
|
+
* and execute a user-provided script that drives a web-based MCP host
|
|
2708
|
+
* (e.g., claude.ai).
|
|
2709
|
+
*/
|
|
2710
|
+
interface BrowserConfig {
|
|
2711
|
+
/**
|
|
2712
|
+
* Path to the browser script (resolved relative to cwd).
|
|
2713
|
+
* The script must default-export an async function
|
|
2714
|
+
* `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
|
|
2715
|
+
*/
|
|
2716
|
+
script: string;
|
|
2717
|
+
/**
|
|
2718
|
+
* Timeout in milliseconds for the browser script.
|
|
2719
|
+
* @default 120000 (2 minutes)
|
|
2720
|
+
*/
|
|
2721
|
+
timeout?: number;
|
|
2722
|
+
/**
|
|
2723
|
+
* Whether to launch in headless mode.
|
|
2724
|
+
* @default true
|
|
2725
|
+
*/
|
|
2726
|
+
headless?: boolean;
|
|
2727
|
+
/**
|
|
2728
|
+
* Path to a Playwright storage state JSON file (cookies + localStorage).
|
|
2729
|
+
* Resolved relative to cwd.
|
|
2730
|
+
*/
|
|
2731
|
+
storageState?: string;
|
|
2732
|
+
/**
|
|
2733
|
+
* Extra cookies to inject into the browser context.
|
|
2734
|
+
*/
|
|
2735
|
+
cookies?: BrowserCookie[];
|
|
2736
|
+
}
|
|
2659
2737
|
/**
|
|
2660
2738
|
* Configuration for MCP host simulation
|
|
2661
2739
|
*/
|
|
@@ -2701,6 +2779,10 @@ interface MCPHostConfig {
|
|
|
2701
2779
|
* CLI host configuration (required for 'cli' host type).
|
|
2702
2780
|
*/
|
|
2703
2781
|
cli?: CLIConfig;
|
|
2782
|
+
/**
|
|
2783
|
+
* Browser host configuration (required for 'browser' host type).
|
|
2784
|
+
*/
|
|
2785
|
+
browser?: BrowserConfig;
|
|
2704
2786
|
}
|
|
2705
2787
|
/**
|
|
2706
2788
|
* A tool call made by the LLM
|
|
@@ -2742,6 +2824,11 @@ interface MCPHostSimulationResult {
|
|
|
2742
2824
|
* (excludes LLM response time)
|
|
2743
2825
|
*/
|
|
2744
2826
|
mcpDurationMs?: number;
|
|
2827
|
+
/**
|
|
2828
|
+
* Token usage from the LLM during simulation.
|
|
2829
|
+
* Populated by SDK-based hosts from the AI SDK response.
|
|
2830
|
+
*/
|
|
2831
|
+
usage?: UsageMetrics;
|
|
2745
2832
|
}
|
|
2746
2833
|
/**
|
|
2747
2834
|
* Interface for MCP host simulators.
|
|
@@ -3043,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3043
3130
|
desktop: "desktop";
|
|
3044
3131
|
}>>;
|
|
3045
3132
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3046
|
-
openai: "openai";
|
|
3047
3133
|
anthropic: "anthropic";
|
|
3048
|
-
|
|
3134
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3135
|
+
openai: "openai";
|
|
3049
3136
|
google: "google";
|
|
3137
|
+
azure: "azure";
|
|
3050
3138
|
mistral: "mistral";
|
|
3051
3139
|
deepseek: "deepseek";
|
|
3052
3140
|
openrouter: "openrouter";
|
|
3053
3141
|
xai: "xai";
|
|
3054
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3055
3142
|
}>>;
|
|
3056
3143
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3057
3144
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3108,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3108
3195
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3109
3196
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3110
3197
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3111
|
-
openai: "openai";
|
|
3112
3198
|
anthropic: "anthropic";
|
|
3113
|
-
google: "google";
|
|
3114
3199
|
"vertex-anthropic": "vertex-anthropic";
|
|
3115
3200
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3201
|
+
openai: "openai";
|
|
3202
|
+
google: "google";
|
|
3116
3203
|
}>>;
|
|
3117
3204
|
model: z.ZodOptional<z.ZodString>;
|
|
3118
3205
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3135,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
3135
3222
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3136
3223
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3137
3224
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3138
|
-
openai: "openai";
|
|
3139
3225
|
anthropic: "anthropic";
|
|
3140
|
-
google: "google";
|
|
3141
3226
|
"vertex-anthropic": "vertex-anthropic";
|
|
3142
3227
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3228
|
+
openai: "openai";
|
|
3229
|
+
google: "google";
|
|
3143
3230
|
}>>;
|
|
3144
3231
|
model: z.ZodOptional<z.ZodString>;
|
|
3145
3232
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3195,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3195
3282
|
desktop: "desktop";
|
|
3196
3283
|
}>>;
|
|
3197
3284
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3198
|
-
openai: "openai";
|
|
3199
3285
|
anthropic: "anthropic";
|
|
3200
|
-
|
|
3286
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3287
|
+
openai: "openai";
|
|
3201
3288
|
google: "google";
|
|
3289
|
+
azure: "azure";
|
|
3202
3290
|
mistral: "mistral";
|
|
3203
3291
|
deepseek: "deepseek";
|
|
3204
3292
|
openrouter: "openrouter";
|
|
3205
3293
|
xai: "xai";
|
|
3206
|
-
"vertex-anthropic": "vertex-anthropic";
|
|
3207
3294
|
}>>;
|
|
3208
3295
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3209
3296
|
model: z.ZodOptional<z.ZodString>;
|
|
@@ -3260,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3260
3347
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3261
3348
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3262
3349
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3263
|
-
openai: "openai";
|
|
3264
3350
|
anthropic: "anthropic";
|
|
3265
|
-
google: "google";
|
|
3266
3351
|
"vertex-anthropic": "vertex-anthropic";
|
|
3267
3352
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3353
|
+
openai: "openai";
|
|
3354
|
+
google: "google";
|
|
3268
3355
|
}>>;
|
|
3269
3356
|
model: z.ZodOptional<z.ZodString>;
|
|
3270
3357
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3287,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3287
3374
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3288
3375
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3289
3376
|
provider: z.ZodOptional<z.ZodEnum<{
|
|
3290
|
-
openai: "openai";
|
|
3291
3377
|
anthropic: "anthropic";
|
|
3292
|
-
google: "google";
|
|
3293
3378
|
"vertex-anthropic": "vertex-anthropic";
|
|
3294
3379
|
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3380
|
+
openai: "openai";
|
|
3381
|
+
google: "google";
|
|
3295
3382
|
}>>;
|
|
3296
3383
|
model: z.ZodOptional<z.ZodString>;
|
|
3297
3384
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3534,6 +3621,8 @@ interface IterationResult {
|
|
|
3534
3621
|
name: string;
|
|
3535
3622
|
}>;
|
|
3536
3623
|
};
|
|
3624
|
+
/** Token usage from mcp_host LLM simulation in this iteration */
|
|
3625
|
+
hostUsage?: UsageMetrics;
|
|
3537
3626
|
}
|
|
3538
3627
|
/**
|
|
3539
3628
|
* Request data captured from the eval case input.
|
|
@@ -3682,6 +3771,11 @@ interface EvalCaseResult {
|
|
|
3682
3771
|
name: string;
|
|
3683
3772
|
}>;
|
|
3684
3773
|
};
|
|
3774
|
+
/**
|
|
3775
|
+
* Aggregate token usage from mcp_host LLM simulation for this case.
|
|
3776
|
+
* Summed across all iterations. Only populated for mcp_host mode cases.
|
|
3777
|
+
*/
|
|
3778
|
+
hostUsage?: UsageMetrics;
|
|
3685
3779
|
}
|
|
3686
3780
|
/**
|
|
3687
3781
|
* Aggregated MCP eval run data
|
|
@@ -3731,6 +3825,10 @@ interface MCPEvalRunData {
|
|
|
3731
3825
|
* Expectation type breakdown
|
|
3732
3826
|
*/
|
|
3733
3827
|
expectationBreakdown: ExpectationBreakdown;
|
|
3828
|
+
/**
|
|
3829
|
+
* Aggregate token usage from all mcp_host LLM simulations in this run.
|
|
3830
|
+
*/
|
|
3831
|
+
totalHostUsage?: UsageMetrics;
|
|
3734
3832
|
};
|
|
3735
3833
|
/**
|
|
3736
3834
|
* All eval results from this run
|
|
@@ -3845,6 +3943,10 @@ interface EvalRunnerResult {
|
|
|
3845
3943
|
* Experiment tracking metadata captured at run time.
|
|
3846
3944
|
*/
|
|
3847
3945
|
metadata?: EvalRunMetadata;
|
|
3946
|
+
/**
|
|
3947
|
+
* Aggregate token usage from all mcp_host LLM simulations across all cases.
|
|
3948
|
+
*/
|
|
3949
|
+
totalHostUsage?: UsageMetrics;
|
|
3848
3950
|
}
|
|
3849
3951
|
/**
|
|
3850
3952
|
* Options for running eval dataset
|
|
@@ -4455,4 +4557,4 @@ interface MCPEvalReporterConfig {
|
|
|
4455
4557
|
includeAutoTracking?: boolean;
|
|
4456
4558
|
}
|
|
4457
4559
|
|
|
4458
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type 
ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
4560
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseRequest, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunMetadata, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type 
ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|