@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +354 -37
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +721 -76
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +533 -116
- package/dist/index.d.ts +533 -116
- package/dist/index.js +719 -78
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +11 -6
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/index.d.cts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { z, ZodType } from 'zod';
|
|
2
|
+
import { Page, TestInfo, Expect } from '@playwright/test';
|
|
2
3
|
import { OAuthClientProvider } from '@modelcontextprotocol/sdk/client/auth.js';
|
|
3
4
|
import { OAuthClientMetadata, OAuthClientInformationFull, OAuthTokens } from '@modelcontextprotocol/sdk/shared/auth.js';
|
|
4
5
|
import * as oauth from 'oauth4webapi';
|
|
5
6
|
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
6
7
|
import { CallToolResult, Tool, Implementation, ServerCapabilities, Resource, Prompt } from '@modelcontextprotocol/sdk/types.js';
|
|
7
|
-
import { TestInfo, Expect } from '@playwright/test';
|
|
8
8
|
import * as playwright_test from 'playwright/test';
|
|
9
9
|
|
|
10
10
|
/**
|
|
@@ -311,6 +311,7 @@ declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
|
|
|
311
311
|
/**
|
|
312
312
|
* Auth types for MCP OAuth integration
|
|
313
313
|
*/
|
|
314
|
+
|
|
314
315
|
/**
|
|
315
316
|
* Stored OAuth tokens
|
|
316
317
|
*/
|
|
@@ -384,70 +385,90 @@ interface StoredOAuthState {
|
|
|
384
385
|
savedAt: number;
|
|
385
386
|
}
|
|
386
387
|
/**
|
|
387
|
-
*
|
|
388
|
+
* Login form selectors for standard OAuth login automation
|
|
389
|
+
*/
|
|
390
|
+
interface OAuthLoginSelectors {
|
|
391
|
+
/** Selector for username/email input field */
|
|
392
|
+
usernameInput: string;
|
|
393
|
+
/** Selector for password input field */
|
|
394
|
+
passwordInput: string;
|
|
395
|
+
/** Selector for login submit button */
|
|
396
|
+
submitButton: string;
|
|
397
|
+
/** Selector for consent/authorize button (optional) */
|
|
398
|
+
consentButton?: string;
|
|
399
|
+
}
|
|
400
|
+
/**
|
|
401
|
+
* Base configuration shared by all OAuth setup strategies
|
|
388
402
|
*/
|
|
389
|
-
interface
|
|
390
|
-
/**
|
|
391
|
-
* OAuth authorization server metadata URL
|
|
392
|
-
*/
|
|
403
|
+
interface OAuthSetupBaseConfig {
|
|
404
|
+
/** OAuth authorization server metadata URL */
|
|
393
405
|
authServerUrl: string;
|
|
394
|
-
/**
|
|
395
|
-
* Scopes to request
|
|
396
|
-
*/
|
|
406
|
+
/** Scopes to request */
|
|
397
407
|
scopes: Array<string>;
|
|
398
|
-
/**
|
|
399
|
-
|
|
400
|
-
|
|
408
|
+
/** Path to save OAuth state file */
|
|
409
|
+
outputPath: string;
|
|
410
|
+
/** Pre-registered client ID (optional, uses DCR if not provided) */
|
|
411
|
+
clientId?: string;
|
|
412
|
+
/** Pre-registered client secret (optional) */
|
|
413
|
+
clientSecret?: string;
|
|
414
|
+
/** Redirect URI for OAuth callback */
|
|
415
|
+
redirectUri?: string;
|
|
416
|
+
/** Resource indicator (RFC 8707) */
|
|
401
417
|
resource?: string;
|
|
402
|
-
/**
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
passwordInput: string;
|
|
414
|
-
/**
|
|
415
|
-
* Selector for login submit button
|
|
416
|
-
*/
|
|
417
|
-
submitButton: string;
|
|
418
|
-
/**
|
|
419
|
-
* Selector for consent/authorize button (optional)
|
|
420
|
-
*/
|
|
421
|
-
consentButton?: string;
|
|
422
|
-
};
|
|
423
|
-
/**
|
|
424
|
-
* Test user credentials
|
|
425
|
-
*/
|
|
418
|
+
/** Timeout for login flow in milliseconds (default: 30000) */
|
|
419
|
+
timeoutMs?: number;
|
|
420
|
+
}
|
|
421
|
+
/**
|
|
422
|
+
* Standard login strategy: automates a form with username, password, and submit button.
|
|
423
|
+
* Use when the IdP presents all login fields on a single page.
|
|
424
|
+
*/
|
|
425
|
+
interface StandardLoginConfig {
|
|
426
|
+
/** Login form selectors for Playwright automation */
|
|
427
|
+
loginSelectors: OAuthLoginSelectors;
|
|
428
|
+
/** Test user credentials */
|
|
426
429
|
credentials: {
|
|
427
430
|
username: string;
|
|
428
431
|
password: string;
|
|
429
432
|
};
|
|
433
|
+
customLoginFlow?: never;
|
|
434
|
+
}
|
|
435
|
+
/**
|
|
436
|
+
* Custom login strategy: full control over the browser-based login flow.
|
|
437
|
+
* Use for multi-step logins, MFA, custom consent screens, or any flow
|
|
438
|
+
* that doesn't fit the standard username/password/submit pattern.
|
|
439
|
+
*
|
|
440
|
+
* The callback receives a Playwright Page already navigated to the OAuth
|
|
441
|
+
* authorization URL. Complete the login so the IdP redirects to the
|
|
442
|
+
* callback URL — `performOAuthSetup` handles PKCE, token exchange,
|
|
443
|
+
* and state persistence automatically.
|
|
444
|
+
*/
|
|
445
|
+
interface CustomLoginConfig {
|
|
430
446
|
/**
|
|
431
|
-
*
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
*
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
*
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
*
|
|
444
|
-
*/
|
|
445
|
-
redirectUri?: string;
|
|
446
|
-
/**
|
|
447
|
-
* Timeout for login flow in milliseconds (default: 30000)
|
|
447
|
+
* Custom Playwright automation for the IdP login flow.
|
|
448
|
+
*
|
|
449
|
+
* @param page - Playwright Page already navigated to the OAuth authorization URL
|
|
450
|
+
*
|
|
451
|
+
* @example
|
|
452
|
+
* ```typescript
|
|
453
|
+
* customLoginFlow: async (page) => {
|
|
454
|
+
* await page.fill('#username', process.env.TEST_USER!);
|
|
455
|
+
* await page.click('#continue');
|
|
456
|
+
* await page.fill('#password', process.env.TEST_PASS!);
|
|
457
|
+
* await page.click('#submit');
|
|
458
|
+
* }
|
|
459
|
+
* ```
|
|
448
460
|
*/
|
|
449
|
-
|
|
461
|
+
customLoginFlow: (page: Page) => Promise<void>;
|
|
462
|
+
loginSelectors?: never;
|
|
463
|
+
credentials?: never;
|
|
450
464
|
}
|
|
465
|
+
/**
|
|
466
|
+
* Configuration for OAuth setup flow.
|
|
467
|
+
*
|
|
468
|
+
* Provide either `loginSelectors` + `credentials` for standard form-based login,
|
|
469
|
+
* or `customLoginFlow` for full control over the browser automation.
|
|
470
|
+
*/
|
|
471
|
+
type OAuthSetupConfig = OAuthSetupBaseConfig & (StandardLoginConfig | CustomLoginConfig);
|
|
451
472
|
/**
|
|
452
473
|
* Result of token exchange or refresh
|
|
453
474
|
*/
|
|
@@ -714,6 +735,34 @@ interface AuthServerMetadata {
|
|
|
714
735
|
*/
|
|
715
736
|
issuer: string;
|
|
716
737
|
}
|
|
738
|
+
/**
|
|
739
|
+
* Configuration for token refresh
|
|
740
|
+
*/
|
|
741
|
+
interface TokenRefreshConfig {
|
|
742
|
+
/**
|
|
743
|
+
* Authorization server metadata
|
|
744
|
+
*/
|
|
745
|
+
authServer: AuthServerMetadata;
|
|
746
|
+
/**
|
|
747
|
+
* Client ID
|
|
748
|
+
*/
|
|
749
|
+
clientId: string;
|
|
750
|
+
/**
|
|
751
|
+
* Client secret (for confidential clients)
|
|
752
|
+
*/
|
|
753
|
+
clientSecret?: string;
|
|
754
|
+
/**
|
|
755
|
+
* Refresh token
|
|
756
|
+
*/
|
|
757
|
+
refreshToken: string;
|
|
758
|
+
}
|
|
759
|
+
/**
|
|
760
|
+
* Refreshes an access token using a refresh token
|
|
761
|
+
*
|
|
762
|
+
* @param config - Token refresh configuration
|
|
763
|
+
* @returns New token result
|
|
764
|
+
*/
|
|
765
|
+
declare function refreshAccessToken(config: TokenRefreshConfig): Promise<TokenResult>;
|
|
717
766
|
/**
|
|
718
767
|
* Configuration for client credentials grant
|
|
719
768
|
*/
|
|
@@ -1632,7 +1681,7 @@ interface UsageMetrics {
|
|
|
1632
1681
|
cacheCreationInputTokens?: number;
|
|
1633
1682
|
}
|
|
1634
1683
|
/** Valid LLM judge provider kinds. */
|
|
1635
|
-
type ProviderKind = 'anthropic' | 'openai' | 'google';
|
|
1684
|
+
type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
|
|
1636
1685
|
/**
|
|
1637
1686
|
* Configuration for an LLM judge
|
|
1638
1687
|
*/
|
|
@@ -1744,8 +1793,11 @@ interface Judge {
|
|
|
1744
1793
|
* Configuration for the judge validator
|
|
1745
1794
|
*/
|
|
1746
1795
|
interface JudgeValidatorConfig {
|
|
1747
|
-
/**
|
|
1748
|
-
|
|
1796
|
+
/**
|
|
1797
|
+
* The evaluation rubric: a built-in name or custom { text: string }.
|
|
1798
|
+
* Required when no named `judge` is specified.
|
|
1799
|
+
*/
|
|
1800
|
+
rubric?: RubricSpec;
|
|
1749
1801
|
/** Optional reference response to compare against */
|
|
1750
1802
|
reference?: unknown;
|
|
1751
1803
|
/** Minimum score required to pass (0-1, default: 0.7) */
|
|
@@ -1766,6 +1818,13 @@ interface JudgeValidatorConfig {
|
|
|
1766
1818
|
maxBudgetUsd?: number;
|
|
1767
1819
|
/** Fail if response exceeds this size in bytes before judging */
|
|
1768
1820
|
maxToolOutputSize?: number;
|
|
1821
|
+
/**
|
|
1822
|
+
* Name of a registered custom judge executor.
|
|
1823
|
+
* When set, the named judge handles the entire evaluation pipeline
|
|
1824
|
+
* and returns a normalized score. The `threshold` determines pass/fail.
|
|
1825
|
+
* Register judges with `registerJudge()` before tests run.
|
|
1826
|
+
*/
|
|
1827
|
+
judge?: string;
|
|
1769
1828
|
}
|
|
1770
1829
|
declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
|
|
1771
1830
|
|
|
@@ -1823,6 +1882,12 @@ interface JudgeMatcherOptions {
|
|
|
1823
1882
|
provider?: ProviderKind;
|
|
1824
1883
|
/** Override the judge model */
|
|
1825
1884
|
model?: string;
|
|
1885
|
+
/**
|
|
1886
|
+
* Name of a registered custom judge executor.
|
|
1887
|
+
* When set, the named judge handles the entire evaluation pipeline
|
|
1888
|
+
* and its `pass` result is authoritative.
|
|
1889
|
+
*/
|
|
1890
|
+
judge?: string;
|
|
1826
1891
|
}
|
|
1827
1892
|
/**
|
|
1828
1893
|
* Declaration merging for Playwright matchers
|
|
@@ -1913,21 +1978,30 @@ declare global {
|
|
|
1913
1978
|
*/
|
|
1914
1979
|
toBeToolError(expected?: boolean | string | string[]): R;
|
|
1915
1980
|
/**
|
|
1916
|
-
* Validates that a response passes LLM-as-judge evaluation
|
|
1981
|
+
* Validates that a response passes LLM-as-judge evaluation.
|
|
1917
1982
|
*
|
|
1918
|
-
*
|
|
1919
|
-
*
|
|
1983
|
+
* Two call signatures:
|
|
1984
|
+
* - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
|
|
1985
|
+
* - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
|
|
1920
1986
|
*
|
|
1921
1987
|
* @example
|
|
1922
1988
|
* ```typescript
|
|
1989
|
+
* // Built-in LLM judge with rubric
|
|
1923
1990
|
* expect(result).toPassToolJudge('Response should be helpful and accurate');
|
|
1924
|
-
* expect(result).toPassToolJudge('
|
|
1991
|
+
* expect(result).toPassToolJudge('correctness', {
|
|
1925
1992
|
* reference: expectedOutput,
|
|
1926
1993
|
* passingThreshold: 0.8,
|
|
1927
1994
|
* });
|
|
1995
|
+
*
|
|
1996
|
+
* // Named custom judge (registered via registerJudge)
|
|
1997
|
+
* expect(result).toPassToolJudge({ judge: 'glean-completeness' });
|
|
1928
1998
|
* ```
|
|
1929
1999
|
*/
|
|
1930
2000
|
toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
|
|
2001
|
+
toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
|
|
2002
|
+
toPassToolJudge(judges: Array<JudgeMatcherOptions & {
|
|
2003
|
+
rubric?: RubricSpec;
|
|
2004
|
+
}>): Promise<R>;
|
|
1931
2005
|
/**
|
|
1932
2006
|
* Validates that a response meets size constraints
|
|
1933
2007
|
*
|
|
@@ -2050,6 +2124,33 @@ interface EvalExpectationResult {
|
|
|
2050
2124
|
* Optional details about the result
|
|
2051
2125
|
*/
|
|
2052
2126
|
details?: string;
|
|
2127
|
+
/**
|
|
2128
|
+
* Judge score (0-1). Populated for passesJudge expectations.
|
|
2129
|
+
*/
|
|
2130
|
+
score?: number;
|
|
2131
|
+
/**
|
|
2132
|
+
* Judge reasoning. Populated for passesJudge expectations.
|
|
2133
|
+
*/
|
|
2134
|
+
reasoning?: string;
|
|
2135
|
+
/**
|
|
2136
|
+
* Judge name — rubric name (e.g. 'correctness') or custom judge name.
|
|
2137
|
+
* Populated for passesJudge expectations.
|
|
2138
|
+
*/
|
|
2139
|
+
judgeName?: string;
|
|
2140
|
+
/**
|
|
2141
|
+
* Judge provider used. Populated for passesJudge expectations.
|
|
2142
|
+
*/
|
|
2143
|
+
judgeProvider?: string;
|
|
2144
|
+
/**
|
|
2145
|
+
* Judge model used. Populated for passesJudge expectations.
|
|
2146
|
+
*/
|
|
2147
|
+
judgeModel?: string;
|
|
2148
|
+
/**
|
|
2149
|
+
* Per-judge breakdown when multiple judges are used.
|
|
2150
|
+
* Each entry contains the individual judge's result.
|
|
2151
|
+
* Only populated when passesJudge is an array with 2+ entries.
|
|
2152
|
+
*/
|
|
2153
|
+
judgeResults?: EvalExpectationResult[];
|
|
2053
2154
|
}
|
|
2054
2155
|
/**
|
|
2055
2156
|
* Map of expectation type to result
|
|
@@ -2058,7 +2159,7 @@ type ExpectationResultMap = Partial<Record<ExpectationType, EvalExpectationResul
|
|
|
2058
2159
|
/**
|
|
2059
2160
|
* Breakdown of expectation types used in a run
|
|
2060
2161
|
*/
|
|
2061
|
-
type ExpectationBreakdown = Record<ExpectationType, number
|
|
2162
|
+
type ExpectationBreakdown = Partial<Record<ExpectationType, number>>;
|
|
2062
2163
|
|
|
2063
2164
|
/**
|
|
2064
2165
|
* Options for creating an MCP fixture
|
|
@@ -2274,16 +2375,26 @@ declare function toBeToolError(this: {
|
|
|
2274
2375
|
* Validates that a response passes LLM-as-judge evaluation.
|
|
2275
2376
|
* Delegates evaluation logic to validateJudge() for consistency
|
|
2276
2377
|
* with the validator/matcher duality pattern.
|
|
2378
|
+
*
|
|
2379
|
+
* Supports three call signatures:
|
|
2380
|
+
* - toPassToolJudge(rubric, options?) — built-in LLM judge with rubric
|
|
2381
|
+
* - toPassToolJudge({ judge: 'name', ... }) — named custom judge
|
|
2382
|
+
* - toPassToolJudge([...judges]) — multi-judge (all must pass)
|
|
2277
2383
|
*/
|
|
2278
2384
|
|
|
2279
2385
|
/**
|
|
2280
|
-
*
|
|
2386
|
+
* The toPassToolJudge matcher function.
|
|
2281
2387
|
*
|
|
2282
|
-
*
|
|
2388
|
+
* Accepts either:
|
|
2389
|
+
* (received, rubric, options?) — rubric-based LLM judge
|
|
2390
|
+
* (received, options) — named custom judge (options.judge required)
|
|
2391
|
+
* (received, judges[]) — multi-judge (all must pass)
|
|
2283
2392
|
*/
|
|
2284
2393
|
declare function toPassToolJudge(this: {
|
|
2285
2394
|
isNot: boolean;
|
|
2286
|
-
}, received: unknown,
|
|
2395
|
+
}, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
|
|
2396
|
+
rubric?: RubricSpec;
|
|
2397
|
+
}>, maybeOptions?: JudgeMatcherOptions): Promise<{
|
|
2287
2398
|
pass: boolean;
|
|
2288
2399
|
message: () => string;
|
|
2289
2400
|
}>;
|
|
@@ -2485,10 +2596,19 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
|
|
|
2485
2596
|
*/
|
|
2486
2597
|
|
|
2487
2598
|
/**
|
|
2488
|
-
*
|
|
2599
|
+
* Host type for MCP host simulation.
|
|
2489
2600
|
*
|
|
2490
|
-
*
|
|
2491
|
-
*
|
|
2601
|
+
* - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
|
|
2602
|
+
* - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
|
|
2603
|
+
* - 'browser': Web-based hosts (e.g., claude.ai). Uses Playwright/CDP. (Not yet implemented.)
|
|
2604
|
+
* - 'desktop': Desktop app hosts (e.g., Claude Desktop). Uses computer use. (Not yet implemented.)
|
|
2605
|
+
*/
|
|
2606
|
+
type HostType = 'sdk' | 'cli' | 'browser' | 'desktop';
|
|
2607
|
+
/**
|
|
2608
|
+
* LLM provider for SDK-based host simulation.
|
|
2609
|
+
*
|
|
2610
|
+
* Each provider runs through the Vercel AI SDK (`ai` package)
|
|
2611
|
+
* and requires its corresponding @ai-sdk/* package:
|
|
2492
2612
|
*
|
|
2493
2613
|
* openai → npm install ai @ai-sdk/openai
|
|
2494
2614
|
* anthropic → npm install ai @ai-sdk/anthropic
|
|
@@ -2508,14 +2628,81 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
|
|
|
2508
2628
|
* @example model: 'claude-3-5-haiku@20241022'
|
|
2509
2629
|
*/
|
|
2510
2630
|
| 'vertex-anthropic';
|
|
2631
|
+
/**
|
|
2632
|
+
* Output format for CLI host processes.
|
|
2633
|
+
*
|
|
2634
|
+
* - 'stream-json': NDJSON (one JSON object per line). Used by Claude Code (`--output-format stream-json`).
|
|
2635
|
+
* - 'json': Single JSON object on stdout.
|
|
2636
|
+
*/
|
|
2637
|
+
type CLIOutputFormat = 'stream-json' | 'json';
|
|
2638
|
+
/**
|
|
2639
|
+
* Configuration for a CLI host process.
|
|
2640
|
+
*
|
|
2641
|
+
* The process is spawned directly (no shell) with `command` and `args`.
|
|
2642
|
+
* Use `{{scenario}}` in any args entry as a placeholder for the natural
|
|
2643
|
+
* language prompt — the framework replaces it before spawning.
|
|
2644
|
+
*
|
|
2645
|
+
* Because args are passed directly to the process (not through a shell),
|
|
2646
|
+
* special characters in the scenario (quotes, newlines, `$`, etc.) are
|
|
2647
|
+
* handled safely without escaping.
|
|
2648
|
+
*
|
|
2649
|
+
* @example Claude Code
|
|
2650
|
+
* ```json
|
|
2651
|
+
* {
|
|
2652
|
+
* "command": "claude",
|
|
2653
|
+
* "args": ["-p", "{{scenario}}", "--output-format", "stream-json",
|
|
2654
|
+
* "--verbose", "--mcp-config", "{...}"]
|
|
2655
|
+
* }
|
|
2656
|
+
* ```
|
|
2657
|
+
*
|
|
2658
|
+
* @example Custom CLI
|
|
2659
|
+
* ```json
|
|
2660
|
+
* {
|
|
2661
|
+
* "command": "my-agent",
|
|
2662
|
+
* "args": ["--prompt", "{{scenario}}", "--config", "./mcp.json"],
|
|
2663
|
+
* "outputFormat": "json"
|
|
2664
|
+
* }
|
|
2665
|
+
* ```
|
|
2666
|
+
*/
|
|
2667
|
+
interface CLIConfig {
|
|
2668
|
+
/**
|
|
2669
|
+
* CLI binary to invoke.
|
|
2670
|
+
*/
|
|
2671
|
+
command: string;
|
|
2672
|
+
/**
|
|
2673
|
+
* Arguments to pass. Use `{{scenario}}` as a placeholder for the prompt.
|
|
2674
|
+
*/
|
|
2675
|
+
args: string[];
|
|
2676
|
+
/**
|
|
2677
|
+
* How to parse stdout.
|
|
2678
|
+
* @default 'stream-json'
|
|
2679
|
+
*/
|
|
2680
|
+
outputFormat?: CLIOutputFormat;
|
|
2681
|
+
/**
|
|
2682
|
+
* Timeout in milliseconds.
|
|
2683
|
+
* @default 120000 (2 minutes)
|
|
2684
|
+
*/
|
|
2685
|
+
timeout?: number;
|
|
2686
|
+
}
|
|
2511
2687
|
/**
|
|
2512
2688
|
* Configuration for MCP host simulation
|
|
2513
2689
|
*/
|
|
2514
2690
|
interface MCPHostConfig {
|
|
2515
2691
|
/**
|
|
2516
|
-
*
|
|
2692
|
+
* Host type for the simulation.
|
|
2693
|
+
*
|
|
2694
|
+
* - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
|
|
2695
|
+
* - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
|
|
2696
|
+
* - 'browser': Web-based hosts (not yet implemented).
|
|
2697
|
+
* - 'desktop': Desktop app hosts (not yet implemented).
|
|
2698
|
+
*
|
|
2699
|
+
* @default 'sdk'
|
|
2700
|
+
*/
|
|
2701
|
+
hostType?: HostType;
|
|
2702
|
+
/**
|
|
2703
|
+
* LLM provider (required for 'sdk' host type, ignored for 'cli')
|
|
2517
2704
|
*/
|
|
2518
|
-
provider
|
|
2705
|
+
provider?: LLMProvider;
|
|
2519
2706
|
/**
|
|
2520
2707
|
* Environment variable name containing the API key
|
|
2521
2708
|
*/
|
|
@@ -2538,6 +2725,10 @@ interface MCPHostConfig {
|
|
|
2538
2725
|
* @default 10
|
|
2539
2726
|
*/
|
|
2540
2727
|
maxToolCalls?: number;
|
|
2728
|
+
/**
|
|
2729
|
+
* CLI host configuration (required for 'cli' host type).
|
|
2730
|
+
*/
|
|
2731
|
+
cli?: CLIConfig;
|
|
2541
2732
|
}
|
|
2542
2733
|
/**
|
|
2543
2734
|
* A tool call made by the LLM
|
|
@@ -2709,6 +2900,42 @@ interface EvalCase {
|
|
|
2709
2900
|
*/
|
|
2710
2901
|
expect?: EvalExpectBlock;
|
|
2711
2902
|
}
|
|
2903
|
+
/**
|
|
2904
|
+
* Configuration for a single LLM-as-judge evaluation
|
|
2905
|
+
*/
|
|
2906
|
+
interface JudgeExpectConfig {
|
|
2907
|
+
/**
|
|
2908
|
+
* Name of a registered custom judge executor.
|
|
2909
|
+
* When set, the named judge handles evaluation and returns a normalized score.
|
|
2910
|
+
* The `threshold` determines pass/fail. `reps` and LLM config fields
|
|
2911
|
+
* (provider, model, etc.) are ignored.
|
|
2912
|
+
*/
|
|
2913
|
+
judge?: string;
|
|
2914
|
+
/** Built-in rubric name or custom rubric object. Required when no `judge` is specified. */
|
|
2915
|
+
rubric?: BuiltInRubric | {
|
|
2916
|
+
text: string;
|
|
2917
|
+
};
|
|
2918
|
+
/** Reference response to compare against */
|
|
2919
|
+
reference?: unknown;
|
|
2920
|
+
/** Score threshold for passing (0-1, default: 0.7) */
|
|
2921
|
+
threshold?: number;
|
|
2922
|
+
/** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
|
|
2923
|
+
reps?: number;
|
|
2924
|
+
/** Judge provider. @default 'anthropic' */
|
|
2925
|
+
provider?: 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
|
|
2926
|
+
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
2927
|
+
model?: string;
|
|
2928
|
+
/** Environment variable name for API key */
|
|
2929
|
+
apiKeyEnvVar?: string;
|
|
2930
|
+
/** Max tokens for judge response */
|
|
2931
|
+
maxTokens?: number;
|
|
2932
|
+
/** Temperature for judge LLM (0–1) */
|
|
2933
|
+
temperature?: number;
|
|
2934
|
+
/** Max budget in USD per evaluation */
|
|
2935
|
+
maxBudgetUsd?: number;
|
|
2936
|
+
/** Fail if response exceeds this size in bytes before judging */
|
|
2937
|
+
maxToolOutputSize?: number;
|
|
2938
|
+
}
|
|
2712
2939
|
/**
|
|
2713
2940
|
* Unified expectation block for eval cases
|
|
2714
2941
|
*
|
|
@@ -2748,33 +2975,11 @@ interface EvalExpectBlock {
|
|
|
2748
2975
|
isError?: boolean | string | string[];
|
|
2749
2976
|
/**
|
|
2750
2977
|
* LLM-as-judge evaluation (toPassToolJudge)
|
|
2978
|
+
*
|
|
2979
|
+
* Accepts a single judge config or an array for multi-judge evaluation.
|
|
2980
|
+
* When an array is provided, all judges must pass (AND semantics).
|
|
2751
2981
|
*/
|
|
2752
|
-
passesJudge?:
|
|
2753
|
-
/** Built-in rubric name or custom rubric object */
|
|
2754
|
-
rubric: BuiltInRubric | {
|
|
2755
|
-
text: string;
|
|
2756
|
-
};
|
|
2757
|
-
/** Reference response to compare against */
|
|
2758
|
-
reference?: unknown;
|
|
2759
|
-
/** Score threshold for passing (0-1, default: 0.7) */
|
|
2760
|
-
threshold?: number;
|
|
2761
|
-
/** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
|
|
2762
|
-
reps?: number;
|
|
2763
|
-
/** Judge provider. @default 'anthropic' */
|
|
2764
|
-
provider?: 'anthropic' | 'openai' | 'google';
|
|
2765
|
-
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
2766
|
-
model?: string;
|
|
2767
|
-
/** Environment variable name for API key */
|
|
2768
|
-
apiKeyEnvVar?: string;
|
|
2769
|
-
/** Max tokens for judge response */
|
|
2770
|
-
maxTokens?: number;
|
|
2771
|
-
/** Temperature for judge LLM (0–1) */
|
|
2772
|
-
temperature?: number;
|
|
2773
|
-
/** Max budget in USD per evaluation */
|
|
2774
|
-
maxBudgetUsd?: number;
|
|
2775
|
-
/** Fail if response exceeds this size in bytes before judging */
|
|
2776
|
-
maxToolOutputSize?: number;
|
|
2777
|
-
};
|
|
2982
|
+
passesJudge?: JudgeExpectConfig | JudgeExpectConfig[];
|
|
2778
2983
|
/**
|
|
2779
2984
|
* Response size validation (toHaveToolResponseSize)
|
|
2780
2985
|
*/
|
|
@@ -2859,7 +3064,13 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2859
3064
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2860
3065
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2861
3066
|
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2862
|
-
|
|
3067
|
+
hostType: z.ZodOptional<z.ZodEnum<{
|
|
3068
|
+
sdk: "sdk";
|
|
3069
|
+
cli: "cli";
|
|
3070
|
+
browser: "browser";
|
|
3071
|
+
desktop: "desktop";
|
|
3072
|
+
}>>;
|
|
3073
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
2863
3074
|
openai: "openai";
|
|
2864
3075
|
anthropic: "anthropic";
|
|
2865
3076
|
azure: "azure";
|
|
@@ -2869,12 +3080,21 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2869
3080
|
openrouter: "openrouter";
|
|
2870
3081
|
xai: "xai";
|
|
2871
3082
|
"vertex-anthropic": "vertex-anthropic";
|
|
2872
|
-
}
|
|
3083
|
+
}>>;
|
|
2873
3084
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2874
3085
|
model: z.ZodOptional<z.ZodString>;
|
|
2875
3086
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2876
3087
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2877
3088
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3089
|
+
cli: z.ZodOptional<z.ZodObject<{
|
|
3090
|
+
command: z.ZodString;
|
|
3091
|
+
args: z.ZodArray<z.ZodString>;
|
|
3092
|
+
outputFormat: z.ZodOptional<z.ZodEnum<{
|
|
3093
|
+
json: "json";
|
|
3094
|
+
"stream-json": "stream-json";
|
|
3095
|
+
}>>;
|
|
3096
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
3097
|
+
}, z.core.$strip>>;
|
|
2878
3098
|
}, z.core.$strip>>;
|
|
2879
3099
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2880
3100
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2901,8 +3121,9 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2901
3121
|
remove: z.ZodArray<z.ZodString>;
|
|
2902
3122
|
}, z.core.$strip>]>>>;
|
|
2903
3123
|
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
2904
|
-
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2905
|
-
|
|
3124
|
+
passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
|
|
3125
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3126
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
2906
3127
|
correctness: "correctness";
|
|
2907
3128
|
completeness: "completeness";
|
|
2908
3129
|
groundedness: "groundedness";
|
|
@@ -2910,7 +3131,7 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2910
3131
|
conciseness: "conciseness";
|
|
2911
3132
|
}>, z.ZodObject<{
|
|
2912
3133
|
text: z.ZodString;
|
|
2913
|
-
}, z.core.$strip>]
|
|
3134
|
+
}, z.core.$strip>]>>;
|
|
2914
3135
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2915
3136
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2916
3137
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2918,6 +3139,8 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2918
3139
|
openai: "openai";
|
|
2919
3140
|
anthropic: "anthropic";
|
|
2920
3141
|
google: "google";
|
|
3142
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3143
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
2921
3144
|
}>>;
|
|
2922
3145
|
model: z.ZodOptional<z.ZodString>;
|
|
2923
3146
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -2925,7 +3148,34 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2925
3148
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2926
3149
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
2927
3150
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
2928
|
-
}, z.core.$strip
|
|
3151
|
+
}, z.core.$strip>, z.ZodArray<z.ZodObject<{
|
|
3152
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3153
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
3154
|
+
correctness: "correctness";
|
|
3155
|
+
completeness: "completeness";
|
|
3156
|
+
groundedness: "groundedness";
|
|
3157
|
+
"instruction-following": "instruction-following";
|
|
3158
|
+
conciseness: "conciseness";
|
|
3159
|
+
}>, z.ZodObject<{
|
|
3160
|
+
text: z.ZodString;
|
|
3161
|
+
}, z.core.$strip>]>>;
|
|
3162
|
+
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3163
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3164
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3165
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
3166
|
+
openai: "openai";
|
|
3167
|
+
anthropic: "anthropic";
|
|
3168
|
+
google: "google";
|
|
3169
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3170
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3171
|
+
}>>;
|
|
3172
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3173
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3174
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3175
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3176
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3177
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3178
|
+
}, z.core.$strip>>]>>;
|
|
2929
3179
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2930
3180
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
2931
3181
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2966,7 +3216,13 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2966
3216
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2967
3217
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2968
3218
|
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2969
|
-
|
|
3219
|
+
hostType: z.ZodOptional<z.ZodEnum<{
|
|
3220
|
+
sdk: "sdk";
|
|
3221
|
+
cli: "cli";
|
|
3222
|
+
browser: "browser";
|
|
3223
|
+
desktop: "desktop";
|
|
3224
|
+
}>>;
|
|
3225
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
2970
3226
|
openai: "openai";
|
|
2971
3227
|
anthropic: "anthropic";
|
|
2972
3228
|
azure: "azure";
|
|
@@ -2976,12 +3232,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2976
3232
|
openrouter: "openrouter";
|
|
2977
3233
|
xai: "xai";
|
|
2978
3234
|
"vertex-anthropic": "vertex-anthropic";
|
|
2979
|
-
}
|
|
3235
|
+
}>>;
|
|
2980
3236
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2981
3237
|
model: z.ZodOptional<z.ZodString>;
|
|
2982
3238
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2983
3239
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2984
3240
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3241
|
+
cli: z.ZodOptional<z.ZodObject<{
|
|
3242
|
+
command: z.ZodString;
|
|
3243
|
+
args: z.ZodArray<z.ZodString>;
|
|
3244
|
+
outputFormat: z.ZodOptional<z.ZodEnum<{
|
|
3245
|
+
json: "json";
|
|
3246
|
+
"stream-json": "stream-json";
|
|
3247
|
+
}>>;
|
|
3248
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
3249
|
+
}, z.core.$strip>>;
|
|
2985
3250
|
}, z.core.$strip>>;
|
|
2986
3251
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2987
3252
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
@@ -3008,8 +3273,9 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3008
3273
|
remove: z.ZodArray<z.ZodString>;
|
|
3009
3274
|
}, z.core.$strip>]>>>;
|
|
3010
3275
|
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3011
|
-
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
3012
|
-
|
|
3276
|
+
passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
|
|
3277
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3278
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
3013
3279
|
correctness: "correctness";
|
|
3014
3280
|
completeness: "completeness";
|
|
3015
3281
|
groundedness: "groundedness";
|
|
@@ -3017,7 +3283,7 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3017
3283
|
conciseness: "conciseness";
|
|
3018
3284
|
}>, z.ZodObject<{
|
|
3019
3285
|
text: z.ZodString;
|
|
3020
|
-
}, z.core.$strip>]
|
|
3286
|
+
}, z.core.$strip>]>>;
|
|
3021
3287
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3022
3288
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3023
3289
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
@@ -3025,6 +3291,8 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3025
3291
|
openai: "openai";
|
|
3026
3292
|
anthropic: "anthropic";
|
|
3027
3293
|
google: "google";
|
|
3294
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3295
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3028
3296
|
}>>;
|
|
3029
3297
|
model: z.ZodOptional<z.ZodString>;
|
|
3030
3298
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3032,7 +3300,34 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3032
3300
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3033
3301
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3034
3302
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3035
|
-
}, z.core.$strip
|
|
3303
|
+
}, z.core.$strip>, z.ZodArray<z.ZodObject<{
|
|
3304
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3305
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
3306
|
+
correctness: "correctness";
|
|
3307
|
+
completeness: "completeness";
|
|
3308
|
+
groundedness: "groundedness";
|
|
3309
|
+
"instruction-following": "instruction-following";
|
|
3310
|
+
conciseness: "conciseness";
|
|
3311
|
+
}>, z.ZodObject<{
|
|
3312
|
+
text: z.ZodString;
|
|
3313
|
+
}, z.core.$strip>]>>;
|
|
3314
|
+
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3315
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3316
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3317
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
3318
|
+
openai: "openai";
|
|
3319
|
+
anthropic: "anthropic";
|
|
3320
|
+
google: "google";
|
|
3321
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3322
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3323
|
+
}>>;
|
|
3324
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3325
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3326
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3327
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3328
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3329
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3330
|
+
}, z.core.$strip>>]>>;
|
|
3036
3331
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
3037
3332
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
3038
3333
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -3268,6 +3563,23 @@ interface IterationResult {
|
|
|
3268
3563
|
}>;
|
|
3269
3564
|
};
|
|
3270
3565
|
}
|
|
3566
|
+
/**
|
|
3567
|
+
* Request data captured from the eval case input.
|
|
3568
|
+
* Preserves what was sent so results are self-contained for debugging.
|
|
3569
|
+
*/
|
|
3570
|
+
interface EvalCaseRequest {
|
|
3571
|
+
/** Human-readable description of the case */
|
|
3572
|
+
description?: string;
|
|
3573
|
+
/** Tool arguments (direct mode) */
|
|
3574
|
+
args?: Record<string, unknown>;
|
|
3575
|
+
/** Natural language scenario sent to the LLM (mcp_host mode) */
|
|
3576
|
+
scenario?: string;
|
|
3577
|
+
/** LLM provider/model configuration (mcp_host mode) */
|
|
3578
|
+
mcpHostConfig?: {
|
|
3579
|
+
provider?: string;
|
|
3580
|
+
model?: string;
|
|
3581
|
+
};
|
|
3582
|
+
}
|
|
3271
3583
|
/**
|
|
3272
3584
|
* Result of a single eval case
|
|
3273
3585
|
*/
|
|
@@ -3292,6 +3604,11 @@ interface EvalCaseResult {
|
|
|
3292
3604
|
* Overall pass/fail status
|
|
3293
3605
|
*/
|
|
3294
3606
|
pass: boolean;
|
|
3607
|
+
/**
|
|
3608
|
+
* Request data from the eval case input (tool args, scenario, LLM config).
|
|
3609
|
+
* Populated so results are self-contained for debugging without the original dataset.
|
|
3610
|
+
*/
|
|
3611
|
+
request?: EvalCaseRequest;
|
|
3295
3612
|
/**
|
|
3296
3613
|
* Tool response
|
|
3297
3614
|
*/
|
|
@@ -3835,24 +4152,31 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
3835
4152
|
* schemas, testing discoverability and parameter clarity at the level a real
|
|
3836
4153
|
* user (via Claude Desktop, ChatGPT, etc.) would experience.
|
|
3837
4154
|
*
|
|
3838
|
-
*
|
|
3839
|
-
* which handles multi-turn tool calling natively and provides per-step latency
|
|
3840
|
-
* decomposition (llmDurationMs vs. mcpDurationMs).
|
|
3841
|
-
*
|
|
3842
|
-
* @param mcp - MCP fixture API
|
|
4155
|
+
* @param mcp - MCP fixture API (used by SDK hosts; ignored by CLI/browser hosts which establish their own connections)
|
|
3843
4156
|
* @param scenario - Natural language prompt describing what the LLM should do
|
|
3844
4157
|
* @param config - MCP host configuration (provider, model, temperature, etc.)
|
|
3845
4158
|
* @returns Simulation result with tool calls, final response, and latency data
|
|
3846
4159
|
*
|
|
3847
4160
|
* @example
|
|
3848
4161
|
* ```typescript
|
|
4162
|
+
* // SDK host (default) — uses the framework's existing MCP connection
|
|
3849
4163
|
* const result = await simulateMCPHost(mcp,
|
|
3850
4164
|
* "Find recent documents about MCP testing frameworks",
|
|
3851
4165
|
* { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
|
|
3852
4166
|
* );
|
|
3853
4167
|
*
|
|
3854
|
-
*
|
|
3855
|
-
*
|
|
4168
|
+
* // CLI host — spawns a CLI process with its own MCP connection
|
|
4169
|
+
* const result = await simulateMCPHost(mcp,
|
|
4170
|
+
* "Find recent documents about MCP testing frameworks",
|
|
4171
|
+
* {
|
|
4172
|
+
* hostType: 'cli',
|
|
4173
|
+
* provider: 'anthropic',
|
|
4174
|
+
* cli: {
|
|
4175
|
+
* command: 'claude',
|
|
4176
|
+
* args: ['-p', '{{scenario}}', '--output-format', 'stream-json', '--verbose'],
|
|
4177
|
+
* },
|
|
4178
|
+
* }
|
|
4179
|
+
* );
|
|
3856
4180
|
* ```
|
|
3857
4181
|
*/
|
|
3858
4182
|
declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
|
|
@@ -3905,6 +4229,99 @@ declare function getMissingDependencyMessage(provider: LLMProvider): string;
|
|
|
3905
4229
|
*/
|
|
3906
4230
|
declare function createJudge(config?: JudgeConfig): Judge;
|
|
3907
4231
|
|
|
4232
|
+
/**
|
|
4233
|
+
* Custom Judge Registry
|
|
4234
|
+
*
|
|
4235
|
+
* Allows consumers to register named judge executors that can be referenced
|
|
4236
|
+
* by string ID in eval fixtures and programmatic tests. This enables
|
|
4237
|
+
* multi-step judge pipelines (LLM call + post-processing), custom scoring
|
|
4238
|
+
* logic, and reusable judge configurations without duplicating rubrics.
|
|
4239
|
+
*/
|
|
4240
|
+
/**
|
|
4241
|
+
* Result returned by a custom judge executor.
|
|
4242
|
+
*
|
|
4243
|
+
* Custom judges must return a normalized score (0–1). The framework applies
|
|
4244
|
+
* the caller's `threshold` (default 0.7) to determine pass/fail. This keeps
|
|
4245
|
+
* judges reusable — the same judge can be used with different thresholds in
|
|
4246
|
+
* different tests.
|
|
4247
|
+
*/
|
|
4248
|
+
interface CustomJudgeResult {
|
|
4249
|
+
/** Normalized score (0–1, where 1 is best) */
|
|
4250
|
+
score: number;
|
|
4251
|
+
/** Optional reasoning/explanation */
|
|
4252
|
+
reasoning?: string;
|
|
4253
|
+
}
|
|
4254
|
+
/**
|
|
4255
|
+
* A user-defined judge executor function.
|
|
4256
|
+
*
|
|
4257
|
+
* Custom executors own their entire evaluation pipeline — prompt construction,
|
|
4258
|
+
* LLM calls, and post-processing — but return a normalized score. The framework
|
|
4259
|
+
* determines pass/fail by comparing the score against the caller's threshold.
|
|
4260
|
+
*
|
|
4261
|
+
* @param candidate - The actual response to evaluate
|
|
4262
|
+
* @param reference - Optional reference/expected response
|
|
4263
|
+
* @returns Evaluation result with a normalized score and optional reasoning
|
|
4264
|
+
*
|
|
4265
|
+
* @example
|
|
4266
|
+
* ```typescript
|
|
4267
|
+
* const completenessJudge: CustomJudgeExecutor = async (candidate, reference) => {
|
|
4268
|
+
* // Step 1: LLM call with your own prompt and schema
|
|
4269
|
+
* const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
|
|
4270
|
+
* const { verdict, reasoning } = JSON.parse(llmResult);
|
|
4271
|
+
*
|
|
4272
|
+
* // Step 2: Deterministic post-processing into a normalized score
|
|
4273
|
+
* const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
|
|
4274
|
+
*
|
|
4275
|
+
* return { score, reasoning };
|
|
4276
|
+
* };
|
|
4277
|
+
* ```
|
|
4278
|
+
*/
|
|
4279
|
+
type CustomJudgeExecutor = (candidate: unknown, reference?: unknown) => Promise<CustomJudgeResult>;
|
|
4280
|
+
/**
|
|
4281
|
+
* Registers a named custom judge executor.
|
|
4282
|
+
*
|
|
4283
|
+
* Call this in your test setup (e.g., `playwright.config.ts` or a global setup file)
|
|
4284
|
+
* before tests run. The name can then be referenced in JSON eval fixtures via the
|
|
4285
|
+
* `judge` field on `passesJudge`.
|
|
4286
|
+
*
|
|
4287
|
+
* @param name - Unique identifier for the judge
|
|
4288
|
+
* @param executor - The judge executor function
|
|
4289
|
+
* @throws {Error} If a judge with the same name is already registered
|
|
4290
|
+
*
|
|
4291
|
+
* @example
|
|
4292
|
+
* ```typescript
|
|
4293
|
+
* import { registerJudge } from '@gleanwork/mcp-server-tester';
|
|
4294
|
+
*
|
|
4295
|
+
* registerJudge('glean-completeness', async (candidate, reference) => {
|
|
4296
|
+
* // Step 1: LLM call with your own prompt and schema
|
|
4297
|
+
* const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
|
|
4298
|
+
* const { verdict, reasoning } = JSON.parse(llmResult);
|
|
4299
|
+
*
|
|
4300
|
+
* // Step 2: Deterministic post-processing into a normalized score
|
|
4301
|
+
* const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
|
|
4302
|
+
*
|
|
4303
|
+
* return { score, reasoning };
|
|
4304
|
+
* });
|
|
4305
|
+
*
|
|
4306
|
+
* // Then in tests — same judge, different thresholds:
|
|
4307
|
+
* // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.8 });
|
|
4308
|
+
* // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.5 });
|
|
4309
|
+
* ```
|
|
4310
|
+
*/
|
|
4311
|
+
declare function registerJudge(name: string, executor: CustomJudgeExecutor): void;
|
|
4312
|
+
/**
|
|
4313
|
+
* Retrieves a registered custom judge executor by name.
|
|
4314
|
+
*
|
|
4315
|
+
* @param name - The judge name to look up
|
|
4316
|
+
* @returns The registered executor
|
|
4317
|
+
* @throws {Error} If no judge with the given name is registered
|
|
4318
|
+
*/
|
|
4319
|
+
declare function getRegisteredJudge(name: string): CustomJudgeExecutor;
|
|
4320
|
+
/**
|
|
4321
|
+
* Clears all registered judges. Intended for test teardown.
|
|
4322
|
+
*/
|
|
4323
|
+
declare function clearJudgeRegistry(): void;
|
|
4324
|
+
|
|
3908
4325
|
/**
|
|
3909
4326
|
* Options for conformance checks
|
|
3910
4327
|
*/
|
|
@@ -4066,4 +4483,4 @@ interface MCPEvalReporterConfig {
|
|
|
4066
4483
|
includeAutoTracking?: boolean;
|
|
4067
4484
|
}
|
|
4068
4485
|
|
|
4069
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
4486
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseRequest, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunMetadata, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|