@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +284 -24
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +649 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +504 -115
- package/dist/index.d.ts +504 -115
- package/dist/index.js +648 -64
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +12 -7
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/index.d.cts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { z, ZodType } from 'zod';
|
|
2
|
+
import { Page, TestInfo, Expect } from '@playwright/test';
|
|
2
3
|
import { OAuthClientProvider } from '@modelcontextprotocol/sdk/client/auth.js';
|
|
3
4
|
import { OAuthClientMetadata, OAuthClientInformationFull, OAuthTokens } from '@modelcontextprotocol/sdk/shared/auth.js';
|
|
4
5
|
import * as oauth from 'oauth4webapi';
|
|
5
6
|
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
6
7
|
import { CallToolResult, Tool, Implementation, ServerCapabilities, Resource, Prompt } from '@modelcontextprotocol/sdk/types.js';
|
|
7
|
-
import { TestInfo, Expect } from '@playwright/test';
|
|
8
8
|
import * as playwright_test from 'playwright/test';
|
|
9
9
|
|
|
10
10
|
/**
|
|
@@ -311,6 +311,7 @@ declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
|
|
|
311
311
|
/**
|
|
312
312
|
* Auth types for MCP OAuth integration
|
|
313
313
|
*/
|
|
314
|
+
|
|
314
315
|
/**
|
|
315
316
|
* Stored OAuth tokens
|
|
316
317
|
*/
|
|
@@ -384,70 +385,90 @@ interface StoredOAuthState {
|
|
|
384
385
|
savedAt: number;
|
|
385
386
|
}
|
|
386
387
|
/**
|
|
387
|
-
*
|
|
388
|
+
* Login form selectors for standard OAuth login automation
|
|
389
|
+
*/
|
|
390
|
+
interface OAuthLoginSelectors {
|
|
391
|
+
/** Selector for username/email input field */
|
|
392
|
+
usernameInput: string;
|
|
393
|
+
/** Selector for password input field */
|
|
394
|
+
passwordInput: string;
|
|
395
|
+
/** Selector for login submit button */
|
|
396
|
+
submitButton: string;
|
|
397
|
+
/** Selector for consent/authorize button (optional) */
|
|
398
|
+
consentButton?: string;
|
|
399
|
+
}
|
|
400
|
+
/**
|
|
401
|
+
* Base configuration shared by all OAuth setup strategies
|
|
388
402
|
*/
|
|
389
|
-
interface
|
|
390
|
-
/**
|
|
391
|
-
* OAuth authorization server metadata URL
|
|
392
|
-
*/
|
|
403
|
+
interface OAuthSetupBaseConfig {
|
|
404
|
+
/** OAuth authorization server metadata URL */
|
|
393
405
|
authServerUrl: string;
|
|
394
|
-
/**
|
|
395
|
-
* Scopes to request
|
|
396
|
-
*/
|
|
406
|
+
/** Scopes to request */
|
|
397
407
|
scopes: Array<string>;
|
|
398
|
-
/**
|
|
399
|
-
|
|
400
|
-
|
|
408
|
+
/** Path to save OAuth state file */
|
|
409
|
+
outputPath: string;
|
|
410
|
+
/** Pre-registered client ID (optional, uses DCR if not provided) */
|
|
411
|
+
clientId?: string;
|
|
412
|
+
/** Pre-registered client secret (optional) */
|
|
413
|
+
clientSecret?: string;
|
|
414
|
+
/** Redirect URI for OAuth callback */
|
|
415
|
+
redirectUri?: string;
|
|
416
|
+
/** Resource indicator (RFC 8707) */
|
|
401
417
|
resource?: string;
|
|
402
|
-
/**
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
passwordInput: string;
|
|
414
|
-
/**
|
|
415
|
-
* Selector for login submit button
|
|
416
|
-
*/
|
|
417
|
-
submitButton: string;
|
|
418
|
-
/**
|
|
419
|
-
* Selector for consent/authorize button (optional)
|
|
420
|
-
*/
|
|
421
|
-
consentButton?: string;
|
|
422
|
-
};
|
|
423
|
-
/**
|
|
424
|
-
* Test user credentials
|
|
425
|
-
*/
|
|
418
|
+
/** Timeout for login flow in milliseconds (default: 30000) */
|
|
419
|
+
timeoutMs?: number;
|
|
420
|
+
}
|
|
421
|
+
/**
|
|
422
|
+
* Standard login strategy: automates a form with username, password, and submit button.
|
|
423
|
+
* Use when the IdP presents all login fields on a single page.
|
|
424
|
+
*/
|
|
425
|
+
interface StandardLoginConfig {
|
|
426
|
+
/** Login form selectors for Playwright automation */
|
|
427
|
+
loginSelectors: OAuthLoginSelectors;
|
|
428
|
+
/** Test user credentials */
|
|
426
429
|
credentials: {
|
|
427
430
|
username: string;
|
|
428
431
|
password: string;
|
|
429
432
|
};
|
|
433
|
+
customLoginFlow?: never;
|
|
434
|
+
}
|
|
435
|
+
/**
|
|
436
|
+
* Custom login strategy: full control over the browser-based login flow.
|
|
437
|
+
* Use for multi-step logins, MFA, custom consent screens, or any flow
|
|
438
|
+
* that doesn't fit the standard username/password/submit pattern.
|
|
439
|
+
*
|
|
440
|
+
* The callback receives a Playwright Page already navigated to the OAuth
|
|
441
|
+
* authorization URL. Complete the login so the IdP redirects to the
|
|
442
|
+
* callback URL — `performOAuthSetup` handles PKCE, token exchange,
|
|
443
|
+
* and state persistence automatically.
|
|
444
|
+
*/
|
|
445
|
+
interface CustomLoginConfig {
|
|
430
446
|
/**
|
|
431
|
-
*
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
*
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
*
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
*
|
|
444
|
-
*/
|
|
445
|
-
redirectUri?: string;
|
|
446
|
-
/**
|
|
447
|
-
* Timeout for login flow in milliseconds (default: 30000)
|
|
447
|
+
* Custom Playwright automation for the IdP login flow.
|
|
448
|
+
*
|
|
449
|
+
* @param page - Playwright Page already navigated to the OAuth authorization URL
|
|
450
|
+
*
|
|
451
|
+
* @example
|
|
452
|
+
* ```typescript
|
|
453
|
+
* customLoginFlow: async (page) => {
|
|
454
|
+
* await page.fill('#username', process.env.TEST_USER!);
|
|
455
|
+
* await page.click('#continue');
|
|
456
|
+
* await page.fill('#password', process.env.TEST_PASS!);
|
|
457
|
+
* await page.click('#submit');
|
|
458
|
+
* }
|
|
459
|
+
* ```
|
|
448
460
|
*/
|
|
449
|
-
|
|
461
|
+
customLoginFlow: (page: Page) => Promise<void>;
|
|
462
|
+
loginSelectors?: never;
|
|
463
|
+
credentials?: never;
|
|
450
464
|
}
|
|
465
|
+
/**
|
|
466
|
+
* Configuration for OAuth setup flow.
|
|
467
|
+
*
|
|
468
|
+
* Provide either `loginSelectors` + `credentials` for standard form-based login,
|
|
469
|
+
* or `customLoginFlow` for full control over the browser automation.
|
|
470
|
+
*/
|
|
471
|
+
type OAuthSetupConfig = OAuthSetupBaseConfig & (StandardLoginConfig | CustomLoginConfig);
|
|
451
472
|
/**
|
|
452
473
|
* Result of token exchange or refresh
|
|
453
474
|
*/
|
|
@@ -1632,7 +1653,7 @@ interface UsageMetrics {
|
|
|
1632
1653
|
cacheCreationInputTokens?: number;
|
|
1633
1654
|
}
|
|
1634
1655
|
/** Valid LLM judge provider kinds. */
|
|
1635
|
-
type ProviderKind = 'anthropic' | 'openai' | 'google';
|
|
1656
|
+
type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
|
|
1636
1657
|
/**
|
|
1637
1658
|
* Configuration for an LLM judge
|
|
1638
1659
|
*/
|
|
@@ -1744,8 +1765,11 @@ interface Judge {
|
|
|
1744
1765
|
* Configuration for the judge validator
|
|
1745
1766
|
*/
|
|
1746
1767
|
interface JudgeValidatorConfig {
|
|
1747
|
-
/**
|
|
1748
|
-
|
|
1768
|
+
/**
|
|
1769
|
+
* The evaluation rubric: a built-in name or custom { text: string }.
|
|
1770
|
+
* Required when no named `judge` is specified.
|
|
1771
|
+
*/
|
|
1772
|
+
rubric?: RubricSpec;
|
|
1749
1773
|
/** Optional reference response to compare against */
|
|
1750
1774
|
reference?: unknown;
|
|
1751
1775
|
/** Minimum score required to pass (0-1, default: 0.7) */
|
|
@@ -1766,6 +1790,13 @@ interface JudgeValidatorConfig {
|
|
|
1766
1790
|
maxBudgetUsd?: number;
|
|
1767
1791
|
/** Fail if response exceeds this size in bytes before judging */
|
|
1768
1792
|
maxToolOutputSize?: number;
|
|
1793
|
+
/**
|
|
1794
|
+
* Name of a registered custom judge executor.
|
|
1795
|
+
* When set, the named judge handles the entire evaluation pipeline
|
|
1796
|
+
* and returns a normalized score. The `threshold` determines pass/fail.
|
|
1797
|
+
* Register judges with `registerJudge()` before tests run.
|
|
1798
|
+
*/
|
|
1799
|
+
judge?: string;
|
|
1769
1800
|
}
|
|
1770
1801
|
declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
|
|
1771
1802
|
|
|
@@ -1823,6 +1854,12 @@ interface JudgeMatcherOptions {
|
|
|
1823
1854
|
provider?: ProviderKind;
|
|
1824
1855
|
/** Override the judge model */
|
|
1825
1856
|
model?: string;
|
|
1857
|
+
/**
|
|
1858
|
+
* Name of a registered custom judge executor.
|
|
1859
|
+
* When set, the named judge handles the entire evaluation pipeline
|
|
1860
|
+
* and its `pass` result is authoritative.
|
|
1861
|
+
*/
|
|
1862
|
+
judge?: string;
|
|
1826
1863
|
}
|
|
1827
1864
|
/**
|
|
1828
1865
|
* Declaration merging for Playwright matchers
|
|
@@ -1913,21 +1950,30 @@ declare global {
|
|
|
1913
1950
|
*/
|
|
1914
1951
|
toBeToolError(expected?: boolean | string | string[]): R;
|
|
1915
1952
|
/**
|
|
1916
|
-
* Validates that a response passes LLM-as-judge evaluation
|
|
1953
|
+
* Validates that a response passes LLM-as-judge evaluation.
|
|
1917
1954
|
*
|
|
1918
|
-
*
|
|
1919
|
-
*
|
|
1955
|
+
* Two call signatures:
|
|
1956
|
+
* - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
|
|
1957
|
+
* - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
|
|
1920
1958
|
*
|
|
1921
1959
|
* @example
|
|
1922
1960
|
* ```typescript
|
|
1961
|
+
* // Built-in LLM judge with rubric
|
|
1923
1962
|
* expect(result).toPassToolJudge('Response should be helpful and accurate');
|
|
1924
|
-
* expect(result).toPassToolJudge('
|
|
1963
|
+
* expect(result).toPassToolJudge('correctness', {
|
|
1925
1964
|
* reference: expectedOutput,
|
|
1926
1965
|
* passingThreshold: 0.8,
|
|
1927
1966
|
* });
|
|
1967
|
+
*
|
|
1968
|
+
* // Named custom judge (registered via registerJudge)
|
|
1969
|
+
* expect(result).toPassToolJudge({ judge: 'glean-completeness' });
|
|
1928
1970
|
* ```
|
|
1929
1971
|
*/
|
|
1930
1972
|
toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
|
|
1973
|
+
toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
|
|
1974
|
+
toPassToolJudge(judges: Array<JudgeMatcherOptions & {
|
|
1975
|
+
rubric?: RubricSpec;
|
|
1976
|
+
}>): Promise<R>;
|
|
1931
1977
|
/**
|
|
1932
1978
|
* Validates that a response meets size constraints
|
|
1933
1979
|
*
|
|
@@ -2050,6 +2096,33 @@ interface EvalExpectationResult {
|
|
|
2050
2096
|
* Optional details about the result
|
|
2051
2097
|
*/
|
|
2052
2098
|
details?: string;
|
|
2099
|
+
/**
|
|
2100
|
+
* Judge score (0-1). Populated for passesJudge expectations.
|
|
2101
|
+
*/
|
|
2102
|
+
score?: number;
|
|
2103
|
+
/**
|
|
2104
|
+
* Judge reasoning. Populated for passesJudge expectations.
|
|
2105
|
+
*/
|
|
2106
|
+
reasoning?: string;
|
|
2107
|
+
/**
|
|
2108
|
+
* Judge name — rubric name (e.g. 'correctness') or custom judge name.
|
|
2109
|
+
* Populated for passesJudge expectations.
|
|
2110
|
+
*/
|
|
2111
|
+
judgeName?: string;
|
|
2112
|
+
/**
|
|
2113
|
+
* Judge provider used. Populated for passesJudge expectations.
|
|
2114
|
+
*/
|
|
2115
|
+
judgeProvider?: string;
|
|
2116
|
+
/**
|
|
2117
|
+
* Judge model used. Populated for passesJudge expectations.
|
|
2118
|
+
*/
|
|
2119
|
+
judgeModel?: string;
|
|
2120
|
+
/**
|
|
2121
|
+
* Per-judge breakdown when multiple judges are used.
|
|
2122
|
+
* Each entry contains the individual judge's result.
|
|
2123
|
+
* Only populated when passesJudge is an array with 2+ entries.
|
|
2124
|
+
*/
|
|
2125
|
+
judgeResults?: EvalExpectationResult[];
|
|
2053
2126
|
}
|
|
2054
2127
|
/**
|
|
2055
2128
|
* Map of expectation type to result
|
|
@@ -2274,16 +2347,26 @@ declare function toBeToolError(this: {
|
|
|
2274
2347
|
* Validates that a response passes LLM-as-judge evaluation.
|
|
2275
2348
|
* Delegates evaluation logic to validateJudge() for consistency
|
|
2276
2349
|
* with the validator/matcher duality pattern.
|
|
2350
|
+
*
|
|
2351
|
+
* Supports three call signatures:
|
|
2352
|
+
* - toPassToolJudge(rubric, options?) — built-in LLM judge with rubric
|
|
2353
|
+
* - toPassToolJudge({ judge: 'name', ... }) — named custom judge
|
|
2354
|
+
* - toPassToolJudge([...judges]) — multi-judge (all must pass)
|
|
2277
2355
|
*/
|
|
2278
2356
|
|
|
2279
2357
|
/**
|
|
2280
|
-
*
|
|
2358
|
+
* The toPassToolJudge matcher function.
|
|
2281
2359
|
*
|
|
2282
|
-
*
|
|
2360
|
+
* Accepts either:
|
|
2361
|
+
* (received, rubric, options?) — rubric-based LLM judge
|
|
2362
|
+
* (received, options) — named custom judge (options.judge required)
|
|
2363
|
+
* (received, judges[]) — multi-judge (all must pass)
|
|
2283
2364
|
*/
|
|
2284
2365
|
declare function toPassToolJudge(this: {
|
|
2285
2366
|
isNot: boolean;
|
|
2286
|
-
}, received: unknown,
|
|
2367
|
+
}, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
|
|
2368
|
+
rubric?: RubricSpec;
|
|
2369
|
+
}>, maybeOptions?: JudgeMatcherOptions): Promise<{
|
|
2287
2370
|
pass: boolean;
|
|
2288
2371
|
message: () => string;
|
|
2289
2372
|
}>;
|
|
@@ -2485,10 +2568,19 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
|
|
|
2485
2568
|
*/
|
|
2486
2569
|
|
|
2487
2570
|
/**
|
|
2488
|
-
*
|
|
2571
|
+
* Host type for MCP host simulation.
|
|
2572
|
+
*
|
|
2573
|
+
* - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
|
|
2574
|
+
* - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
|
|
2575
|
+
* - 'browser': Web-based hosts (e.g., claude.ai). Uses Playwright/CDP. (Not yet implemented.)
|
|
2576
|
+
* - 'desktop': Desktop app hosts (e.g., Claude Desktop). Uses computer use. (Not yet implemented.)
|
|
2577
|
+
*/
|
|
2578
|
+
type HostType = 'sdk' | 'cli' | 'browser' | 'desktop';
|
|
2579
|
+
/**
|
|
2580
|
+
* LLM provider for SDK-based host simulation.
|
|
2489
2581
|
*
|
|
2490
|
-
*
|
|
2491
|
-
*
|
|
2582
|
+
* Each provider runs through the Vercel AI SDK (`ai` package)
|
|
2583
|
+
* and requires its corresponding @ai-sdk/* package:
|
|
2492
2584
|
*
|
|
2493
2585
|
* openai → npm install ai @ai-sdk/openai
|
|
2494
2586
|
* anthropic → npm install ai @ai-sdk/anthropic
|
|
@@ -2508,14 +2600,81 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
|
|
|
2508
2600
|
* @example model: 'claude-3-5-haiku@20241022'
|
|
2509
2601
|
*/
|
|
2510
2602
|
| 'vertex-anthropic';
|
|
2603
|
+
/**
|
|
2604
|
+
* Output format for CLI host processes.
|
|
2605
|
+
*
|
|
2606
|
+
* - 'stream-json': NDJSON (one JSON object per line). Used by Claude Code (`--output-format stream-json`).
|
|
2607
|
+
* - 'json': Single JSON object on stdout.
|
|
2608
|
+
*/
|
|
2609
|
+
type CLIOutputFormat = 'stream-json' | 'json';
|
|
2610
|
+
/**
|
|
2611
|
+
* Configuration for a CLI host process.
|
|
2612
|
+
*
|
|
2613
|
+
* The process is spawned directly (no shell) with `command` and `args`.
|
|
2614
|
+
* Use `{{scenario}}` in any args entry as a placeholder for the natural
|
|
2615
|
+
* language prompt — the framework replaces it before spawning.
|
|
2616
|
+
*
|
|
2617
|
+
* Because args are passed directly to the process (not through a shell),
|
|
2618
|
+
* special characters in the scenario (quotes, newlines, `$`, etc.) are
|
|
2619
|
+
* handled safely without escaping.
|
|
2620
|
+
*
|
|
2621
|
+
* @example Claude Code
|
|
2622
|
+
* ```json
|
|
2623
|
+
* {
|
|
2624
|
+
* "command": "claude",
|
|
2625
|
+
* "args": ["-p", "{{scenario}}", "--output-format", "stream-json",
|
|
2626
|
+
* "--verbose", "--mcp-config", "{...}"]
|
|
2627
|
+
* }
|
|
2628
|
+
* ```
|
|
2629
|
+
*
|
|
2630
|
+
* @example Custom CLI
|
|
2631
|
+
* ```json
|
|
2632
|
+
* {
|
|
2633
|
+
* "command": "my-agent",
|
|
2634
|
+
* "args": ["--prompt", "{{scenario}}", "--config", "./mcp.json"],
|
|
2635
|
+
* "outputFormat": "json"
|
|
2636
|
+
* }
|
|
2637
|
+
* ```
|
|
2638
|
+
*/
|
|
2639
|
+
interface CLIConfig {
|
|
2640
|
+
/**
|
|
2641
|
+
* CLI binary to invoke.
|
|
2642
|
+
*/
|
|
2643
|
+
command: string;
|
|
2644
|
+
/**
|
|
2645
|
+
* Arguments to pass. Use `{{scenario}}` as a placeholder for the prompt.
|
|
2646
|
+
*/
|
|
2647
|
+
args: string[];
|
|
2648
|
+
/**
|
|
2649
|
+
* How to parse stdout.
|
|
2650
|
+
* @default 'stream-json'
|
|
2651
|
+
*/
|
|
2652
|
+
outputFormat?: CLIOutputFormat;
|
|
2653
|
+
/**
|
|
2654
|
+
* Timeout in milliseconds.
|
|
2655
|
+
* @default 120000 (2 minutes)
|
|
2656
|
+
*/
|
|
2657
|
+
timeout?: number;
|
|
2658
|
+
}
|
|
2511
2659
|
/**
|
|
2512
2660
|
* Configuration for MCP host simulation
|
|
2513
2661
|
*/
|
|
2514
2662
|
interface MCPHostConfig {
|
|
2515
2663
|
/**
|
|
2516
|
-
*
|
|
2664
|
+
* Host type for the simulation.
|
|
2665
|
+
*
|
|
2666
|
+
* - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
|
|
2667
|
+
* - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
|
|
2668
|
+
* - 'browser': Web-based hosts (not yet implemented).
|
|
2669
|
+
* - 'desktop': Desktop app hosts (not yet implemented).
|
|
2670
|
+
*
|
|
2671
|
+
* @default 'sdk'
|
|
2672
|
+
*/
|
|
2673
|
+
hostType?: HostType;
|
|
2674
|
+
/**
|
|
2675
|
+
* LLM provider (required for 'sdk' host type, ignored for 'cli')
|
|
2517
2676
|
*/
|
|
2518
|
-
provider
|
|
2677
|
+
provider?: LLMProvider;
|
|
2519
2678
|
/**
|
|
2520
2679
|
* Environment variable name containing the API key
|
|
2521
2680
|
*/
|
|
@@ -2538,6 +2697,10 @@ interface MCPHostConfig {
|
|
|
2538
2697
|
* @default 10
|
|
2539
2698
|
*/
|
|
2540
2699
|
maxToolCalls?: number;
|
|
2700
|
+
/**
|
|
2701
|
+
* CLI host configuration (required for 'cli' host type).
|
|
2702
|
+
*/
|
|
2703
|
+
cli?: CLIConfig;
|
|
2541
2704
|
}
|
|
2542
2705
|
/**
|
|
2543
2706
|
* A tool call made by the LLM
|
|
@@ -2709,6 +2872,42 @@ interface EvalCase {
|
|
|
2709
2872
|
*/
|
|
2710
2873
|
expect?: EvalExpectBlock;
|
|
2711
2874
|
}
|
|
2875
|
+
/**
|
|
2876
|
+
* Configuration for a single LLM-as-judge evaluation
|
|
2877
|
+
*/
|
|
2878
|
+
interface JudgeExpectConfig {
|
|
2879
|
+
/**
|
|
2880
|
+
* Name of a registered custom judge executor.
|
|
2881
|
+
* When set, the named judge handles evaluation and returns a normalized score.
|
|
2882
|
+
* The `threshold` determines pass/fail. `reps` and LLM config fields
|
|
2883
|
+
* (provider, model, etc.) are ignored.
|
|
2884
|
+
*/
|
|
2885
|
+
judge?: string;
|
|
2886
|
+
/** Built-in rubric name or custom rubric object. Required when no `judge` is specified. */
|
|
2887
|
+
rubric?: BuiltInRubric | {
|
|
2888
|
+
text: string;
|
|
2889
|
+
};
|
|
2890
|
+
/** Reference response to compare against */
|
|
2891
|
+
reference?: unknown;
|
|
2892
|
+
/** Score threshold for passing (0-1, default: 0.7) */
|
|
2893
|
+
threshold?: number;
|
|
2894
|
+
/** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
|
|
2895
|
+
reps?: number;
|
|
2896
|
+
/** Judge provider. @default 'anthropic' */
|
|
2897
|
+
provider?: 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
|
|
2898
|
+
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
2899
|
+
model?: string;
|
|
2900
|
+
/** Environment variable name for API key */
|
|
2901
|
+
apiKeyEnvVar?: string;
|
|
2902
|
+
/** Max tokens for judge response */
|
|
2903
|
+
maxTokens?: number;
|
|
2904
|
+
/** Temperature for judge LLM (0–1) */
|
|
2905
|
+
temperature?: number;
|
|
2906
|
+
/** Max budget in USD per evaluation */
|
|
2907
|
+
maxBudgetUsd?: number;
|
|
2908
|
+
/** Fail if response exceeds this size in bytes before judging */
|
|
2909
|
+
maxToolOutputSize?: number;
|
|
2910
|
+
}
|
|
2712
2911
|
/**
|
|
2713
2912
|
* Unified expectation block for eval cases
|
|
2714
2913
|
*
|
|
@@ -2748,33 +2947,11 @@ interface EvalExpectBlock {
|
|
|
2748
2947
|
isError?: boolean | string | string[];
|
|
2749
2948
|
/**
|
|
2750
2949
|
* LLM-as-judge evaluation (toPassToolJudge)
|
|
2950
|
+
*
|
|
2951
|
+
* Accepts a single judge config or an array for multi-judge evaluation.
|
|
2952
|
+
* When an array is provided, all judges must pass (AND semantics).
|
|
2751
2953
|
*/
|
|
2752
|
-
passesJudge?:
|
|
2753
|
-
/** Built-in rubric name or custom rubric object */
|
|
2754
|
-
rubric: BuiltInRubric | {
|
|
2755
|
-
text: string;
|
|
2756
|
-
};
|
|
2757
|
-
/** Reference response to compare against */
|
|
2758
|
-
reference?: unknown;
|
|
2759
|
-
/** Score threshold for passing (0-1, default: 0.7) */
|
|
2760
|
-
threshold?: number;
|
|
2761
|
-
/** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
|
|
2762
|
-
reps?: number;
|
|
2763
|
-
/** Judge provider. @default 'anthropic' */
|
|
2764
|
-
provider?: 'anthropic' | 'openai' | 'google';
|
|
2765
|
-
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
2766
|
-
model?: string;
|
|
2767
|
-
/** Environment variable name for API key */
|
|
2768
|
-
apiKeyEnvVar?: string;
|
|
2769
|
-
/** Max tokens for judge response */
|
|
2770
|
-
maxTokens?: number;
|
|
2771
|
-
/** Temperature for judge LLM (0–1) */
|
|
2772
|
-
temperature?: number;
|
|
2773
|
-
/** Max budget in USD per evaluation */
|
|
2774
|
-
maxBudgetUsd?: number;
|
|
2775
|
-
/** Fail if response exceeds this size in bytes before judging */
|
|
2776
|
-
maxToolOutputSize?: number;
|
|
2777
|
-
};
|
|
2954
|
+
passesJudge?: JudgeExpectConfig | JudgeExpectConfig[];
|
|
2778
2955
|
/**
|
|
2779
2956
|
* Response size validation (toHaveToolResponseSize)
|
|
2780
2957
|
*/
|
|
@@ -2859,7 +3036,13 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2859
3036
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2860
3037
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2861
3038
|
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2862
|
-
|
|
3039
|
+
hostType: z.ZodOptional<z.ZodEnum<{
|
|
3040
|
+
sdk: "sdk";
|
|
3041
|
+
cli: "cli";
|
|
3042
|
+
browser: "browser";
|
|
3043
|
+
desktop: "desktop";
|
|
3044
|
+
}>>;
|
|
3045
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
2863
3046
|
openai: "openai";
|
|
2864
3047
|
anthropic: "anthropic";
|
|
2865
3048
|
azure: "azure";
|
|
@@ -2869,12 +3052,21 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2869
3052
|
openrouter: "openrouter";
|
|
2870
3053
|
xai: "xai";
|
|
2871
3054
|
"vertex-anthropic": "vertex-anthropic";
|
|
2872
|
-
}
|
|
3055
|
+
}>>;
|
|
2873
3056
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2874
3057
|
model: z.ZodOptional<z.ZodString>;
|
|
2875
3058
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2876
3059
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2877
3060
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3061
|
+
cli: z.ZodOptional<z.ZodObject<{
|
|
3062
|
+
command: z.ZodString;
|
|
3063
|
+
args: z.ZodArray<z.ZodString>;
|
|
3064
|
+
outputFormat: z.ZodOptional<z.ZodEnum<{
|
|
3065
|
+
json: "json";
|
|
3066
|
+
"stream-json": "stream-json";
|
|
3067
|
+
}>>;
|
|
3068
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
3069
|
+
}, z.core.$strip>>;
|
|
2878
3070
|
}, z.core.$strip>>;
|
|
2879
3071
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2880
3072
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2901,8 +3093,9 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2901
3093
|
remove: z.ZodArray<z.ZodString>;
|
|
2902
3094
|
}, z.core.$strip>]>>>;
|
|
2903
3095
|
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
2904
|
-
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2905
|
-
|
|
3096
|
+
passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
|
|
3097
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3098
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
2906
3099
|
correctness: "correctness";
|
|
2907
3100
|
completeness: "completeness";
|
|
2908
3101
|
groundedness: "groundedness";
|
|
@@ -2910,7 +3103,7 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2910
3103
|
conciseness: "conciseness";
|
|
2911
3104
|
}>, z.ZodObject<{
|
|
2912
3105
|
text: z.ZodString;
|
|
2913
|
-
}, z.core.$strip>]
|
|
3106
|
+
}, z.core.$strip>]>>;
|
|
2914
3107
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2915
3108
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2916
3109
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2918,6 +3111,8 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2918
3111
|
openai: "openai";
|
|
2919
3112
|
anthropic: "anthropic";
|
|
2920
3113
|
google: "google";
|
|
3114
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3115
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
2921
3116
|
}>>;
|
|
2922
3117
|
model: z.ZodOptional<z.ZodString>;
|
|
2923
3118
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -2925,7 +3120,34 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2925
3120
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2926
3121
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
2927
3122
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
2928
|
-
}, z.core.$strip
|
|
3123
|
+
}, z.core.$strip>, z.ZodArray<z.ZodObject<{
|
|
3124
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3125
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
3126
|
+
correctness: "correctness";
|
|
3127
|
+
completeness: "completeness";
|
|
3128
|
+
groundedness: "groundedness";
|
|
3129
|
+
"instruction-following": "instruction-following";
|
|
3130
|
+
conciseness: "conciseness";
|
|
3131
|
+
}>, z.ZodObject<{
|
|
3132
|
+
text: z.ZodString;
|
|
3133
|
+
}, z.core.$strip>]>>;
|
|
3134
|
+
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3135
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3136
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3137
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
3138
|
+
openai: "openai";
|
|
3139
|
+
anthropic: "anthropic";
|
|
3140
|
+
google: "google";
|
|
3141
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3142
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3143
|
+
}>>;
|
|
3144
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3145
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3146
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3147
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3148
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3149
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3150
|
+
}, z.core.$strip>>]>>;
|
|
2929
3151
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2930
3152
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
2931
3153
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2966,7 +3188,13 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2966
3188
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2967
3189
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2968
3190
|
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2969
|
-
|
|
3191
|
+
hostType: z.ZodOptional<z.ZodEnum<{
|
|
3192
|
+
sdk: "sdk";
|
|
3193
|
+
cli: "cli";
|
|
3194
|
+
browser: "browser";
|
|
3195
|
+
desktop: "desktop";
|
|
3196
|
+
}>>;
|
|
3197
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
2970
3198
|
openai: "openai";
|
|
2971
3199
|
anthropic: "anthropic";
|
|
2972
3200
|
azure: "azure";
|
|
@@ -2976,12 +3204,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2976
3204
|
openrouter: "openrouter";
|
|
2977
3205
|
xai: "xai";
|
|
2978
3206
|
"vertex-anthropic": "vertex-anthropic";
|
|
2979
|
-
}
|
|
3207
|
+
}>>;
|
|
2980
3208
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2981
3209
|
model: z.ZodOptional<z.ZodString>;
|
|
2982
3210
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2983
3211
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2984
3212
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3213
|
+
cli: z.ZodOptional<z.ZodObject<{
|
|
3214
|
+
command: z.ZodString;
|
|
3215
|
+
args: z.ZodArray<z.ZodString>;
|
|
3216
|
+
outputFormat: z.ZodOptional<z.ZodEnum<{
|
|
3217
|
+
json: "json";
|
|
3218
|
+
"stream-json": "stream-json";
|
|
3219
|
+
}>>;
|
|
3220
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
3221
|
+
}, z.core.$strip>>;
|
|
2985
3222
|
}, z.core.$strip>>;
|
|
2986
3223
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2987
3224
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
@@ -3008,8 +3245,9 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3008
3245
|
remove: z.ZodArray<z.ZodString>;
|
|
3009
3246
|
}, z.core.$strip>]>>>;
|
|
3010
3247
|
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3011
|
-
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
3012
|
-
|
|
3248
|
+
passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
|
|
3249
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3250
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
3013
3251
|
correctness: "correctness";
|
|
3014
3252
|
completeness: "completeness";
|
|
3015
3253
|
groundedness: "groundedness";
|
|
@@ -3017,7 +3255,7 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3017
3255
|
conciseness: "conciseness";
|
|
3018
3256
|
}>, z.ZodObject<{
|
|
3019
3257
|
text: z.ZodString;
|
|
3020
|
-
}, z.core.$strip>]
|
|
3258
|
+
}, z.core.$strip>]>>;
|
|
3021
3259
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3022
3260
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3023
3261
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
@@ -3025,6 +3263,8 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3025
3263
|
openai: "openai";
|
|
3026
3264
|
anthropic: "anthropic";
|
|
3027
3265
|
google: "google";
|
|
3266
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3267
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3028
3268
|
}>>;
|
|
3029
3269
|
model: z.ZodOptional<z.ZodString>;
|
|
3030
3270
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
@@ -3032,7 +3272,34 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3032
3272
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3033
3273
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3034
3274
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3035
|
-
}, z.core.$strip
|
|
3275
|
+
}, z.core.$strip>, z.ZodArray<z.ZodObject<{
|
|
3276
|
+
judge: z.ZodOptional<z.ZodString>;
|
|
3277
|
+
rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
|
|
3278
|
+
correctness: "correctness";
|
|
3279
|
+
completeness: "completeness";
|
|
3280
|
+
groundedness: "groundedness";
|
|
3281
|
+
"instruction-following": "instruction-following";
|
|
3282
|
+
conciseness: "conciseness";
|
|
3283
|
+
}>, z.ZodObject<{
|
|
3284
|
+
text: z.ZodString;
|
|
3285
|
+
}, z.core.$strip>]>>;
|
|
3286
|
+
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3287
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3288
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3289
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
3290
|
+
openai: "openai";
|
|
3291
|
+
anthropic: "anthropic";
|
|
3292
|
+
google: "google";
|
|
3293
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
3294
|
+
"anthropic-agent-sdk": "anthropic-agent-sdk";
|
|
3295
|
+
}>>;
|
|
3296
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3297
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3298
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3299
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3300
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3301
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3302
|
+
}, z.core.$strip>>]>>;
|
|
3036
3303
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
3037
3304
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
3038
3305
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -3268,6 +3535,23 @@ interface IterationResult {
|
|
|
3268
3535
|
}>;
|
|
3269
3536
|
};
|
|
3270
3537
|
}
|
|
3538
|
+
/**
|
|
3539
|
+
* Request data captured from the eval case input.
|
|
3540
|
+
* Preserves what was sent so results are self-contained for debugging.
|
|
3541
|
+
*/
|
|
3542
|
+
interface EvalCaseRequest {
|
|
3543
|
+
/** Human-readable description of the case */
|
|
3544
|
+
description?: string;
|
|
3545
|
+
/** Tool arguments (direct mode) */
|
|
3546
|
+
args?: Record<string, unknown>;
|
|
3547
|
+
/** Natural language scenario sent to the LLM (mcp_host mode) */
|
|
3548
|
+
scenario?: string;
|
|
3549
|
+
/** LLM provider/model configuration (mcp_host mode) */
|
|
3550
|
+
mcpHostConfig?: {
|
|
3551
|
+
provider?: string;
|
|
3552
|
+
model?: string;
|
|
3553
|
+
};
|
|
3554
|
+
}
|
|
3271
3555
|
/**
|
|
3272
3556
|
* Result of a single eval case
|
|
3273
3557
|
*/
|
|
@@ -3292,6 +3576,11 @@ interface EvalCaseResult {
|
|
|
3292
3576
|
* Overall pass/fail status
|
|
3293
3577
|
*/
|
|
3294
3578
|
pass: boolean;
|
|
3579
|
+
/**
|
|
3580
|
+
* Request data from the eval case input (tool args, scenario, LLM config).
|
|
3581
|
+
* Populated so results are self-contained for debugging without the original dataset.
|
|
3582
|
+
*/
|
|
3583
|
+
request?: EvalCaseRequest;
|
|
3295
3584
|
/**
|
|
3296
3585
|
* Tool response
|
|
3297
3586
|
*/
|
|
@@ -3835,24 +4124,31 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
3835
4124
|
* schemas, testing discoverability and parameter clarity at the level a real
|
|
3836
4125
|
* user (via Claude Desktop, ChatGPT, etc.) would experience.
|
|
3837
4126
|
*
|
|
3838
|
-
*
|
|
3839
|
-
* which handles multi-turn tool calling natively and provides per-step latency
|
|
3840
|
-
* decomposition (llmDurationMs vs. mcpDurationMs).
|
|
3841
|
-
*
|
|
3842
|
-
* @param mcp - MCP fixture API
|
|
4127
|
+
* @param mcp - MCP fixture API (used by SDK hosts; ignored by CLI/browser hosts which establish their own connections)
|
|
3843
4128
|
* @param scenario - Natural language prompt describing what the LLM should do
|
|
3844
4129
|
* @param config - MCP host configuration (provider, model, temperature, etc.)
|
|
3845
4130
|
* @returns Simulation result with tool calls, final response, and latency data
|
|
3846
4131
|
*
|
|
3847
4132
|
* @example
|
|
3848
4133
|
* ```typescript
|
|
4134
|
+
* // SDK host (default) — uses the framework's existing MCP connection
|
|
3849
4135
|
* const result = await simulateMCPHost(mcp,
|
|
3850
4136
|
* "Find recent documents about MCP testing frameworks",
|
|
3851
4137
|
* { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
|
|
3852
4138
|
* );
|
|
3853
4139
|
*
|
|
3854
|
-
*
|
|
3855
|
-
*
|
|
4140
|
+
* // CLI host — spawns a CLI process with its own MCP connection
|
|
4141
|
+
* const result = await simulateMCPHost(mcp,
|
|
4142
|
+
* "Find recent documents about MCP testing frameworks",
|
|
4143
|
+
* {
|
|
4144
|
+
* hostType: 'cli',
|
|
4145
|
+
* provider: 'anthropic',
|
|
4146
|
+
* cli: {
|
|
4147
|
+
* command: 'claude',
|
|
4148
|
+
* args: ['-p', '{{scenario}}', '--output-format', 'stream-json', '--verbose'],
|
|
4149
|
+
* },
|
|
4150
|
+
* }
|
|
4151
|
+
* );
|
|
3856
4152
|
* ```
|
|
3857
4153
|
*/
|
|
3858
4154
|
declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
|
|
@@ -3905,6 +4201,99 @@ declare function getMissingDependencyMessage(provider: LLMProvider): string;
|
|
|
3905
4201
|
*/
|
|
3906
4202
|
declare function createJudge(config?: JudgeConfig): Judge;
|
|
3907
4203
|
|
|
4204
|
+
/**
|
|
4205
|
+
* Custom Judge Registry
|
|
4206
|
+
*
|
|
4207
|
+
* Allows consumers to register named judge executors that can be referenced
|
|
4208
|
+
* by string ID in eval fixtures and programmatic tests. This enables
|
|
4209
|
+
* multi-step judge pipelines (LLM call + post-processing), custom scoring
|
|
4210
|
+
* logic, and reusable judge configurations without duplicating rubrics.
|
|
4211
|
+
*/
|
|
4212
|
+
/**
|
|
4213
|
+
* Result returned by a custom judge executor.
|
|
4214
|
+
*
|
|
4215
|
+
* Custom judges must return a normalized score (0–1). The framework applies
|
|
4216
|
+
* the caller's `threshold` (default 0.7) to determine pass/fail. This keeps
|
|
4217
|
+
* judges reusable — the same judge can be used with different thresholds in
|
|
4218
|
+
* different tests.
|
|
4219
|
+
*/
|
|
4220
|
+
interface CustomJudgeResult {
|
|
4221
|
+
/** Normalized score (0–1, where 1 is best) */
|
|
4222
|
+
score: number;
|
|
4223
|
+
/** Optional reasoning/explanation */
|
|
4224
|
+
reasoning?: string;
|
|
4225
|
+
}
|
|
4226
|
+
/**
|
|
4227
|
+
* A user-defined judge executor function.
|
|
4228
|
+
*
|
|
4229
|
+
* Custom executors own their entire evaluation pipeline — prompt construction,
|
|
4230
|
+
* LLM calls, and post-processing — but return a normalized score. The framework
|
|
4231
|
+
* determines pass/fail by comparing the score against the caller's threshold.
|
|
4232
|
+
*
|
|
4233
|
+
* @param candidate - The actual response to evaluate
|
|
4234
|
+
* @param reference - Optional reference/expected response
|
|
4235
|
+
* @returns Evaluation result with a normalized score and optional reasoning
|
|
4236
|
+
*
|
|
4237
|
+
* @example
|
|
4238
|
+
* ```typescript
|
|
4239
|
+
* const completenessJudge: CustomJudgeExecutor = async (candidate, reference) => {
|
|
4240
|
+
* // Step 1: LLM call with your own prompt and schema
|
|
4241
|
+
* const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
|
|
4242
|
+
* const { verdict, reasoning } = JSON.parse(llmResult);
|
|
4243
|
+
*
|
|
4244
|
+
* // Step 2: Deterministic post-processing into a normalized score
|
|
4245
|
+
* const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
|
|
4246
|
+
*
|
|
4247
|
+
* return { score, reasoning };
|
|
4248
|
+
* };
|
|
4249
|
+
* ```
|
|
4250
|
+
*/
|
|
4251
|
+
type CustomJudgeExecutor = (candidate: unknown, reference?: unknown) => Promise<CustomJudgeResult>;
|
|
4252
|
+
/**
|
|
4253
|
+
* Registers a named custom judge executor.
|
|
4254
|
+
*
|
|
4255
|
+
* Call this in your test setup (e.g., `playwright.config.ts` or a global setup file)
|
|
4256
|
+
* before tests run. The name can then be referenced in JSON eval fixtures via the
|
|
4257
|
+
* `judge` field on `passesJudge`.
|
|
4258
|
+
*
|
|
4259
|
+
* @param name - Unique identifier for the judge
|
|
4260
|
+
* @param executor - The judge executor function
|
|
4261
|
+
* @throws {Error} If a judge with the same name is already registered
|
|
4262
|
+
*
|
|
4263
|
+
* @example
|
|
4264
|
+
* ```typescript
|
|
4265
|
+
* import { registerJudge } from '@gleanwork/mcp-server-tester';
|
|
4266
|
+
*
|
|
4267
|
+
* registerJudge('glean-completeness', async (candidate, reference) => {
|
|
4268
|
+
* // Step 1: LLM call with your own prompt and schema
|
|
4269
|
+
* const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
|
|
4270
|
+
* const { verdict, reasoning } = JSON.parse(llmResult);
|
|
4271
|
+
*
|
|
4272
|
+
* // Step 2: Deterministic post-processing into a normalized score
|
|
4273
|
+
* const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
|
|
4274
|
+
*
|
|
4275
|
+
* return { score, reasoning };
|
|
4276
|
+
* });
|
|
4277
|
+
*
|
|
4278
|
+
* // Then in tests — same judge, different thresholds:
|
|
4279
|
+
* // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.8 });
|
|
4280
|
+
* // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.5 });
|
|
4281
|
+
* ```
|
|
4282
|
+
*/
|
|
4283
|
+
declare function registerJudge(name: string, executor: CustomJudgeExecutor): void;
|
|
4284
|
+
/**
|
|
4285
|
+
* Retrieves a registered custom judge executor by name.
|
|
4286
|
+
*
|
|
4287
|
+
* @param name - The judge name to look up
|
|
4288
|
+
* @returns The registered executor
|
|
4289
|
+
* @throws {Error} If no judge with the given name is registered
|
|
4290
|
+
*/
|
|
4291
|
+
declare function getRegisteredJudge(name: string): CustomJudgeExecutor;
|
|
4292
|
+
/**
|
|
4293
|
+
* Clears all registered judges. Intended for test teardown.
|
|
4294
|
+
*/
|
|
4295
|
+
declare function clearJudgeRegistry(): void;
|
|
4296
|
+
|
|
3908
4297
|
/**
|
|
3909
4298
|
* Options for conformance checks
|
|
3910
4299
|
*/
|
|
@@ -4066,4 +4455,4 @@ interface MCPEvalReporterConfig {
|
|
|
4066
4455
|
includeAutoTracking?: boolean;
|
|
4067
4456
|
}
|
|
4068
4457
|
|
|
4069
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
4458
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|