@gleanwork/mcp-server-tester 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +421 -0
- package/dist/cli/index.js +2785 -0
- package/dist/fixtures/mcp.d.ts +605 -0
- package/dist/fixtures/mcp.js +2378 -0
- package/dist/fixtures/mcp.js.map +1 -0
- package/dist/fixtures/mcpAuth.d.ts +31 -0
- package/dist/fixtures/mcpAuth.js +317 -0
- package/dist/fixtures/mcpAuth.js.map +1 -0
- package/dist/index.cjs +3658 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3857 -0
- package/dist/index.d.ts +3857 -0
- package/dist/index.js +3582 -0
- package/dist/index.js.map +1 -0
- package/dist/reporters/mcpReporter.cjs +301 -0
- package/dist/reporters/mcpReporter.cjs.map +1 -0
- package/dist/reporters/mcpReporter.d.cts +85 -0
- package/dist/reporters/mcpReporter.d.ts +85 -0
- package/dist/reporters/mcpReporter.js +297 -0
- package/dist/reporters/mcpReporter.js.map +1 -0
- package/dist/reporters/ui-dist/app.js +174 -0
- package/dist/reporters/ui-dist/index.html +28 -0
- package/dist/reporters/ui-dist/styles.css +1 -0
- package/package.json +138 -0
- package/src/reporters/ui-dist/app.js +174 -0
- package/src/reporters/ui-dist/index.html +28 -0
- package/src/reporters/ui-dist/styles.css +1 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,3857 @@
|
|
|
1
|
+
import { z, ZodType } from 'zod';
|
|
2
|
+
import { OAuthClientProvider } from '@modelcontextprotocol/sdk/client/auth.js';
|
|
3
|
+
import { OAuthClientMetadata, OAuthClientInformationFull, OAuthTokens } from '@modelcontextprotocol/sdk/shared/auth.js';
|
|
4
|
+
import * as oauth from 'oauth4webapi';
|
|
5
|
+
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
6
|
+
import { CallToolResult, Tool, Implementation, ServerCapabilities, Resource, Prompt } from '@modelcontextprotocol/sdk/types.js';
|
|
7
|
+
import { TestInfo, Expect } from '@playwright/test';
|
|
8
|
+
import * as playwright_test from 'playwright/test';
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* OAuth configuration for MCP authentication
|
|
12
|
+
*/
|
|
13
|
+
interface MCPOAuthConfig {
|
|
14
|
+
/**
|
|
15
|
+
* OAuth authorization server metadata URL
|
|
16
|
+
* (e.g., https://auth.example.com/.well-known/oauth-authorization-server)
|
|
17
|
+
*/
|
|
18
|
+
serverUrl: string;
|
|
19
|
+
/**
|
|
20
|
+
* Scopes to request during authorization
|
|
21
|
+
*/
|
|
22
|
+
scopes?: Array<string>;
|
|
23
|
+
/**
|
|
24
|
+
* Resource indicator (RFC 8707, required by MCP 2025-06-18 spec)
|
|
25
|
+
*/
|
|
26
|
+
resource?: string;
|
|
27
|
+
/**
|
|
28
|
+
* Path to Playwright auth state file
|
|
29
|
+
* (e.g., playwright/.auth/oauth-state.json)
|
|
30
|
+
*/
|
|
31
|
+
authStatePath?: string;
|
|
32
|
+
/**
|
|
33
|
+
* Client ID (if pre-registered; otherwise uses Dynamic Client Registration)
|
|
34
|
+
*/
|
|
35
|
+
clientId?: string;
|
|
36
|
+
/**
|
|
37
|
+
* Client secret (for confidential clients)
|
|
38
|
+
*/
|
|
39
|
+
clientSecret?: string;
|
|
40
|
+
/**
|
|
41
|
+
* Redirect URI for OAuth callback
|
|
42
|
+
*/
|
|
43
|
+
redirectUri?: string;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Authentication configuration for MCP connections
|
|
47
|
+
*/
|
|
48
|
+
interface MCPAuthConfig {
|
|
49
|
+
/**
|
|
50
|
+
* Pre-acquired access token (simplest authentication mode)
|
|
51
|
+
*/
|
|
52
|
+
accessToken?: string;
|
|
53
|
+
/**
|
|
54
|
+
* Full OAuth configuration for browser-based authentication
|
|
55
|
+
*/
|
|
56
|
+
oauth?: MCPOAuthConfig;
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* MCP host capabilities that can be registered with the server
|
|
60
|
+
*/
|
|
61
|
+
interface MCPHostCapabilities {
|
|
62
|
+
/**
|
|
63
|
+
* Sampling capabilities (for LLM sampling)
|
|
64
|
+
*/
|
|
65
|
+
sampling?: Record<string, unknown>;
|
|
66
|
+
/**
|
|
67
|
+
* Roots capabilities (for file system roots)
|
|
68
|
+
*/
|
|
69
|
+
roots?: {
|
|
70
|
+
/**
|
|
71
|
+
* Whether the client can notify the server when roots change
|
|
72
|
+
*/
|
|
73
|
+
listChanged: boolean;
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Configuration for MCP client connection
|
|
78
|
+
*
|
|
79
|
+
* Supports both stdio (local) and HTTP (remote) transports
|
|
80
|
+
*/
|
|
81
|
+
interface MCPConfig {
|
|
82
|
+
/**
|
|
83
|
+
* Transport type
|
|
84
|
+
*/
|
|
85
|
+
transport: 'http' | 'stdio';
|
|
86
|
+
/**
|
|
87
|
+
* Server URL (required when transport === 'http')
|
|
88
|
+
*/
|
|
89
|
+
serverUrl?: string;
|
|
90
|
+
/**
|
|
91
|
+
* HTTP headers (optional for http transport, e.g., Authorization)
|
|
92
|
+
*/
|
|
93
|
+
headers?: Record<string, string>;
|
|
94
|
+
/**
|
|
95
|
+
* Command to execute (required when transport === 'stdio')
|
|
96
|
+
*/
|
|
97
|
+
command?: string;
|
|
98
|
+
/**
|
|
99
|
+
* Command arguments (optional for stdio)
|
|
100
|
+
*/
|
|
101
|
+
args?: Array<string>;
|
|
102
|
+
/**
|
|
103
|
+
* Working directory for the command (optional for stdio)
|
|
104
|
+
*/
|
|
105
|
+
cwd?: string;
|
|
106
|
+
/**
|
|
107
|
+
* Host capabilities to register with the server
|
|
108
|
+
*/
|
|
109
|
+
capabilities?: MCPHostCapabilities;
|
|
110
|
+
/**
|
|
111
|
+
* Connection timeout in milliseconds
|
|
112
|
+
*/
|
|
113
|
+
connectTimeoutMs?: number;
|
|
114
|
+
/**
|
|
115
|
+
* Request timeout in milliseconds
|
|
116
|
+
*/
|
|
117
|
+
requestTimeoutMs?: number;
|
|
118
|
+
/**
|
|
119
|
+
* Suppress stderr output from the server process (stdio only)
|
|
120
|
+
* When true, server stderr is ignored instead of inherited
|
|
121
|
+
*/
|
|
122
|
+
quiet?: boolean;
|
|
123
|
+
/**
|
|
124
|
+
* Authentication configuration (optional for http transport)
|
|
125
|
+
*/
|
|
126
|
+
auth?: MCPAuthConfig;
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Union schema for MCPConfig (validates based on transport type)
|
|
130
|
+
*/
|
|
131
|
+
declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject<{
|
|
132
|
+
transport: z.ZodLiteral<"stdio">;
|
|
133
|
+
command: z.ZodString;
|
|
134
|
+
args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
135
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
136
|
+
capabilities: z.ZodOptional<z.ZodObject<{
|
|
137
|
+
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
138
|
+
roots: z.ZodOptional<z.ZodObject<{
|
|
139
|
+
listChanged: z.ZodBoolean;
|
|
140
|
+
}, "strip", z.ZodTypeAny, {
|
|
141
|
+
listChanged: boolean;
|
|
142
|
+
}, {
|
|
143
|
+
listChanged: boolean;
|
|
144
|
+
}>>;
|
|
145
|
+
}, "strip", z.ZodTypeAny, {
|
|
146
|
+
sampling?: Record<string, unknown> | undefined;
|
|
147
|
+
roots?: {
|
|
148
|
+
listChanged: boolean;
|
|
149
|
+
} | undefined;
|
|
150
|
+
}, {
|
|
151
|
+
sampling?: Record<string, unknown> | undefined;
|
|
152
|
+
roots?: {
|
|
153
|
+
listChanged: boolean;
|
|
154
|
+
} | undefined;
|
|
155
|
+
}>>;
|
|
156
|
+
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
157
|
+
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
158
|
+
quiet: z.ZodOptional<z.ZodBoolean>;
|
|
159
|
+
}, "strip", z.ZodTypeAny, {
|
|
160
|
+
transport: "stdio";
|
|
161
|
+
command: string;
|
|
162
|
+
args?: string[] | undefined;
|
|
163
|
+
cwd?: string | undefined;
|
|
164
|
+
capabilities?: {
|
|
165
|
+
sampling?: Record<string, unknown> | undefined;
|
|
166
|
+
roots?: {
|
|
167
|
+
listChanged: boolean;
|
|
168
|
+
} | undefined;
|
|
169
|
+
} | undefined;
|
|
170
|
+
connectTimeoutMs?: number | undefined;
|
|
171
|
+
requestTimeoutMs?: number | undefined;
|
|
172
|
+
quiet?: boolean | undefined;
|
|
173
|
+
}, {
|
|
174
|
+
transport: "stdio";
|
|
175
|
+
command: string;
|
|
176
|
+
args?: string[] | undefined;
|
|
177
|
+
cwd?: string | undefined;
|
|
178
|
+
capabilities?: {
|
|
179
|
+
sampling?: Record<string, unknown> | undefined;
|
|
180
|
+
roots?: {
|
|
181
|
+
listChanged: boolean;
|
|
182
|
+
} | undefined;
|
|
183
|
+
} | undefined;
|
|
184
|
+
connectTimeoutMs?: number | undefined;
|
|
185
|
+
requestTimeoutMs?: number | undefined;
|
|
186
|
+
quiet?: boolean | undefined;
|
|
187
|
+
}>, z.ZodObject<{
|
|
188
|
+
transport: z.ZodLiteral<"http">;
|
|
189
|
+
serverUrl: z.ZodString;
|
|
190
|
+
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
191
|
+
capabilities: z.ZodOptional<z.ZodObject<{
|
|
192
|
+
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
193
|
+
roots: z.ZodOptional<z.ZodObject<{
|
|
194
|
+
listChanged: z.ZodBoolean;
|
|
195
|
+
}, "strip", z.ZodTypeAny, {
|
|
196
|
+
listChanged: boolean;
|
|
197
|
+
}, {
|
|
198
|
+
listChanged: boolean;
|
|
199
|
+
}>>;
|
|
200
|
+
}, "strip", z.ZodTypeAny, {
|
|
201
|
+
sampling?: Record<string, unknown> | undefined;
|
|
202
|
+
roots?: {
|
|
203
|
+
listChanged: boolean;
|
|
204
|
+
} | undefined;
|
|
205
|
+
}, {
|
|
206
|
+
sampling?: Record<string, unknown> | undefined;
|
|
207
|
+
roots?: {
|
|
208
|
+
listChanged: boolean;
|
|
209
|
+
} | undefined;
|
|
210
|
+
}>>;
|
|
211
|
+
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
212
|
+
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
213
|
+
auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
|
|
214
|
+
accessToken: z.ZodOptional<z.ZodString>;
|
|
215
|
+
oauth: z.ZodOptional<z.ZodObject<{
|
|
216
|
+
serverUrl: z.ZodString;
|
|
217
|
+
scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
218
|
+
resource: z.ZodOptional<z.ZodString>;
|
|
219
|
+
authStatePath: z.ZodOptional<z.ZodString>;
|
|
220
|
+
clientId: z.ZodOptional<z.ZodString>;
|
|
221
|
+
clientSecret: z.ZodOptional<z.ZodString>;
|
|
222
|
+
redirectUri: z.ZodOptional<z.ZodString>;
|
|
223
|
+
}, "strip", z.ZodTypeAny, {
|
|
224
|
+
serverUrl: string;
|
|
225
|
+
scopes?: string[] | undefined;
|
|
226
|
+
resource?: string | undefined;
|
|
227
|
+
authStatePath?: string | undefined;
|
|
228
|
+
clientId?: string | undefined;
|
|
229
|
+
clientSecret?: string | undefined;
|
|
230
|
+
redirectUri?: string | undefined;
|
|
231
|
+
}, {
|
|
232
|
+
serverUrl: string;
|
|
233
|
+
scopes?: string[] | undefined;
|
|
234
|
+
resource?: string | undefined;
|
|
235
|
+
authStatePath?: string | undefined;
|
|
236
|
+
clientId?: string | undefined;
|
|
237
|
+
clientSecret?: string | undefined;
|
|
238
|
+
redirectUri?: string | undefined;
|
|
239
|
+
}>>;
|
|
240
|
+
}, "strip", z.ZodTypeAny, {
|
|
241
|
+
accessToken?: string | undefined;
|
|
242
|
+
oauth?: {
|
|
243
|
+
serverUrl: string;
|
|
244
|
+
scopes?: string[] | undefined;
|
|
245
|
+
resource?: string | undefined;
|
|
246
|
+
authStatePath?: string | undefined;
|
|
247
|
+
clientId?: string | undefined;
|
|
248
|
+
clientSecret?: string | undefined;
|
|
249
|
+
redirectUri?: string | undefined;
|
|
250
|
+
} | undefined;
|
|
251
|
+
}, {
|
|
252
|
+
accessToken?: string | undefined;
|
|
253
|
+
oauth?: {
|
|
254
|
+
serverUrl: string;
|
|
255
|
+
scopes?: string[] | undefined;
|
|
256
|
+
resource?: string | undefined;
|
|
257
|
+
authStatePath?: string | undefined;
|
|
258
|
+
clientId?: string | undefined;
|
|
259
|
+
clientSecret?: string | undefined;
|
|
260
|
+
redirectUri?: string | undefined;
|
|
261
|
+
} | undefined;
|
|
262
|
+
}>, {
|
|
263
|
+
accessToken?: string | undefined;
|
|
264
|
+
oauth?: {
|
|
265
|
+
serverUrl: string;
|
|
266
|
+
scopes?: string[] | undefined;
|
|
267
|
+
resource?: string | undefined;
|
|
268
|
+
authStatePath?: string | undefined;
|
|
269
|
+
clientId?: string | undefined;
|
|
270
|
+
clientSecret?: string | undefined;
|
|
271
|
+
redirectUri?: string | undefined;
|
|
272
|
+
} | undefined;
|
|
273
|
+
}, {
|
|
274
|
+
accessToken?: string | undefined;
|
|
275
|
+
oauth?: {
|
|
276
|
+
serverUrl: string;
|
|
277
|
+
scopes?: string[] | undefined;
|
|
278
|
+
resource?: string | undefined;
|
|
279
|
+
authStatePath?: string | undefined;
|
|
280
|
+
clientId?: string | undefined;
|
|
281
|
+
clientSecret?: string | undefined;
|
|
282
|
+
redirectUri?: string | undefined;
|
|
283
|
+
} | undefined;
|
|
284
|
+
}>>;
|
|
285
|
+
}, "strip", z.ZodTypeAny, {
|
|
286
|
+
serverUrl: string;
|
|
287
|
+
transport: "http";
|
|
288
|
+
capabilities?: {
|
|
289
|
+
sampling?: Record<string, unknown> | undefined;
|
|
290
|
+
roots?: {
|
|
291
|
+
listChanged: boolean;
|
|
292
|
+
} | undefined;
|
|
293
|
+
} | undefined;
|
|
294
|
+
connectTimeoutMs?: number | undefined;
|
|
295
|
+
requestTimeoutMs?: number | undefined;
|
|
296
|
+
headers?: Record<string, string> | undefined;
|
|
297
|
+
auth?: {
|
|
298
|
+
accessToken?: string | undefined;
|
|
299
|
+
oauth?: {
|
|
300
|
+
serverUrl: string;
|
|
301
|
+
scopes?: string[] | undefined;
|
|
302
|
+
resource?: string | undefined;
|
|
303
|
+
authStatePath?: string | undefined;
|
|
304
|
+
clientId?: string | undefined;
|
|
305
|
+
clientSecret?: string | undefined;
|
|
306
|
+
redirectUri?: string | undefined;
|
|
307
|
+
} | undefined;
|
|
308
|
+
} | undefined;
|
|
309
|
+
}, {
|
|
310
|
+
serverUrl: string;
|
|
311
|
+
transport: "http";
|
|
312
|
+
capabilities?: {
|
|
313
|
+
sampling?: Record<string, unknown> | undefined;
|
|
314
|
+
roots?: {
|
|
315
|
+
listChanged: boolean;
|
|
316
|
+
} | undefined;
|
|
317
|
+
} | undefined;
|
|
318
|
+
connectTimeoutMs?: number | undefined;
|
|
319
|
+
requestTimeoutMs?: number | undefined;
|
|
320
|
+
headers?: Record<string, string> | undefined;
|
|
321
|
+
auth?: {
|
|
322
|
+
accessToken?: string | undefined;
|
|
323
|
+
oauth?: {
|
|
324
|
+
serverUrl: string;
|
|
325
|
+
scopes?: string[] | undefined;
|
|
326
|
+
resource?: string | undefined;
|
|
327
|
+
authStatePath?: string | undefined;
|
|
328
|
+
clientId?: string | undefined;
|
|
329
|
+
clientSecret?: string | undefined;
|
|
330
|
+
redirectUri?: string | undefined;
|
|
331
|
+
} | undefined;
|
|
332
|
+
} | undefined;
|
|
333
|
+
}>]>;
|
|
334
|
+
/**
|
|
335
|
+
* Validates an MCPConfig object
|
|
336
|
+
*
|
|
337
|
+
* @param config - The config to validate
|
|
338
|
+
* @returns The validated config
|
|
339
|
+
* @throws {z.ZodError} If validation fails
|
|
340
|
+
*/
|
|
341
|
+
declare function validateMCPConfig(config: unknown): MCPConfig;
|
|
342
|
+
/**
|
|
343
|
+
* Type guard to check if a config is for stdio transport
|
|
344
|
+
*/
|
|
345
|
+
declare function isStdioConfig(config: MCPConfig): config is MCPConfig & {
|
|
346
|
+
transport: 'stdio';
|
|
347
|
+
command: string;
|
|
348
|
+
};
|
|
349
|
+
/**
|
|
350
|
+
* Type guard to check if a config is for HTTP transport
|
|
351
|
+
*/
|
|
352
|
+
declare function isHttpConfig(config: MCPConfig): config is MCPConfig & {
|
|
353
|
+
transport: 'http';
|
|
354
|
+
serverUrl: string;
|
|
355
|
+
};
|
|
356
|
+
|
|
357
|
+
/**
|
|
358
|
+
* Auth types for MCP OAuth integration
|
|
359
|
+
*/
|
|
360
|
+
/**
|
|
361
|
+
* Stored OAuth tokens
|
|
362
|
+
*/
|
|
363
|
+
interface StoredTokens {
|
|
364
|
+
/**
|
|
365
|
+
* OAuth access token
|
|
366
|
+
*/
|
|
367
|
+
accessToken: string;
|
|
368
|
+
/**
|
|
369
|
+
* OAuth refresh token (if provided)
|
|
370
|
+
*/
|
|
371
|
+
refreshToken?: string;
|
|
372
|
+
/**
|
|
373
|
+
* Token expiration timestamp (Unix milliseconds)
|
|
374
|
+
*/
|
|
375
|
+
expiresAt?: number;
|
|
376
|
+
/**
|
|
377
|
+
* Token type (typically "Bearer")
|
|
378
|
+
*/
|
|
379
|
+
tokenType: string;
|
|
380
|
+
/**
|
|
381
|
+
* Client ID that was used to obtain these tokens.
|
|
382
|
+
* Required for token refresh since refresh tokens are bound to the client.
|
|
383
|
+
*/
|
|
384
|
+
clientId?: string;
|
|
385
|
+
}
|
|
386
|
+
/**
|
|
387
|
+
* Stored client information from Dynamic Client Registration
|
|
388
|
+
*/
|
|
389
|
+
interface StoredClientInfo {
|
|
390
|
+
/**
|
|
391
|
+
* Client ID from DCR
|
|
392
|
+
*/
|
|
393
|
+
clientId: string;
|
|
394
|
+
/**
|
|
395
|
+
* Client secret from DCR (for confidential clients)
|
|
396
|
+
*/
|
|
397
|
+
clientSecret?: string;
|
|
398
|
+
/**
|
|
399
|
+
* Timestamp at which the client ID was issued
|
|
400
|
+
*/
|
|
401
|
+
clientIdIssuedAt?: number;
|
|
402
|
+
/**
|
|
403
|
+
* Client secret expiration timestamp
|
|
404
|
+
*/
|
|
405
|
+
clientSecretExpiresAt?: number;
|
|
406
|
+
}
|
|
407
|
+
/**
|
|
408
|
+
* Complete OAuth state persisted to disk for Playwright auth state pattern
|
|
409
|
+
*/
|
|
410
|
+
interface StoredOAuthState {
|
|
411
|
+
/**
|
|
412
|
+
* OAuth tokens
|
|
413
|
+
*/
|
|
414
|
+
tokens?: StoredTokens;
|
|
415
|
+
/**
|
|
416
|
+
* DCR client information
|
|
417
|
+
*/
|
|
418
|
+
clientInfo?: StoredClientInfo;
|
|
419
|
+
/**
|
|
420
|
+
* PKCE code verifier (used during authorization flow)
|
|
421
|
+
*/
|
|
422
|
+
codeVerifier?: string;
|
|
423
|
+
/**
|
|
424
|
+
* OAuth state parameter (for CSRF protection)
|
|
425
|
+
*/
|
|
426
|
+
state?: string;
|
|
427
|
+
/**
|
|
428
|
+
* Timestamp when this state was saved
|
|
429
|
+
*/
|
|
430
|
+
savedAt: number;
|
|
431
|
+
}
|
|
432
|
+
/**
|
|
433
|
+
* Configuration for OAuth setup flow
|
|
434
|
+
*/
|
|
435
|
+
interface OAuthSetupConfig {
|
|
436
|
+
/**
|
|
437
|
+
* OAuth authorization server metadata URL
|
|
438
|
+
*/
|
|
439
|
+
authServerUrl: string;
|
|
440
|
+
/**
|
|
441
|
+
* Scopes to request
|
|
442
|
+
*/
|
|
443
|
+
scopes: Array<string>;
|
|
444
|
+
/**
|
|
445
|
+
* Resource indicator (RFC 8707)
|
|
446
|
+
*/
|
|
447
|
+
resource?: string;
|
|
448
|
+
/**
|
|
449
|
+
* Login form selectors for automation
|
|
450
|
+
*/
|
|
451
|
+
loginSelectors: {
|
|
452
|
+
/**
|
|
453
|
+
* Selector for username/email input field
|
|
454
|
+
*/
|
|
455
|
+
usernameInput: string;
|
|
456
|
+
/**
|
|
457
|
+
* Selector for password input field
|
|
458
|
+
*/
|
|
459
|
+
passwordInput: string;
|
|
460
|
+
/**
|
|
461
|
+
* Selector for login submit button
|
|
462
|
+
*/
|
|
463
|
+
submitButton: string;
|
|
464
|
+
/**
|
|
465
|
+
* Selector for consent/authorize button (optional)
|
|
466
|
+
*/
|
|
467
|
+
consentButton?: string;
|
|
468
|
+
};
|
|
469
|
+
/**
|
|
470
|
+
* Test user credentials
|
|
471
|
+
*/
|
|
472
|
+
credentials: {
|
|
473
|
+
username: string;
|
|
474
|
+
password: string;
|
|
475
|
+
};
|
|
476
|
+
/**
|
|
477
|
+
* Path to save OAuth state file
|
|
478
|
+
*/
|
|
479
|
+
outputPath: string;
|
|
480
|
+
/**
|
|
481
|
+
* Pre-registered client ID (optional, uses DCR if not provided)
|
|
482
|
+
*/
|
|
483
|
+
clientId?: string;
|
|
484
|
+
/**
|
|
485
|
+
* Pre-registered client secret (optional)
|
|
486
|
+
*/
|
|
487
|
+
clientSecret?: string;
|
|
488
|
+
/**
|
|
489
|
+
* Redirect URI for OAuth callback
|
|
490
|
+
*/
|
|
491
|
+
redirectUri?: string;
|
|
492
|
+
/**
|
|
493
|
+
* Timeout for login flow in milliseconds (default: 30000)
|
|
494
|
+
*/
|
|
495
|
+
timeoutMs?: number;
|
|
496
|
+
}
|
|
497
|
+
/**
|
|
498
|
+
* Result of token exchange or refresh
|
|
499
|
+
*/
|
|
500
|
+
interface TokenResult {
|
|
501
|
+
/**
|
|
502
|
+
* Access token
|
|
503
|
+
*/
|
|
504
|
+
accessToken: string;
|
|
505
|
+
/**
|
|
506
|
+
* Token type (typically "Bearer")
|
|
507
|
+
*/
|
|
508
|
+
tokenType: string;
|
|
509
|
+
/**
|
|
510
|
+
* Lifetime of the access token, in seconds
|
|
511
|
+
*/
|
|
512
|
+
expiresIn?: number;
|
|
513
|
+
/**
|
|
514
|
+
* Refresh token (if provided)
|
|
515
|
+
*/
|
|
516
|
+
refreshToken?: string;
|
|
517
|
+
/**
|
|
518
|
+
* Granted scopes (space-separated)
|
|
519
|
+
*/
|
|
520
|
+
scope?: string;
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
/**
|
|
524
|
+
* OAuth client provider implementation for MCP SDK
|
|
525
|
+
*
|
|
526
|
+
* Implements the MCP SDK's OAuthClientProvider interface using file-based storage
|
|
527
|
+
* for integration with Playwright's auth state pattern.
|
|
528
|
+
*/
|
|
529
|
+
|
|
530
|
+
/**
|
|
531
|
+
* Configuration for the Playwright OAuth client provider
|
|
532
|
+
*/
|
|
533
|
+
interface PlaywrightOAuthClientProviderConfig {
|
|
534
|
+
/**
|
|
535
|
+
* Path to the auth state file (e.g., playwright/.auth/oauth-state.json)
|
|
536
|
+
*/
|
|
537
|
+
storagePath: string;
|
|
538
|
+
/**
|
|
539
|
+
* OAuth redirect URI for callback
|
|
540
|
+
*/
|
|
541
|
+
redirectUri: string;
|
|
542
|
+
/**
|
|
543
|
+
* Client metadata for DCR or display
|
|
544
|
+
*/
|
|
545
|
+
clientMetadata?: Partial<OAuthClientMetadata>;
|
|
546
|
+
/**
|
|
547
|
+
* Pre-registered client ID (if not using DCR)
|
|
548
|
+
*/
|
|
549
|
+
clientId?: string;
|
|
550
|
+
/**
|
|
551
|
+
* Pre-registered client secret (if not using DCR)
|
|
552
|
+
*/
|
|
553
|
+
clientSecret?: string;
|
|
554
|
+
}
|
|
555
|
+
/**
|
|
556
|
+
* OAuth client provider that implements the MCP SDK's OAuthClientProvider interface
|
|
557
|
+
*
|
|
558
|
+
* Uses file-based storage for integration with Playwright's auth state pattern.
|
|
559
|
+
* Auth state is persisted to disk so it can be reused across test runs.
|
|
560
|
+
*
|
|
561
|
+
* @example
|
|
562
|
+
* ```typescript
|
|
563
|
+
* const provider = new PlaywrightOAuthClientProvider({
|
|
564
|
+
* storagePath: 'playwright/.auth/oauth-state.json',
|
|
565
|
+
* redirectUri: 'http://localhost:3000/callback',
|
|
566
|
+
* });
|
|
567
|
+
*
|
|
568
|
+
* const transport = new StreamableHTTPClientTransport(serverUrl, {
|
|
569
|
+
* authProvider: provider,
|
|
570
|
+
* });
|
|
571
|
+
* ```
|
|
572
|
+
*/
|
|
573
|
+
declare class PlaywrightOAuthClientProvider implements OAuthClientProvider {
|
|
574
|
+
private readonly config;
|
|
575
|
+
private cachedState;
|
|
576
|
+
private stateParam;
|
|
577
|
+
constructor(config: PlaywrightOAuthClientProviderConfig);
|
|
578
|
+
/**
|
|
579
|
+
* The URL to redirect the user agent to after authorization
|
|
580
|
+
*/
|
|
581
|
+
get redirectUrl(): string;
|
|
582
|
+
/**
|
|
583
|
+
* Metadata about this OAuth client
|
|
584
|
+
*/
|
|
585
|
+
get clientMetadata(): OAuthClientMetadata;
|
|
586
|
+
/**
|
|
587
|
+
* Returns an OAuth2 state parameter
|
|
588
|
+
*/
|
|
589
|
+
state(): string;
|
|
590
|
+
/**
|
|
591
|
+
* Loads information about this OAuth client
|
|
592
|
+
*/
|
|
593
|
+
clientInformation(): Promise<OAuthClientInformationFull | undefined>;
|
|
594
|
+
/**
|
|
595
|
+
* Saves client information from Dynamic Client Registration
|
|
596
|
+
*/
|
|
597
|
+
saveClientInformation(clientInformation: OAuthClientInformationFull): Promise<void>;
|
|
598
|
+
/**
|
|
599
|
+
* Loads any existing OAuth tokens for the current session
|
|
600
|
+
*/
|
|
601
|
+
tokens(): Promise<OAuthTokens | undefined>;
|
|
602
|
+
/**
|
|
603
|
+
* Stores new OAuth tokens for the current session
|
|
604
|
+
*/
|
|
605
|
+
saveTokens(tokens: OAuthTokens): Promise<void>;
|
|
606
|
+
/**
|
|
607
|
+
* Invoked to redirect the user agent to the given URL
|
|
608
|
+
*
|
|
609
|
+
* In a testing context, this is typically handled by Playwright automation.
|
|
610
|
+
* This implementation throws an error to signal that the caller needs to
|
|
611
|
+
* handle the redirect externally.
|
|
612
|
+
*/
|
|
613
|
+
redirectToAuthorization(authorizationUrl: URL): Promise<void>;
|
|
614
|
+
/**
|
|
615
|
+
* Saves a PKCE code verifier for the current session
|
|
616
|
+
*/
|
|
617
|
+
saveCodeVerifier(codeVerifier: string): Promise<void>;
|
|
618
|
+
/**
|
|
619
|
+
* Loads the PKCE code verifier for the current session
|
|
620
|
+
*/
|
|
621
|
+
codeVerifier(): Promise<string>;
|
|
622
|
+
/**
|
|
623
|
+
* Invalidates the specified credentials
|
|
624
|
+
*/
|
|
625
|
+
invalidateCredentials(scope: 'all' | 'client' | 'tokens' | 'verifier'): Promise<void>;
|
|
626
|
+
private loadState;
|
|
627
|
+
private saveState;
|
|
628
|
+
private deleteState;
|
|
629
|
+
private createEmptyState;
|
|
630
|
+
private generateRandomString;
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
/**
|
|
634
|
+
* Static token authentication utilities
|
|
635
|
+
*
|
|
636
|
+
* Simple utilities for pre-acquired token authentication
|
|
637
|
+
*/
|
|
638
|
+
/**
|
|
639
|
+
* Creates HTTP headers for static token authentication
|
|
640
|
+
*
|
|
641
|
+
* @param accessToken - The pre-acquired access token
|
|
642
|
+
* @param tokenType - The token type (default: "Bearer")
|
|
643
|
+
* @returns HTTP headers with Authorization header
|
|
644
|
+
*
|
|
645
|
+
* @example
|
|
646
|
+
* ```typescript
|
|
647
|
+
* const headers = createTokenAuthHeaders(process.env.MCP_ACCESS_TOKEN);
|
|
648
|
+
* // { Authorization: 'Bearer eyJ...' }
|
|
649
|
+
* ```
|
|
650
|
+
*/
|
|
651
|
+
declare function createTokenAuthHeaders(accessToken: string, tokenType?: string): Record<string, string>;
|
|
652
|
+
/**
|
|
653
|
+
* Validates that an access token is present and non-empty
|
|
654
|
+
*
|
|
655
|
+
* @param accessToken - The access token to validate
|
|
656
|
+
* @throws Error if token is missing or empty
|
|
657
|
+
*/
|
|
658
|
+
declare function validateAccessToken(accessToken: string | undefined): void;
|
|
659
|
+
/**
|
|
660
|
+
* Checks if a token appears to be expired based on common JWT structure
|
|
661
|
+
*
|
|
662
|
+
* Note: This is a best-effort check and may not work for all token formats.
|
|
663
|
+
* For reliable expiration checking, use the token's associated expiration time.
|
|
664
|
+
*
|
|
665
|
+
* @param accessToken - The access token to check
|
|
666
|
+
* @returns true if the token appears to be expired, false otherwise
|
|
667
|
+
*/
|
|
668
|
+
declare function isTokenExpired(accessToken: string): boolean;
|
|
669
|
+
/**
|
|
670
|
+
* Checks if a token will expire within the specified buffer time
|
|
671
|
+
*
|
|
672
|
+
* @param expiresAt - Token expiration timestamp in milliseconds
|
|
673
|
+
* @param bufferMs - Buffer time in milliseconds (default: 60000 = 1 minute)
|
|
674
|
+
* @returns true if the token will expire within the buffer time
|
|
675
|
+
*/
|
|
676
|
+
declare function isTokenExpiringSoon(expiresAt: number | undefined, bufferMs?: number): boolean;
|
|
677
|
+
|
|
678
|
+
/**
|
|
679
|
+
* OAuth setup utility for Playwright globalSetup
|
|
680
|
+
*
|
|
681
|
+
* Performs the browser-based OAuth flow and saves the auth state
|
|
682
|
+
* for reuse across tests following Playwright's auth state pattern.
|
|
683
|
+
*/
|
|
684
|
+
|
|
685
|
+
/**
|
|
686
|
+
* Performs the OAuth authorization flow using Playwright browser automation
|
|
687
|
+
*
|
|
688
|
+
* This function is designed to be used in Playwright's globalSetup to
|
|
689
|
+
* authenticate once before running tests. The resulting auth state is
|
|
690
|
+
* saved to disk and reused across tests.
|
|
691
|
+
*
|
|
692
|
+
* @param config - OAuth setup configuration
|
|
693
|
+
*
|
|
694
|
+
* @example
|
|
695
|
+
* ```typescript
|
|
696
|
+
* // global-setup.ts
|
|
697
|
+
* import { performOAuthSetup } from '@gleanwork/mcp-server-tester';
|
|
698
|
+
*
|
|
699
|
+
* export default async function globalSetup() {
|
|
700
|
+
* await performOAuthSetup({
|
|
701
|
+
* authServerUrl: 'https://auth.example.com',
|
|
702
|
+
* scopes: ['mcp:read', 'mcp:write'],
|
|
703
|
+
* loginSelectors: {
|
|
704
|
+
* usernameInput: '#username',
|
|
705
|
+
* passwordInput: '#password',
|
|
706
|
+
* submitButton: 'button[type="submit"]',
|
|
707
|
+
* },
|
|
708
|
+
* credentials: {
|
|
709
|
+
* username: process.env.TEST_USER!,
|
|
710
|
+
* password: process.env.TEST_PASSWORD!,
|
|
711
|
+
* },
|
|
712
|
+
* outputPath: 'playwright/.auth/oauth-state.json',
|
|
713
|
+
* });
|
|
714
|
+
* }
|
|
715
|
+
* ```
|
|
716
|
+
*/
|
|
717
|
+
declare function performOAuthSetup(config: OAuthSetupConfig): Promise<void>;
|
|
718
|
+
/**
|
|
719
|
+
* Performs OAuth setup only if valid state doesn't already exist
|
|
720
|
+
*
|
|
721
|
+
* Use this in globalSetup to avoid re-authenticating on every test run.
|
|
722
|
+
*
|
|
723
|
+
* @param config - OAuth setup configuration
|
|
724
|
+
*
|
|
725
|
+
* @example
|
|
726
|
+
* ```typescript
|
|
727
|
+
* // global-setup.ts
|
|
728
|
+
* export default async function globalSetup() {
|
|
729
|
+
* await performOAuthSetupIfNeeded({
|
|
730
|
+
* authServerUrl: 'https://auth.example.com',
|
|
731
|
+
* scopes: ['mcp:read'],
|
|
732
|
+
* loginSelectors: { ... },
|
|
733
|
+
* credentials: { ... },
|
|
734
|
+
* outputPath: 'playwright/.auth/oauth-state.json',
|
|
735
|
+
* });
|
|
736
|
+
* }
|
|
737
|
+
* ```
|
|
738
|
+
*/
|
|
739
|
+
declare function performOAuthSetupIfNeeded(config: OAuthSetupConfig): Promise<void>;
|
|
740
|
+
|
|
741
|
+
/**
|
|
742
|
+
* OAuth flow utilities using oauth4webapi
|
|
743
|
+
*
|
|
744
|
+
* Implements OAuth 2.1 with PKCE as required by MCP specification
|
|
745
|
+
*/
|
|
746
|
+
|
|
747
|
+
/**
|
|
748
|
+
* Discovered OAuth authorization server metadata
|
|
749
|
+
*/
|
|
750
|
+
interface AuthServerMetadata {
|
|
751
|
+
/**
|
|
752
|
+
* The oauth4webapi AuthorizationServer object
|
|
753
|
+
*/
|
|
754
|
+
server: oauth.AuthorizationServer;
|
|
755
|
+
/**
|
|
756
|
+
* Issuer URL
|
|
757
|
+
*/
|
|
758
|
+
issuer: string;
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
/**
|
|
762
|
+
* OAuth Protected Resource and Authorization Server discovery
|
|
763
|
+
*
|
|
764
|
+
* Implements RFC 9728 (OAuth Protected Resource Metadata) and
|
|
765
|
+
* RFC 8414 (Authorization Server Metadata) for MCP servers.
|
|
766
|
+
*/
|
|
767
|
+
|
|
768
|
+
/**
|
|
769
|
+
* MCP Protocol version header value
|
|
770
|
+
*/
|
|
771
|
+
declare const MCP_PROTOCOL_VERSION = "2025-06-18";
|
|
772
|
+
/**
|
|
773
|
+
* Protected Resource Metadata (RFC 9728)
|
|
774
|
+
*/
|
|
775
|
+
interface ProtectedResourceMetadata {
|
|
776
|
+
/**
|
|
777
|
+
* The protected resource URL
|
|
778
|
+
*/
|
|
779
|
+
resource: string;
|
|
780
|
+
/**
|
|
781
|
+
* Array of authorization server URLs
|
|
782
|
+
*/
|
|
783
|
+
authorization_servers?: Array<string>;
|
|
784
|
+
/**
|
|
785
|
+
* Scopes supported by the protected resource
|
|
786
|
+
*/
|
|
787
|
+
scopes_supported?: Array<string>;
|
|
788
|
+
/**
|
|
789
|
+
* Supported methods of sending the bearer token to the resource (e.g., "header", "body", "query")
|
|
790
|
+
*/
|
|
791
|
+
bearer_methods_supported?: Array<string>;
|
|
792
|
+
/**
|
|
793
|
+
* Resource documentation URL
|
|
794
|
+
*/
|
|
795
|
+
resource_documentation?: string;
|
|
796
|
+
/**
|
|
797
|
+
* JWS signing algorithms (`alg` values) supported by the resource for signing its responses
|
|
798
|
+
*/
|
|
799
|
+
resource_signing_alg_values_supported?: Array<string>;
|
|
800
|
+
}
|
|
801
|
+
/**
|
|
802
|
+
* Result of protected resource discovery
|
|
803
|
+
*/
|
|
804
|
+
interface ProtectedResourceDiscoveryResult {
|
|
805
|
+
/**
|
|
806
|
+
* The discovered metadata
|
|
807
|
+
*/
|
|
808
|
+
metadata: ProtectedResourceMetadata;
|
|
809
|
+
/**
|
|
810
|
+
* The URL where metadata was found
|
|
811
|
+
*/
|
|
812
|
+
discoveryUrl: string;
|
|
813
|
+
/**
|
|
814
|
+
* Whether path-aware discovery was used (vs base discovery)
|
|
815
|
+
*/
|
|
816
|
+
usedPathAwareDiscovery: boolean;
|
|
817
|
+
}
|
|
818
|
+
/**
|
|
819
|
+
* Discovers protected resource metadata per RFC 9728
|
|
820
|
+
*
|
|
821
|
+
* Follows RFC 9728 Section 3.1 for path-aware discovery:
|
|
822
|
+
* 1. First tries: {origin}/.well-known/oauth-protected-resource{pathname}
|
|
823
|
+
* 2. Falls back to: {origin}/.well-known/oauth-protected-resource
|
|
824
|
+
*
|
|
825
|
+
* @param mcpServerUrl - The MCP server URL
|
|
826
|
+
* @returns Protected resource discovery result
|
|
827
|
+
* @throws Error if discovery fails completely
|
|
828
|
+
*
|
|
829
|
+
* @example
|
|
830
|
+
* const result = await discoverProtectedResource('https://api.example.com/mcp/default');
|
|
831
|
+
* console.log(result.metadata.authorization_servers);
|
|
832
|
+
*/
|
|
833
|
+
declare function discoverProtectedResource(mcpServerUrl: string): Promise<ProtectedResourceDiscoveryResult>;
|
|
834
|
+
/**
|
|
835
|
+
* Error thrown when discovery fails
|
|
836
|
+
*/
|
|
837
|
+
declare class DiscoveryError extends Error {
|
|
838
|
+
readonly status?: number | undefined;
|
|
839
|
+
readonly url?: string | undefined;
|
|
840
|
+
constructor(message: string, status?: number | undefined, url?: string | undefined);
|
|
841
|
+
}
|
|
842
|
+
/**
|
|
843
|
+
* Discovers OAuth Authorization Server metadata per RFC 8414
|
|
844
|
+
*
|
|
845
|
+
* Wraps oauth4webapi's discovery with MCP-specific headers.
|
|
846
|
+
*
|
|
847
|
+
* @param authServerUrl - The authorization server URL
|
|
848
|
+
* @returns Authorization server metadata
|
|
849
|
+
* @throws Error if discovery fails
|
|
850
|
+
*
|
|
851
|
+
* @example
|
|
852
|
+
* const authServer = await discoverAuthorizationServer('https://auth.example.com');
|
|
853
|
+
* console.log(authServer.server.token_endpoint);
|
|
854
|
+
*/
|
|
855
|
+
declare function discoverAuthorizationServer(authServerUrl: string): Promise<AuthServerMetadata>;
|
|
856
|
+
|
|
857
|
+
/**
|
|
858
|
+
* OAuth token storage with environment variable support for CI/CD
|
|
859
|
+
*
|
|
860
|
+
* Provides file-based storage for OAuth state per MCP server, with support
|
|
861
|
+
* for token injection via environment variables for automated testing.
|
|
862
|
+
*/
|
|
863
|
+
|
|
864
|
+
/**
 * Server metadata bundle: authorization server plus protected resource.
 */
interface StoredServerMetadata {
    /** Metadata of the authorization server. */
    authServer: AuthServerMetadata;
    /** Metadata of the protected resource. */
    protectedResource: ProtectedResourceMetadata;
    /** Timestamp recorded when the metadata was discovered. */
    discoveredAt: number;
}
|
|
881
|
+
/**
 * Names of the environment variables used to inject tokens in CI/CD.
 */
declare const ENV_VAR_NAMES: {
    readonly accessToken: "MCP_ACCESS_TOKEN";
    readonly refreshToken: "MCP_REFRESH_TOKEN";
    readonly tokenType: "MCP_TOKEN_TYPE";
    readonly expiresAt: "MCP_TOKEN_EXPIRES_AT";
};
|
|
890
|
+
/**
 * Builds a token set from environment variables (CI/CD use case).
 *
 * @returns StoredTokens when MCP_ACCESS_TOKEN is set; otherwise null
 */
declare function loadTokensFromEnv(): StoredTokens | null;
|
|
896
|
+
/**
 * Writes the given tokens into storage programmatically (CI/CD setup).
 *
 * @param serverUrl - URL of the MCP server the tokens belong to
 * @param tokens - Tokens to store
 * @param stateDir - Custom state directory (optional)
 */
declare function injectTokens(serverUrl: string, tokens: StoredTokens, stateDir?: string): Promise<void>;
|
|
904
|
+
/**
 * Loads the OAuth tokens stored for an MCP server.
 *
 * Tokens are read from the standard storage location for `serverUrl`. They
 * get there via `mcp-server-tester login` or a call to `injectTokens()`.
 *
 * @param serverUrl - URL of the MCP server
 * @param stateDir - Custom state directory (optional)
 * @returns StoredTokens when present; otherwise null
 *
 * @example
 * ```typescript
 * // After running: npx mcp-server-tester login https://api.example.com/mcp
 * const tokens = await loadTokens('https://api.example.com/mcp');
 * if (tokens) {
 *   console.log('Access token:', tokens.accessToken);
 * }
 * ```
 */
declare function loadTokens(serverUrl: string, stateDir?: string): Promise<StoredTokens | null>;
|
|
924
|
+
/**
 * Reports whether usable OAuth tokens are stored for an MCP server.
 *
 * Resolves to true when tokens exist and have not expired, taking the
 * expiration buffer into account. Call it before making requests to find out
 * whether authentication is required.
 *
 * @param serverUrl - URL of the MCP server
 * @param options - Optional configuration
 * @param options.stateDir - Custom state directory
 * @param options.bufferMs - Buffer time before expiration (default: 60000ms)
 * @returns true when valid (non-expired) tokens exist
 *
 * @example
 * ```typescript
 * if (await hasValidTokens('https://api.example.com/mcp')) {
 *   // Use stored tokens
 *   const tokens = await loadTokens('https://api.example.com/mcp');
 * } else {
 *   console.log('Run: npx mcp-server-tester login https://api.example.com/mcp');
 * }
 * ```
 */
declare function hasValidTokens(serverUrl: string, options?: {
    stateDir?: string;
    bufferMs?: number;
}): Promise<boolean>;
|
|
950
|
+
|
|
951
|
+
/**
|
|
952
|
+
* CLI OAuth client for command-line authentication flows
|
|
953
|
+
*
|
|
954
|
+
* Provides browser-based OAuth authentication for CLI environments,
|
|
955
|
+
* with support for environment variable token injection for CI/CD.
|
|
956
|
+
*/
|
|
957
|
+
/**
 * Settings accepted by the CLI OAuth client.
 */
interface CLIOAuthClientConfig {
    /** URL of the MCP server; used for protected resource discovery. */
    mcpServerUrl: string;
    /** Scopes to request. Optional: discovered scopes are used when omitted. */
    scopes?: string[];
    /** Custom storage directory. */
    stateDir?: string;
    /** Pre-registered client ID; when provided, DCR is skipped. */
    clientId?: string;
    /** Pre-registered client secret. */
    clientSecret?: string;
    /** Preferred callback port (default: random available port). */
    callbackPort?: number;
    /** OAuth flow timeout in milliseconds (default: 300000 = 5 min). */
    timeoutMs?: number;
    /** Client name used for DCR registration. */
    clientName?: string;
}
|
|
994
|
+
/**
 * Outcome of a CLI OAuth authentication.
 */
interface CLIOAuthResult {
    /** The access token. */
    accessToken: string;
    /** Token type, typically "Bearer". */
    tokenType: string;
    /** Expiration timestamp (Unix ms). */
    expiresAt?: number;
    /** True when the token was refreshed rather than newly acquired. */
    refreshed: boolean;
    /** Requested scopes; only set for new authentications. */
    requestedScopes?: string[];
    /** True when the token came from environment variables. */
    fromEnv: boolean;
}
|
|
1023
|
+
/**
 * OAuth client for command-line authentication flows.
 */
declare class CLIOAuthClient {
    private readonly config;
    private readonly storage;
    constructor(config: CLIOAuthClientConfig);
    /**
     * Returns a valid access token, authenticating when necessary.
     *
     * Sources are tried in this order:
     * 1. Environment variables (for CI/CD)
     * 2. Cached tokens in file storage
     * 3. A refresh, when the token is expired but a refresh token exists
     * 4. The full OAuth flow, if still needed
     */
    getAccessToken(): Promise<CLIOAuthResult>;
    /**
     * Attempts to obtain a valid access token without starting browser auth.
     *
     * Resolves to null when no valid token is available: nothing is stored,
     * the token is expired with no refresh token, or the refresh failed.
     * In contrast to getAccessToken(), a browser is NEVER opened.
     *
     * Intended for CLI commands that should ask the user to run `login`
     * rather than start the OAuth flow automatically.
     */
    tryGetAccessToken(): Promise<CLIOAuthResult | null>;
    /** Forces a new authentication flow. */
    authenticate(): Promise<CLIOAuthResult>;
    /** Reports whether stored credentials exist (they may be expired). */
    hasStoredCredentials(): Promise<boolean>;
    /** Removes stored credentials. */
    clearCredentials(): Promise<void>;
    /** Discovers the protected resource and the authorization server. */
    private discoverServers;
    /** Returns the existing client, or registers a new one via DCR. */
    private getOrRegisterClient;
    /** Registers a new client through Dynamic Client Registration. */
    private registerClient;
    /** Runs the complete OAuth authorization flow. */
    private performOAuthFlow;
    /**
     * Refreshes an expired token.
     *
     * When a clientId was stored with the tokens it is reused, so the refresh
     * request comes from the same client that obtained the original tokens.
     * This matters because refresh tokens are bound to the client_id.
     */
    private refreshStoredToken;
    /** Starts the local callback server. */
    private startCallbackServer;
    /** Opens the browser, or prints the URL in headless environments. */
    private openBrowserOrPrintUrl;
    /**
     * Converts a TokenResult into StoredTokens.
     *
     * @param result - Token result from exchange or refresh
     * @param clientId - Client ID that was used to obtain these tokens
     */
    private tokenResultToStoredTokens;
    /** HTML page shown after successful authentication. */
    private successHtml;
    /** HTML page shown on authentication error. */
    private errorHtml;
}
|
|
1111
|
+
|
|
1112
|
+
/**
 * Options accepted when creating an MCP client.
 */
interface CreateMCPClientOptions {
    /** Client information: name and version. */
    clientInfo?: {
        name?: string;
        version?: string;
    };
    /**
     * OAuth client provider used for authentication.
     *
     * If supplied, the MCP SDK drives the OAuth flow automatically, and this
     * provider wins over static token auth in config.auth.accessToken.
     */
    authProvider?: OAuthClientProvider;
}
|
|
1131
|
+
/**
 * Builds an MCP client from the given configuration and connects it.
 *
 * @param config - MCP configuration (validated before use)
 * @param options - Optional client options, including an auth provider
 * @returns An MCP Client instance that is already connected
 * @throws {Error} When the config is invalid or the connection fails
 *
 * @example
 * // Stdio transport
 * const client = await createMCPClientForConfig({
 *   transport: 'stdio',
 *   command: 'node',
 *   args: ['server.js']
 * });
 *
 * @example
 * // HTTP transport with static token auth
 * const client = await createMCPClientForConfig({
 *   transport: 'http',
 *   serverUrl: 'http://localhost:3000/mcp',
 *   auth: { accessToken: 'your-token' }
 * });
 *
 * @example
 * // HTTP transport with OAuth provider
 * const client = await createMCPClientForConfig(
 *   { transport: 'http', serverUrl: 'http://localhost:3000/mcp' },
 *   { authProvider: myOAuthProvider }
 * );
 */
declare function createMCPClientForConfig(config: MCPConfig, options?: CreateMCPClientOptions): Promise<Client>;
|
|
1163
|
+
/**
 * Closes an MCP client connection safely.
 *
 * @param client - Client whose connection should be closed
 */
declare function closeMCPClient(client: Client): Promise<void>;
|
|
1169
|
+
|
|
1170
|
+
/**
 * One content block taken from an MCP response.
 *
 * Only `type` is required; `text`, `data` and `mimeType` are optional.
 */
interface ContentBlock {
    type: string;
    text?: string;
    data?: unknown;
    mimeType?: string;
}
|
|
1179
|
+
/**
 * Normalized view of an MCP tool response.
 *
 * Offers one consistent shape no matter which response format the MCP
 * server returned.
 */
interface NormalizedToolResponse {
    /** Text content, concatenated from all text blocks. */
    text: string;
    /** The original, unmodified response from the MCP SDK. */
    raw: CallToolResult;
    /** True when the tool call resulted in an error. */
    isError: boolean;
    /** Content blocks parsed from the response. */
    contentBlocks: ContentBlock[];
    /** Structured content, if present (parsed JSON or raw data). */
    structuredContent: unknown;
}
|
|
1207
|
+
/**
 * Converts an MCP CallToolResult into the normalized response format.
 *
 * @param result - Raw CallToolResult produced by the MCP SDK
 * @returns Normalized response carrying extracted text, content blocks, etc.
 *
 * @example
 * ```typescript
 * const result = await client.callTool({ name: 'read_file', arguments: { path: 'readme.txt' } });
 * const normalized = normalizeToolResponse(result);
 *
 * console.log(normalized.text); // "Hello World"
 * console.log(normalized.isError); // false
 * console.log(normalized.contentBlocks); // [{ type: 'text', text: 'Hello World' }]
 * ```
 */
declare function normalizeToolResponse(result: CallToolResult): NormalizedToolResponse;
|
|
1224
|
+
/**
 * Pulls only the text content out of a response.
 *
 * Convenience helper that accepts any of:
 * - a raw CallToolResult from the MCP SDK
 * - a NormalizedToolResponse returned by normalizeToolResponse()
 * - plain strings or other legacy formats
 *
 * @param response - Response in any supported format
 * @returns The extracted text content
 */
declare function extractText(response: unknown): string;
|
|
1236
|
+
|
|
1237
|
+
/**
|
|
1238
|
+
* Validator Types
|
|
1239
|
+
*
|
|
1240
|
+
* Core types for the unified assertion architecture.
|
|
1241
|
+
* These types are used by both Playwright matchers and the eval runner.
|
|
1242
|
+
*/
|
|
1243
|
+
|
|
1244
|
+
/**
 * Outcome of a validation operation.
 */
interface ValidationResult {
    /** True when the validation passed. */
    pass: boolean;
    /** Human-readable explanation of the result. */
    message: string;
    /** Extra structured details about the validation. */
    details?: Record<string, unknown>;
}
|
|
1255
|
+
/**
 * Settings for text validation.
 */
interface TextValidatorOptions {
    /** Match case-sensitively when true (default: true). */
    caseSensitive?: boolean;
}
|
|
1262
|
+
/**
 * Settings for response size validation.
 */
interface SizeValidatorOptions {
    /** Upper bound on the size, in bytes. */
    maxBytes?: number;
    /** Lower bound on the size, in bytes. */
    minBytes?: number;
}
|
|
1271
|
+
/**
 * Settings for schema validation.
 */
interface SchemaValidatorOptions {
    /** Enables strict mode, which fails on extra properties. */
    strict?: boolean;
}
|
|
1278
|
+
/**
 * Settings for pattern validation.
 */
interface PatternValidatorOptions {
    /** Match case-sensitively when true (default: true). */
    caseSensitive?: boolean;
}
|
|
1285
|
+
/**
 * Names of the built-in sanitizers that cover common variable patterns.
 */
type BuiltInSanitizer = 'timestamp' | 'uuid' | 'iso-date' | 'objectId' | 'jwt';
|
|
1289
|
+
/**
 * Sanitizer driven by a custom regular expression.
 */
interface RegexSanitizer {
    /** Regex pattern to match, given as a string or a RegExp. */
    pattern: string | RegExp;
    /** Text substituted for each match (default: "[SANITIZED]"). */
    replacement?: string;
}
|
|
1298
|
+
/**
 * Sanitizer that strips the listed fields from objects.
 */
interface FieldRemovalSanitizer {
    /** Paths of the fields to remove; dot notation reaches nested fields. */
    remove: string[];
}
|
|
1305
|
+
/**
 * Configuration of a single snapshot sanitizer.
 *
 * A sanitizer transforms response data before the snapshot comparison so
 * that variable content (timestamps, IDs, etc.) is normalized.
 *
 * Accepted forms:
 * - a built-in sanitizer name: 'timestamp', 'uuid', 'iso-date', 'objectId', 'jwt'
 * - a regex sanitizer: { pattern: /regex/, replacement: '[REPLACED]' }
 * - a field removal sanitizer: { remove: ['field1', 'nested.field'] }
 */
type SnapshotSanitizer = BuiltInSanitizer | RegexSanitizer | FieldRemovalSanitizer;
|
|
1317
|
+
/**
 * Registry that maps schema names to Zod schemas, for named schemas in datasets.
 */
type SchemaRegistry = Record<string, ZodType>;
|
|
1321
|
+
|
|
1322
|
+
/**
|
|
1323
|
+
* Response Validator
|
|
1324
|
+
*
|
|
1325
|
+
* Validates that a response exactly matches an expected value.
|
|
1326
|
+
*/
|
|
1327
|
+
|
|
1328
|
+
/**
 * Checks that a response is exactly equal to an expected value.
 *
 * The comparison is a deep equality check performed via JSON serialization.
 *
 * @param actual - Response that was actually received
 * @param expected - Response that is expected
 * @returns The validation result
 *
 * @example
 * ```typescript
 * const result = validateResponse(response, { status: 'ok', count: 42 });
 * if (!result.pass) {
 *   console.log(result.message);
 * }
 * ```
 */
declare function validateResponse(actual: unknown, expected: unknown): ValidationResult;
|
|
1346
|
+
|
|
1347
|
+
/**
|
|
1348
|
+
* Schema Validator
|
|
1349
|
+
*
|
|
1350
|
+
* Validates that a response matches a Zod schema.
|
|
1351
|
+
*/
|
|
1352
|
+
|
|
1353
|
+
/**
 * Checks a response against a Zod schema.
 *
 * The response is parsed with the supplied schema. When the response is a
 * text representation of JSON, it is parsed first.
 *
 * @param response - Response to validate
 * @param schema - Zod schema the response must satisfy
 * @param options - Validation options
 * @returns The validation result
 *
 * @example
 * ```typescript
 * import { z } from 'zod';
 *
 * const WeatherSchema = z.object({
 *   temperature: z.number(),
 *   conditions: z.string(),
 * });
 *
 * const result = validateSchema(response, WeatherSchema);
 * if (!result.pass) {
 *   console.log(result.message);
 * }
 * ```
 */
declare function validateSchema(response: unknown, schema: ZodType, options?: SchemaValidatorOptions): ValidationResult;
|
|
1380
|
+
|
|
1381
|
+
/**
|
|
1382
|
+
* Text Validator
|
|
1383
|
+
*
|
|
1384
|
+
* Validates that a response contains expected text substrings.
|
|
1385
|
+
*/
|
|
1386
|
+
|
|
1387
|
+
/**
 * Checks that a response contains every expected text substring.
 *
 * Text is extracted from the response, then each expected substring is
 * looked up in it. Matching is case-sensitive unless configured otherwise.
 *
 * @param response - Response to validate
 * @param expected - Substring or substrings that must be present
 * @param options - Validation options
 * @returns The validation result
 *
 * @example
 * ```typescript
 * const result = validateText(response, ['temperature', 'conditions']);
 * if (!result.pass) {
 *   console.log(result.message);
 * }
 *
 * // Case-insensitive matching
 * const result2 = validateText(response, 'HELLO', { caseSensitive: false });
 * ```
 */
declare function validateText(response: unknown, expected: string | string[], options?: TextValidatorOptions): ValidationResult;
|
|
1410
|
+
|
|
1411
|
+
/**
|
|
1412
|
+
* Pattern Validator
|
|
1413
|
+
*
|
|
1414
|
+
* Validates that a response matches regex patterns.
|
|
1415
|
+
*/
|
|
1416
|
+
|
|
1417
|
+
/**
 * Checks that a response matches every expected regex pattern.
 *
 * Text is extracted from the response and tested against each pattern.
 * A pattern may be a RegExp object or a string, which is compiled to RegExp.
 *
 * @param response - Response to validate
 * @param patterns - Pattern or patterns that must match
 * @param options - Validation options
 * @returns The validation result
 *
 * @example
 * ```typescript
 * // String pattern
 * const result = validatePattern(response, 'temperature: \\d+');
 *
 * // RegExp pattern
 * const result2 = validatePattern(response, /temperature: \d+/);
 *
 * // Multiple patterns
 * const result3 = validatePattern(response, [
 *   /temperature: \d+/,
 *   /humidity: \d+%/,
 * ]);
 *
 * // Case-insensitive matching
 * const result4 = validatePattern(response, 'HELLO', { caseSensitive: false });
 * ```
 */
declare function validatePattern(response: unknown, patterns: string | RegExp | (string | RegExp)[], options?: PatternValidatorOptions): ValidationResult;
|
|
1447
|
+
|
|
1448
|
+
/**
|
|
1449
|
+
* Error Validator
|
|
1450
|
+
*
|
|
1451
|
+
* Validates error response behavior.
|
|
1452
|
+
*/
|
|
1453
|
+
|
|
1454
|
+
/**
 * Checks whether a response is, or is not, an error.
 *
 * Supported expectations:
 * - any error (expected = true)
 * - no error (expected = false)
 * - an error with specific message(s) (expected = string or string[])
 *
 * @param response - Response to validate
 * @param expected - `true` for any error, `false` for no error, or a string /
 *   string array for an error carrying specific message(s)
 * @returns The validation result
 *
 * @example
 * ```typescript
 * // Expect any error
 * const result = validateError(response, true);
 *
 * // Expect no error
 * const result2 = validateError(response, false);
 *
 * // Expect error with specific message
 * const result3 = validateError(response, 'File not found');
 *
 * // Expect error containing one of several messages
 * const result4 = validateError(response, ['not found', 'does not exist']);
 * ```
 */
declare function validateError(response: unknown, expected?: boolean | string | string[]): ValidationResult;
|
|
1482
|
+
|
|
1483
|
+
/**
|
|
1484
|
+
* Size Validator
|
|
1485
|
+
*
|
|
1486
|
+
* Validates that a response meets size constraints.
|
|
1487
|
+
*/
|
|
1488
|
+
|
|
1489
|
+
/**
 * Checks a response against size constraints.
 *
 * The response size in bytes must fall within the given bounds. Callers
 * must supply at least one of minBytes or maxBytes.
 *
 * @param response - Response to validate
 * @param options - Size constraints to enforce
 * @returns The validation result
 *
 * @example
 * ```typescript
 * // Maximum size check
 * const result = validateSize(response, { maxBytes: 10000 });
 *
 * // Minimum size check
 * const result2 = validateSize(response, { minBytes: 100 });
 *
 * // Both bounds
 * const result3 = validateSize(response, { minBytes: 100, maxBytes: 10000 });
 * ```
 */
declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
|
|
1512
|
+
|
|
1513
|
+
/**
|
|
1514
|
+
* Validator Utilities
|
|
1515
|
+
*
|
|
1516
|
+
* Shared utility functions for validation operations.
|
|
1517
|
+
* Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
|
|
1518
|
+
*/
|
|
1519
|
+
|
|
1520
|
+
/**
 * Computes the size of a response in bytes.
 *
 * The response is serialized to JSON (pretty-printed, for consistency) and
 * the UTF-8 byte length of that serialization is returned.
 *
 * @param response - Response in any format
 * @returns The size in bytes
 */
declare function getResponseSizeBytes(response: unknown): number;
|
|
1530
|
+
/**
 * Normalizes the whitespace of a text so comparisons are consistent.
 *
 * Runs of whitespace characters (spaces, tabs, newlines) collapse into a
 * single space, and leading/trailing whitespace is trimmed.
 *
 * @param text - Text to normalize
 * @returns The text with collapsed whitespace
 *
 * @example
 * ```typescript
 * normalizeWhitespace('  hello\n\n  world  ');
 * // Returns: "hello world"
 * ```
 */
declare function normalizeWhitespace(text: string): string;
|
|
1546
|
+
|
|
1547
|
+
/**
 * Usage metrics reported in a Claude Agent SDK response.
 */
interface UsageMetrics {
    /** Count of input tokens consumed. */
    inputTokens: number;
    /** Count of output tokens generated. */
    outputTokens: number;
    /** Total cost, in USD. */
    totalCostUsd: number;
    /** Execution duration, in milliseconds. */
    durationMs: number;
    /** API call duration in milliseconds (excluding network overhead). */
    durationApiMs?: number;
    /** Count of tokens read from cache. */
    cacheReadInputTokens?: number;
    /** Count of tokens written to cache. */
    cacheCreationInputTokens?: number;
}
|
|
1580
|
+
/**
 * The LLM provider types that are supported.
 */
type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
|
|
1584
|
+
/**
 * Settings for an LLM judge.
 */
interface JudgeConfig {
    /**
     * Which LLM provider to use.
     * @default 'claude'
     */
    provider?: ProviderKind;
    /**
     * Name of the environment variable that holds the API key.
     * @default 'ANTHROPIC_API_KEY'
     */
    apiKeyEnvVar?: string;
    /**
     * Model used for judging.
     * @default 'claude-sonnet-4-20250514'
     */
    model?: string;
    /**
     * Maximum number of tokens for the response.
     * @default 1000
     */
    maxTokens?: number;
    /**
     * Temperature (0-1; lower is more deterministic).
     * @default 0.0
     */
    temperature?: number;
    /**
     * Maximum budget, in USD, for the judge evaluation.
     * @default 0.10
     */
    maxBudgetUsd?: number;
    /**
     * Maximum tool output size, in bytes, before the test fails.
     * When set, the judge fails if the candidate response exceeds this size.
     */
    maxToolOutputSize?: number;
}
|
|
1624
|
+
/**
 * Outcome of an LLM judge evaluation.
 */
interface JudgeResult {
    /** True when the evaluation passed. */
    pass: boolean;
    /** Numeric score (0-1, where 1 is best). */
    score?: number;
    /** The judge's reasoning/explanation. */
    reasoning?: string;
    /** Usage metrics from the Claude Agent SDK. */
    usage?: UsageMetrics;
    /** Candidate response size in bytes (for maxToolOutputSize tracking). */
    candidateSizeBytes?: number;
    /** True when the candidate exceeded maxToolOutputSize. */
    exceedsMaxToolOutputSize?: boolean;
}
|
|
1653
|
+
/**
 * Interface implemented by LLM judge clients.
 */
interface Judge {
    /**
     * Scores a candidate response against a reference.
     *
     * @param candidate - Actual response under evaluation
     * @param reference - Expected/reference response (or null if not applicable)
     * @param rubric - Evaluation rubric/criteria
     * @returns The evaluation result, including usage metrics
     */
    evaluate(candidate: unknown, reference: unknown, rubric: string): Promise<JudgeResult>;
}
|
|
1667
|
+
|
|
1668
|
+
/**
|
|
1669
|
+
* Matcher Types
|
|
1670
|
+
*
|
|
1671
|
+
* TypeScript declarations for custom Playwright matchers.
|
|
1672
|
+
*/
|
|
1673
|
+
|
|
1674
|
+
/**
 * Settings for the LLM judge matcher.
 */
interface JudgeMatcherOptions {
    /** Reference response the candidate is compared against. */
    reference?: unknown;
    /** Minimum score required to pass (default: 0.7). */
    passingThreshold?: number;
    /** Overrides the judge configuration. */
    judgeConfig?: JudgeConfig;
}
|
|
1685
|
+
/**
|
|
1686
|
+
* Declaration merging for Playwright matchers
|
|
1687
|
+
*/
|
|
1688
|
+
declare global {
|
|
1689
|
+
namespace PlaywrightTest {
|
|
1690
|
+
interface Matchers<R, T = unknown> {
|
|
1691
|
+
/**
|
|
1692
|
+
* Validates that a response exactly matches the expected value
|
|
1693
|
+
*
|
|
1694
|
+
* @param expected - The expected response value
|
|
1695
|
+
*
|
|
1696
|
+
* @example
|
|
1697
|
+
* ```typescript
|
|
1698
|
+
* expect(result).toMatchToolResponse({ status: 'ok', count: 42 });
|
|
1699
|
+
* ```
|
|
1700
|
+
*/
|
|
1701
|
+
toMatchToolResponse(expected: unknown): R;
|
|
1702
|
+
/**
|
|
1703
|
+
* Validates that a response matches a Zod schema
|
|
1704
|
+
*
|
|
1705
|
+
* @param schema - The Zod schema to validate against
|
|
1706
|
+
* @param options - Validation options
|
|
1707
|
+
*
|
|
1708
|
+
* @example
|
|
1709
|
+
* ```typescript
|
|
1710
|
+
* const WeatherSchema = z.object({
|
|
1711
|
+
* temperature: z.number(),
|
|
1712
|
+
* conditions: z.string(),
|
|
1713
|
+
* });
|
|
1714
|
+
* expect(result).toMatchToolSchema(WeatherSchema);
|
|
1715
|
+
* ```
|
|
1716
|
+
*/
|
|
1717
|
+
toMatchToolSchema(schema: ZodType, options?: SchemaValidatorOptions): R;
|
|
1718
|
+
/**
|
|
1719
|
+
* Validates that a response contains expected text substrings
|
|
1720
|
+
*
|
|
1721
|
+
* @param expected - Expected substring(s) to find
|
|
1722
|
+
* @param options - Validation options
|
|
1723
|
+
*
|
|
1724
|
+
* @example
|
|
1725
|
+
* ```typescript
|
|
1726
|
+
* expect(result).toContainToolText('temperature');
|
|
1727
|
+
* expect(result).toContainToolText(['temperature', 'conditions']);
|
|
1728
|
+
* expect(result).toContainToolText('HELLO', { caseSensitive: false });
|
|
1729
|
+
* ```
|
|
1730
|
+
*/
|
|
1731
|
+
toContainToolText(expected: string | string[], options?: TextValidatorOptions): R;
|
|
1732
|
+
/**
|
|
1733
|
+
* Validates that a response matches regex patterns
|
|
1734
|
+
*
|
|
1735
|
+
* @param patterns - Expected pattern(s) to match
|
|
1736
|
+
* @param options - Validation options
|
|
1737
|
+
*
|
|
1738
|
+
* @example
|
|
1739
|
+
* ```typescript
|
|
1740
|
+
* expect(result).toMatchToolPattern(/temperature: \d+/);
|
|
1741
|
+
* expect(result).toMatchToolPattern(['temp: \\d+', 'humidity: \\d+%']);
|
|
1742
|
+
* ```
|
|
1743
|
+
*/
|
|
1744
|
+
toMatchToolPattern(patterns: string | RegExp | (string | RegExp)[], options?: PatternValidatorOptions): R;
|
|
1745
|
+
/**
|
|
1746
|
+
* Validates that a response matches a saved snapshot
|
|
1747
|
+
*
|
|
1748
|
+
* @param name - Snapshot name
|
|
1749
|
+
* @param sanitizers - Optional sanitizers for non-deterministic values
|
|
1750
|
+
*
|
|
1751
|
+
* @example
|
|
1752
|
+
* ```typescript
|
|
1753
|
+
* await expect(result).toMatchToolSnapshot('weather-response');
|
|
1754
|
+
* await expect(result).toMatchToolSnapshot('user-data', [
|
|
1755
|
+
* { pattern: /\d{4}-\d{2}-\d{2}/, replacement: '[DATE]' },
|
|
1756
|
+
* ]);
|
|
1757
|
+
* ```
|
|
1758
|
+
*/
|
|
1759
|
+
toMatchToolSnapshot(name: string, sanitizers?: SnapshotSanitizer[]): Promise<R>;
|
|
1760
|
+
/**
|
|
1761
|
+
* Validates that a response is (or is not) an error
|
|
1762
|
+
*
|
|
1763
|
+
* @param expected - What to expect (true for error, false for success, a string or string array for specific error message(s))
|
|
1764
|
+
*
|
|
1765
|
+
* @example
|
|
1766
|
+
* ```typescript
|
|
1767
|
+
* expect(result).toBeToolError(); // Expects any error
|
|
1768
|
+
* expect(result).not.toBeToolError(); // Expects success
|
|
1769
|
+
* expect(result).toBeToolError('File not found'); // Expects specific error
|
|
1770
|
+
* ```
|
|
1771
|
+
*/
|
|
1772
|
+
toBeToolError(expected?: boolean | string | string[]): R;
|
|
1773
|
+
/**
|
|
1774
|
+
* Validates that a response passes LLM-as-judge evaluation
|
|
1775
|
+
*
|
|
1776
|
+
* @param rubric - Evaluation rubric/criteria
|
|
1777
|
+
* @param options - Judge options
|
|
1778
|
+
*
|
|
1779
|
+
* @example
|
|
1780
|
+
* ```typescript
|
|
1781
|
+
* await expect(result).toPassToolJudge('Response should be helpful and accurate');
|
|
1782
|
+
* await expect(result).toPassToolJudge('Response should match reference', {
|
|
1783
|
+
* reference: expectedOutput,
|
|
1784
|
+
* passingThreshold: 0.8,
|
|
1785
|
+
* });
|
|
1786
|
+
* ```
|
|
1787
|
+
*/
|
|
1788
|
+
toPassToolJudge(rubric: string, options?: JudgeMatcherOptions): Promise<R>;
|
|
1789
|
+
/**
|
|
1790
|
+
* Validates that a response meets size constraints
|
|
1791
|
+
*
|
|
1792
|
+
* @param options - Size constraints (maxBytes, minBytes)
|
|
1793
|
+
*
|
|
1794
|
+
* @example
|
|
1795
|
+
* ```typescript
|
|
1796
|
+
* expect(result).toHaveToolResponseSize({ maxBytes: 10000 });
|
|
1797
|
+
* expect(result).toHaveToolResponseSize({ minBytes: 100, maxBytes: 50000 });
|
|
1798
|
+
* ```
|
|
1799
|
+
*/
|
|
1800
|
+
toHaveToolResponseSize(options: SizeValidatorOptions): R;
|
|
1801
|
+
/**
|
|
1802
|
+
* Validates that a response satisfies a custom predicate function
|
|
1803
|
+
*
|
|
1804
|
+
* Use this as an escape hatch when built-in matchers don't cover your use case.
|
|
1805
|
+
* The predicate receives both the raw response and extracted text for convenience.
|
|
1806
|
+
*
|
|
1807
|
+
* @param predicate - Function that validates the response
|
|
1808
|
+
* @param description - Optional description for error messages
|
|
1809
|
+
*
|
|
1810
|
+
* @example
|
|
1811
|
+
* ```typescript
|
|
1812
|
+
* // Simple boolean predicate
|
|
1813
|
+
* await expect(result).toSatisfyToolPredicate((response) => {
|
|
1814
|
+
* return response.data?.items?.length > 0;
|
|
1815
|
+
* });
|
|
1816
|
+
*
|
|
1817
|
+
* // Predicate with custom message
|
|
1818
|
+
* await expect(result).toSatisfyToolPredicate(
|
|
1819
|
+
* (response, text) => ({
|
|
1820
|
+
* pass: text.includes('success'),
|
|
1821
|
+
* message: 'Expected response to contain "success"',
|
|
1822
|
+
* }),
|
|
1823
|
+
* 'success check'
|
|
1824
|
+
* );
|
|
1825
|
+
*
|
|
1826
|
+
* // Async predicate
|
|
1827
|
+
* await expect(result).toSatisfyToolPredicate(async (response) => {
|
|
1828
|
+
* return await validateWithExternalService(response);
|
|
1829
|
+
* });
|
|
1830
|
+
* ```
|
|
1831
|
+
*/
|
|
1832
|
+
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
1833
|
+
}
|
|
1834
|
+
}
|
|
1835
|
+
}
|
|
1836
|
+
/**
|
|
1837
|
+
* Predicate result returned by the user's predicate function
|
|
1838
|
+
*/
|
|
1839
|
+
interface PredicateResult {
|
|
1840
|
+
/** Whether the predicate passed */
|
|
1841
|
+
pass: boolean;
|
|
1842
|
+
/** Message explaining the result (shown on failure) */
|
|
1843
|
+
message?: string;
|
|
1844
|
+
}
|
|
1845
|
+
/**
|
|
1846
|
+
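* A predicate function that validates a response. It receives the raw response and the extracted text, and returns (or resolves to) a boolean or a PredicateResult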
|
|
1847
|
+
*/
|
|
1848
|
+
type ToolPredicate = (response: unknown, text: string) => boolean | PredicateResult | Promise<boolean | PredicateResult>;
|
|
1849
|
+
|
|
1850
|
+
/**
|
|
1851
|
+
* Canonical type definitions for @gleanwork/mcp-server-tester
|
|
1852
|
+
*
|
|
1853
|
+
* This module is the single source of truth for shared types.
|
|
1854
|
+
* All other modules should import from here rather than defining their own.
|
|
1855
|
+
*
|
|
1856
|
+
* @packageDocumentation
|
|
1857
|
+
*/
|
|
1858
|
+
/**
|
|
1859
|
+
* Authentication type for MCP connections
|
|
1860
|
+
*
|
|
1861
|
+
* - 'oauth': Interactive OAuth 2.1 with PKCE (browser-based authentication)
|
|
1862
|
+
* - 'api-token': Static API token (e.g., from a dashboard or environment variable)
|
|
1863
|
+
* - 'none': No authentication
|
|
1864
|
+
*/
|
|
1865
|
+
type AuthType = 'oauth' | 'api-token' | 'none';
|
|
1866
|
+
/**
|
|
1867
|
+
* Source of test results
|
|
1868
|
+
*
|
|
1869
|
+
* - 'eval': From runEvalDataset() using JSON eval datasets
|
|
1870
|
+
* - 'test': From direct API test tracking (MCP fixture calls)
|
|
1871
|
+
*/
|
|
1872
|
+
type ResultSource = 'eval' | 'test';
|
|
1873
|
+
/**
|
|
1874
|
+
* Known expectation types supported by the framework
|
|
1875
|
+
*/
|
|
1876
|
+
type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size';
|
|
1877
|
+
/**
|
|
1878
|
+
* Result of an expectation check
|
|
1879
|
+
*/
|
|
1880
|
+
interface EvalExpectationResult {
|
|
1881
|
+
/**
|
|
1882
|
+
* Whether the expectation passed
|
|
1883
|
+
*/
|
|
1884
|
+
pass: boolean;
|
|
1885
|
+
/**
|
|
1886
|
+
* Optional details about the result
|
|
1887
|
+
*/
|
|
1888
|
+
details?: string;
|
|
1889
|
+
}
|
|
1890
|
+
/**
|
|
1891
|
+
* Map of expectation type to result
|
|
1892
|
+
*/
|
|
1893
|
+
type ExpectationResultMap = Partial<Record<ExpectationType, EvalExpectationResult>>;
|
|
1894
|
+
/**
|
|
1895
|
+
* Breakdown of expectation types used in a run
|
|
1896
|
+
*/
|
|
1897
|
+
type ExpectationBreakdown = Record<ExpectationType, number>;
|
|
1898
|
+
|
|
1899
|
+
/**
|
|
1900
|
+
* Options for creating an MCP fixture
|
|
1901
|
+
*/
|
|
1902
|
+
interface MCPFixtureOptions {
|
|
1903
|
+
/**
|
|
1904
|
+
* Authentication type used for this test
|
|
1905
|
+
* - 'oauth': Interactive OAuth 2.1 with PKCE (browser-based authentication)
|
|
1906
|
+
* - 'api-token': Static API token (e.g., from a dashboard or environment variable)
|
|
1907
|
+
* - 'none': No authentication
|
|
1908
|
+
*/
|
|
1909
|
+
authType?: AuthType;
|
|
1910
|
+
/**
|
|
1911
|
+
* Playwright project name for this test
|
|
1912
|
+
* Used for filtering and grouping in the reporter
|
|
1913
|
+
*/
|
|
1914
|
+
project?: string;
|
|
1915
|
+
}
|
|
1916
|
+
/**
|
|
1917
|
+
* High-level API for interacting with MCP servers in tests
|
|
1918
|
+
*
|
|
1919
|
+
* This interface wraps the raw MCP Client with test-friendly methods
|
|
1920
|
+
*/
|
|
1921
|
+
interface MCPFixtureApi {
|
|
1922
|
+
/**
|
|
1923
|
+
* The underlying MCP client (for advanced usage)
|
|
1924
|
+
*/
|
|
1925
|
+
client: Client;
|
|
1926
|
+
/**
|
|
1927
|
+
* Authentication type used for this test session
|
|
1928
|
+
*/
|
|
1929
|
+
authType: AuthType;
|
|
1930
|
+
/**
|
|
1931
|
+
* Playwright project name for this test session
|
|
1932
|
+
*/
|
|
1933
|
+
project?: string;
|
|
1934
|
+
/**
|
|
1935
|
+
* Lists all available tools from the MCP server
|
|
1936
|
+
*
|
|
1937
|
+
* @returns Array of tool definitions
|
|
1938
|
+
*/
|
|
1939
|
+
listTools(): Promise<Array<Tool>>;
|
|
1940
|
+
/**
|
|
1941
|
+
* Calls a tool on the MCP server
|
|
1942
|
+
*
|
|
1943
|
+
* @param name - Tool name
|
|
1944
|
+
* @param args - Tool arguments
|
|
1945
|
+
* @returns Tool call result
|
|
1946
|
+
*/
|
|
1947
|
+
callTool<TArgs extends Record<string, unknown> = Record<string, unknown>>(name: string, args: TArgs): Promise<CallToolResult>;
|
|
1948
|
+
/**
|
|
1949
|
+
* Gets information about the connected server
|
|
1950
|
+
*/
|
|
1951
|
+
getServerInfo(): {
|
|
1952
|
+
name?: string;
|
|
1953
|
+
version?: string;
|
|
1954
|
+
} | null;
|
|
1955
|
+
}
|
|
1956
|
+
/**
|
|
1957
|
+
* Creates an MCP fixture wrapper around a Client
|
|
1958
|
+
*
|
|
1959
|
+
* When testInfo is provided, automatically tracks all MCP operations with test.step()
|
|
1960
|
+
* and creates attachments for the MCP Test Reporter.
|
|
1961
|
+
*
|
|
1962
|
+
* @param client - The MCP client to wrap
|
|
1963
|
+
* @param testInfo - Optional Playwright TestInfo for auto-tracking
* @param options - Optional fixture options (authType and project name)
|
|
1964
|
+
* @returns MCPFixtureApi instance
|
|
1965
|
+
*
|
|
1966
|
+
* @example
|
|
1967
|
+
* ```typescript
|
|
1968
|
+
* // With tracking (recommended)
|
|
1969
|
+
* const test = base.extend<{ mcp: MCPFixtureApi }>({
|
|
1970
|
+
* mcp: async ({}, use, testInfo) => {
|
|
1971
|
+
* const client = await createMCPClientForConfig(config);
|
|
1972
|
+
* const api = createMCPFixture(client, testInfo);
|
|
1973
|
+
* await use(api);
|
|
1974
|
+
* await closeMCPClient(client);
|
|
1975
|
+
* }
|
|
1976
|
+
* });
|
|
1977
|
+
*
|
|
1978
|
+
* // Without tracking
|
|
1979
|
+
* const api = createMCPFixture(client);
|
|
1980
|
+
* ```
|
|
1981
|
+
*/
|
|
1982
|
+
declare function createMCPFixture(client: Client, testInfo?: TestInfo, options?: MCPFixtureOptions): MCPFixtureApi;
|
|
1983
|
+
|
|
1984
|
+
/**
|
|
1985
|
+
* toMatchToolResponse Matcher
|
|
1986
|
+
*
|
|
1987
|
+
* Validates that a response exactly matches an expected value.
|
|
1988
|
+
*/
|
|
1989
|
+
/**
|
|
1990
|
+
* Matcher implementation for toMatchToolResponse; returns `pass` plus a `message` callback
|
|
1991
|
+
*/
|
|
1992
|
+
declare function toMatchToolResponse(this: {
|
|
1993
|
+
isNot: boolean;
|
|
1994
|
+
}, received: unknown, expected: unknown): {
|
|
1995
|
+
pass: boolean;
|
|
1996
|
+
message: () => string;
|
|
1997
|
+
};
|
|
1998
|
+
|
|
1999
|
+
/**
|
|
2000
|
+
* toMatchToolSchema Matcher
|
|
2001
|
+
*
|
|
2002
|
+
* Validates that a response matches a Zod schema.
|
|
2003
|
+
*/
|
|
2004
|
+
|
|
2005
|
+
/**
|
|
2006
|
+
* Matcher implementation for toMatchToolSchema; returns `pass` plus a `message` callback
|
|
2007
|
+
*/
|
|
2008
|
+
declare function toMatchToolSchema(this: {
|
|
2009
|
+
isNot: boolean;
|
|
2010
|
+
}, received: unknown, schema: ZodType, options?: SchemaValidatorOptions): {
|
|
2011
|
+
pass: boolean;
|
|
2012
|
+
message: () => string;
|
|
2013
|
+
};
|
|
2014
|
+
|
|
2015
|
+
/**
|
|
2016
|
+
* toContainToolText Matcher
|
|
2017
|
+
*
|
|
2018
|
+
* Validates that a response contains expected text substrings.
|
|
2019
|
+
*/
|
|
2020
|
+
|
|
2021
|
+
/**
|
|
2022
|
+
* Matcher implementation for toContainToolText; returns `pass` plus a `message` callback
|
|
2023
|
+
*/
|
|
2024
|
+
declare function toContainToolText(this: {
|
|
2025
|
+
isNot: boolean;
|
|
2026
|
+
}, received: unknown, expected: string | string[], options?: TextValidatorOptions): {
|
|
2027
|
+
pass: boolean;
|
|
2028
|
+
message: () => string;
|
|
2029
|
+
};
|
|
2030
|
+
|
|
2031
|
+
/**
|
|
2032
|
+
* toMatchToolPattern Matcher
|
|
2033
|
+
*
|
|
2034
|
+
* Validates that a response matches regex patterns.
|
|
2035
|
+
*/
|
|
2036
|
+
|
|
2037
|
+
/**
|
|
2038
|
+
* Matcher implementation for toMatchToolPattern; returns `pass` plus a `message` callback
|
|
2039
|
+
*/
|
|
2040
|
+
declare function toMatchToolPattern(this: {
|
|
2041
|
+
isNot: boolean;
|
|
2042
|
+
}, received: unknown, patterns: string | RegExp | (string | RegExp)[], options?: PatternValidatorOptions): {
|
|
2043
|
+
pass: boolean;
|
|
2044
|
+
message: () => string;
|
|
2045
|
+
};
|
|
2046
|
+
|
|
2047
|
+
/**
|
|
2048
|
+
* toMatchToolSnapshot Matcher
|
|
2049
|
+
*
|
|
2050
|
+
* Validates that a response matches a saved snapshot.
|
|
2051
|
+
* Uses Playwright's native snapshot testing functionality.
|
|
2052
|
+
*/
|
|
2053
|
+
|
|
2054
|
+
/**
|
|
2055
|
+
* Matcher implementation for toMatchToolSnapshot; resolves to `pass` plus a `message` callback
|
|
2056
|
+
*
|
|
2057
|
+
* Note: This is an async matcher that uses Playwright's snapshot testing.
|
|
2058
|
+
*/
|
|
2059
|
+
declare function toMatchToolSnapshot(this: {
|
|
2060
|
+
isNot: boolean;
|
|
2061
|
+
}, received: unknown, name: string, sanitizers?: SnapshotSanitizer[]): Promise<{
|
|
2062
|
+
pass: boolean;
|
|
2063
|
+
message: () => string;
|
|
2064
|
+
}>;
|
|
2065
|
+
|
|
2066
|
+
/**
|
|
2067
|
+
* toBeToolError Matcher
|
|
2068
|
+
*
|
|
2069
|
+
* Validates that a response is (or is not) an error.
|
|
2070
|
+
*/
|
|
2071
|
+
/**
|
|
2072
|
+
* Matcher implementation for toBeToolError; returns `pass` plus a `message` callback
|
|
2073
|
+
*/
|
|
2074
|
+
declare function toBeToolError(this: {
|
|
2075
|
+
isNot: boolean;
|
|
2076
|
+
}, received: unknown, expected?: boolean | string | string[]): {
|
|
2077
|
+
pass: boolean;
|
|
2078
|
+
message: () => string;
|
|
2079
|
+
};
|
|
2080
|
+
|
|
2081
|
+
/**
|
|
2082
|
+
* toPassToolJudge Matcher
|
|
2083
|
+
*
|
|
2084
|
+
* Validates that a response passes LLM-as-judge evaluation.
|
|
2085
|
+
*/
|
|
2086
|
+
|
|
2087
|
+
/**
|
|
2088
|
+
* Matcher implementation for toPassToolJudge; resolves to `pass` plus a `message` callback
|
|
2089
|
+
*
|
|
2090
|
+
* Note: This is an async matcher that calls an LLM for evaluation.
|
|
2091
|
+
*/
|
|
2092
|
+
declare function toPassToolJudge(this: {
|
|
2093
|
+
isNot: boolean;
|
|
2094
|
+
}, received: unknown, rubric: string, options?: JudgeMatcherOptions): Promise<{
|
|
2095
|
+
pass: boolean;
|
|
2096
|
+
message: () => string;
|
|
2097
|
+
}>;
|
|
2098
|
+
|
|
2099
|
+
/**
|
|
2100
|
+
* toHaveToolResponseSize Matcher
|
|
2101
|
+
*
|
|
2102
|
+
* Validates that a response meets size constraints.
|
|
2103
|
+
*/
|
|
2104
|
+
|
|
2105
|
+
/**
|
|
2106
|
+
* Matcher implementation for toHaveToolResponseSize; returns `pass` plus a `message` callback
|
|
2107
|
+
*/
|
|
2108
|
+
declare function toHaveToolResponseSize(this: {
|
|
2109
|
+
isNot: boolean;
|
|
2110
|
+
}, received: unknown, options: SizeValidatorOptions): {
|
|
2111
|
+
pass: boolean;
|
|
2112
|
+
message: () => string;
|
|
2113
|
+
};
|
|
2114
|
+
|
|
2115
|
+
/**
|
|
2116
|
+
* toSatisfyToolPredicate Matcher
|
|
2117
|
+
*
|
|
2118
|
+
* Validates that a response satisfies a custom predicate function.
|
|
2119
|
+
* This is an escape hatch for custom validation logic when built-in
|
|
2120
|
+
* matchers don't cover the use case.
|
|
2121
|
+
*/
|
|
2122
|
+
|
|
2123
|
+
/**
|
|
2124
|
+
* Matcher implementation for toSatisfyToolPredicate; resolves to `pass` plus a `message` callback
|
|
2125
|
+
*
|
|
2126
|
+
* This matcher allows custom validation logic via a predicate function.
|
|
2127
|
+
* The predicate receives both the raw response and extracted text.
|
|
2128
|
+
*
|
|
2129
|
+
* @example
|
|
2130
|
+
* ```typescript
|
|
2131
|
+
* // Simple boolean predicate
|
|
2132
|
+
* await expect(result).toSatisfyToolPredicate((response) => {
|
|
2133
|
+
* return response.data?.length > 0;
|
|
2134
|
+
* });
|
|
2135
|
+
*
|
|
2136
|
+
* // Predicate with custom message
|
|
2137
|
+
* await expect(result).toSatisfyToolPredicate((response, text) => {
|
|
2138
|
+
* const hasTemperature = text.includes('temperature');
|
|
2139
|
+
* return {
|
|
2140
|
+
* pass: hasTemperature,
|
|
2141
|
+
* message: hasTemperature
|
|
2142
|
+
* ? 'Found temperature in response'
|
|
2143
|
+
* : 'Expected response to contain temperature',
|
|
2144
|
+
* };
|
|
2145
|
+
* });
|
|
2146
|
+
*
|
|
2147
|
+
* // Async predicate
|
|
2148
|
+
* await expect(result).toSatisfyToolPredicate(async (response) => {
|
|
2149
|
+
* const isValid = await validateWithExternalService(response);
|
|
2150
|
+
* return isValid;
|
|
2151
|
+
* });
|
|
2152
|
+
* ```
|
|
2153
|
+
*/
|
|
2154
|
+
declare function toSatisfyToolPredicate(this: {
|
|
2155
|
+
isNot: boolean;
|
|
2156
|
+
}, received: unknown, predicate: ToolPredicate, description?: string): Promise<{
|
|
2157
|
+
pass: boolean;
|
|
2158
|
+
message: () => string;
|
|
2159
|
+
}>;
|
|
2160
|
+
|
|
2161
|
+
/**
|
|
2162
|
+
* Extended Playwright expect with MCP tool matchers
|
|
2163
|
+
*
|
|
2164
|
+
* @example
|
|
2165
|
+
* ```typescript
|
|
2166
|
+
* import { expect } from '@gleanwork/mcp-server-tester';
|
|
2167
|
+
*
|
|
2168
|
+
* test('weather tool', async ({ mcp }) => {
|
|
2169
|
+
* const result = await mcp.callTool('get_weather', { city: 'London' });
|
|
2170
|
+
*
|
|
2171
|
+
* expect(result).toContainToolText('temperature');
|
|
2172
|
+
* expect(result).toMatchToolSchema(WeatherSchema);
|
|
2173
|
+
* expect(result).not.toBeToolError();
|
|
2174
|
+
* });
|
|
2175
|
+
* ```
|
|
2176
|
+
*/
|
|
2177
|
+
declare const expect: playwright_test.Expect<{
|
|
2178
|
+
toMatchToolResponse: typeof toMatchToolResponse;
|
|
2179
|
+
toMatchToolSchema: typeof toMatchToolSchema;
|
|
2180
|
+
toContainToolText: typeof toContainToolText;
|
|
2181
|
+
toMatchToolPattern: typeof toMatchToolPattern;
|
|
2182
|
+
toMatchToolSnapshot: typeof toMatchToolSnapshot;
|
|
2183
|
+
toBeToolError: typeof toBeToolError;
|
|
2184
|
+
toPassToolJudge: typeof toPassToolJudge;
|
|
2185
|
+
toHaveToolResponseSize: typeof toHaveToolResponseSize;
|
|
2186
|
+
toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
|
|
2187
|
+
}>;
|
|
2188
|
+
|
|
2189
|
+
/**
|
|
2190
|
+
* Internal fixture state for passing auth type between fixtures
|
|
2191
|
+
*/
|
|
2192
|
+
interface MCPFixtureState {
|
|
2193
|
+
/**
|
|
2194
|
+
* The resolved authentication type (may differ from config if CLI tokens are used)
|
|
2195
|
+
*/
|
|
2196
|
+
resolvedAuthType: AuthType;
|
|
2197
|
+
}
|
|
2198
|
+
/**
|
|
2199
|
+
* Extended test fixtures for MCP testing
|
|
2200
|
+
*/
|
|
2201
|
+
type MCPFixtures = {
|
|
2202
|
+
/**
|
|
2203
|
+
* Raw MCP client instance (automatically connected and cleaned up)
|
|
2204
|
+
*/
|
|
2205
|
+
mcpClient: Client;
|
|
2206
|
+
/**
|
|
2207
|
+
* High-level MCP API for tests
|
|
2208
|
+
*/
|
|
2209
|
+
mcp: MCPFixtureApi;
|
|
2210
|
+
/**
|
|
2211
|
+
* Internal fixture state (not for external use)
|
|
2212
|
+
*/
|
|
2213
|
+
_mcpFixtureState: MCPFixtureState;
|
|
2214
|
+
};
|
|
2215
|
+
/**
|
|
2216
|
+
* Extended Playwright test with MCP fixtures
|
|
2217
|
+
*
|
|
2218
|
+
* @example
|
|
2219
|
+
* import { test, expect } from '@gleanwork/mcp-server-tester';
|
|
2220
|
+
*
|
|
2221
|
+
* test('lists tools from MCP server', async ({ mcp }) => {
|
|
2222
|
+
* const tools = await mcp.listTools();
|
|
2223
|
+
* expect(tools.length).toBeGreaterThan(0);
|
|
2224
|
+
* });
|
|
2225
|
+
*/
|
|
2226
|
+
declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
|
|
2227
|
+
|
|
2228
|
+
/**
|
|
2229
|
+
* Types and interfaces for LLM host simulation mode
|
|
2230
|
+
*
|
|
2231
|
+
* This module provides types for testing MCP servers through LLM hosts,
|
|
2232
|
+
* validating tool descriptions, parameter clarity, and discoverability.
|
|
2233
|
+
*/
|
|
2234
|
+
|
|
2235
|
+
/**
|
|
2236
|
+
* LLM provider for host simulation
|
|
2237
|
+
*/
|
|
2238
|
+
type LLMProvider = 'openai' | 'anthropic';
|
|
2239
|
+
/**
|
|
2240
|
+
* Configuration for LLM host simulation
|
|
2241
|
+
*/
|
|
2242
|
+
interface LLMHostConfig {
|
|
2243
|
+
/**
|
|
2244
|
+
* LLM provider to use
|
|
2245
|
+
*/
|
|
2246
|
+
provider: LLMProvider;
|
|
2247
|
+
/**
|
|
2248
|
+
* Environment variable name containing the API key
|
|
2249
|
+
* @default 'OPENAI_API_KEY' for openai, 'ANTHROPIC_API_KEY' for anthropic
|
|
2250
|
+
*/
|
|
2251
|
+
apiKeyEnvVar?: string;
|
|
2252
|
+
/**
|
|
2253
|
+
* Model to use
|
|
2254
|
+
* @default 'gpt-4' for openai, 'claude-3-5-sonnet-20241022' for anthropic
|
|
2255
|
+
*/
|
|
2256
|
+
model?: string;
|
|
2257
|
+
/**
|
|
2258
|
+
* Maximum tokens for response
|
|
2259
|
+
*/
|
|
2260
|
+
maxTokens?: number;
|
|
2261
|
+
/**
|
|
2262
|
+
* Temperature (0-1, lower is more deterministic)
|
|
2263
|
+
* @default 0.0
|
|
2264
|
+
*/
|
|
2265
|
+
temperature?: number;
|
|
2266
|
+
/**
|
|
2267
|
+
* Maximum number of tool calls to allow in a single conversation
|
|
2268
|
+
* @default 10
|
|
2269
|
+
*/
|
|
2270
|
+
maxToolCalls?: number;
|
|
2271
|
+
}
|
|
2272
|
+
/**
|
|
2273
|
+
* A tool call made by the LLM
|
|
2274
|
+
*/
|
|
2275
|
+
interface LLMToolCall {
|
|
2276
|
+
/**
|
|
2277
|
+
* Tool name
|
|
2278
|
+
*/
|
|
2279
|
+
name: string;
|
|
2280
|
+
/**
|
|
2281
|
+
* Tool arguments (as provided by LLM)
|
|
2282
|
+
*/
|
|
2283
|
+
arguments: Record<string, unknown>;
|
|
2284
|
+
/**
|
|
2285
|
+
* Optional tool call ID (for tracking)
|
|
2286
|
+
*/
|
|
2287
|
+
id?: string;
|
|
2288
|
+
}
|
|
2289
|
+
/**
|
|
2290
|
+
* Result of a tool call validation
|
|
2291
|
+
*/
|
|
2292
|
+
interface ToolCallValidationResult {
|
|
2293
|
+
/**
|
|
2294
|
+
* Whether the tool call was valid
|
|
2295
|
+
*/
|
|
2296
|
+
valid: boolean;
|
|
2297
|
+
/**
|
|
2298
|
+
* List of actual tool calls made
|
|
2299
|
+
*/
|
|
2300
|
+
actualCalls: Array<LLMToolCall>;
|
|
2301
|
+
/**
|
|
2302
|
+
* Expected tool calls (if specified in eval case)
|
|
2303
|
+
*/
|
|
2304
|
+
expectedCalls?: Array<LLMToolCall>;
|
|
2305
|
+
/**
|
|
2306
|
+
* Details about validation (e.g., missing calls, incorrect arguments)
|
|
2307
|
+
*/
|
|
2308
|
+
details?: string;
|
|
2309
|
+
}
|
|
2310
|
+
/**
|
|
2311
|
+
* Result from an LLM host simulation
|
|
2312
|
+
*/
|
|
2313
|
+
interface LLMHostSimulationResult {
|
|
2314
|
+
/**
|
|
2315
|
+
* Whether the simulation succeeded
|
|
2316
|
+
*/
|
|
2317
|
+
success: boolean;
|
|
2318
|
+
/**
|
|
2319
|
+
* Tool calls made by the LLM
|
|
2320
|
+
*/
|
|
2321
|
+
toolCalls: Array<LLMToolCall>;
|
|
2322
|
+
/**
|
|
2323
|
+
* Final response from the LLM
|
|
2324
|
+
*/
|
|
2325
|
+
response?: string;
|
|
2326
|
+
/**
|
|
2327
|
+
* Error message if simulation failed
|
|
2328
|
+
*/
|
|
2329
|
+
error?: string;
|
|
2330
|
+
/**
|
|
2331
|
+
* Full conversation history (for debugging)
|
|
2332
|
+
*/
|
|
2333
|
+
conversationHistory?: Array<{
|
|
2334
|
+
role: 'user' | 'assistant' | 'tool';
|
|
2335
|
+
content: string;
|
|
2336
|
+
}>;
|
|
2337
|
+
}
|
|
2338
|
+
/**
|
|
2339
|
+
* Interface for LLM host simulators
|
|
2340
|
+
*
|
|
2341
|
+
* Implementations communicate with MCP servers via the actual MCP protocol
|
|
2342
|
+
*/
|
|
2343
|
+
interface LLMHostSimulator {
|
|
2344
|
+
/**
|
|
2345
|
+
* Simulates an LLM host interacting with an MCP server
|
|
2346
|
+
*
|
|
2347
|
+
* @param mcp - MCP fixture API
|
|
2348
|
+
* @param scenario - Natural language prompt describing what the LLM should do
|
|
2349
|
+
* @param config - LLM host configuration
|
|
2350
|
+
* @returns Simulation result with tool calls and response
|
|
2351
|
+
*/
|
|
2352
|
+
simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
|
|
2353
|
+
}
|
|
2354
|
+
/**
|
|
2355
|
+
* Expected tool call specification (for validation)
|
|
2356
|
+
*/
|
|
2357
|
+
interface ExpectedToolCall {
|
|
2358
|
+
/**
|
|
2359
|
+
* Tool name
|
|
2360
|
+
*/
|
|
2361
|
+
name: string;
|
|
2362
|
+
/**
|
|
2363
|
+
* Expected arguments (partial match)
|
|
2364
|
+
*/
|
|
2365
|
+
arguments?: Record<string, unknown>;
|
|
2366
|
+
/**
|
|
2367
|
+
* Whether this call is required
|
|
2368
|
+
* @default true
|
|
2369
|
+
*/
|
|
2370
|
+
required?: boolean;
|
|
2371
|
+
}
|
|
2372
|
+
|
|
2373
|
+
/**
|
|
2374
|
+
* Evaluation mode
|
|
2375
|
+
*/
|
|
2376
|
+
type EvalMode = 'direct' | 'llm_host';
|
|
2377
|
+
/**
|
|
2378
|
+
* A single eval test case
|
|
2379
|
+
*
|
|
2380
|
+
* For 'direct' mode: toolName and args are required
|
|
2381
|
+
* For 'llm_host' mode: scenario is required; llmHostConfig is optional (defaults come from the test environment)
|
|
2382
|
+
*/
|
|
2383
|
+
interface EvalCase {
|
|
2384
|
+
/**
|
|
2385
|
+
* Unique identifier for this test case
|
|
2386
|
+
*/
|
|
2387
|
+
id: string;
|
|
2388
|
+
/**
|
|
2389
|
+
* Human-readable description of what this test case validates
|
|
2390
|
+
*/
|
|
2391
|
+
description?: string;
|
|
2392
|
+
/**
|
|
2393
|
+
* Evaluation mode
|
|
2394
|
+
* - 'direct': Direct API calls to MCP tools (default)
|
|
2395
|
+
* - 'llm_host': LLM-driven tool selection via natural language
|
|
2396
|
+
*
|
|
2397
|
+
* @default 'direct'
|
|
2398
|
+
*/
|
|
2399
|
+
mode?: EvalMode;
|
|
2400
|
+
/**
|
|
2401
|
+
* Name of the MCP tool to call (required for 'direct' mode, optional for 'llm_host' mode)
|
|
2402
|
+
*/
|
|
2403
|
+
toolName?: string;
|
|
2404
|
+
/**
|
|
2405
|
+
* Arguments to pass to the tool (required for 'direct' mode, optional for 'llm_host' mode)
|
|
2406
|
+
*/
|
|
2407
|
+
args?: Record<string, unknown>;
|
|
2408
|
+
/**
|
|
2409
|
+
* Natural language scenario for the LLM to execute (required for 'llm_host' mode)
|
|
2410
|
+
*
|
|
2411
|
+
* @example "Get the weather for London and tell me if I need an umbrella"
|
|
2412
|
+
*/
|
|
2413
|
+
scenario?: string;
|
|
2414
|
+
/**
|
|
2415
|
+
* LLM host configuration (optional for 'llm_host' mode)
|
|
2416
|
+
*
|
|
2417
|
+
* If not specified, uses default configuration from test environment
|
|
2418
|
+
*/
|
|
2419
|
+
llmHostConfig?: LLMHostConfig;
|
|
2420
|
+
/**
|
|
2421
|
+
* Additional metadata for this test case
|
|
2422
|
+
*
|
|
2423
|
+
* For 'llm_host' mode, can include 'expectedToolCalls' for validation
|
|
2424
|
+
*/
|
|
2425
|
+
metadata?: Record<string, unknown>;
|
|
2426
|
+
/**
|
|
2427
|
+
* Expectations to validate against the tool response
|
|
2428
|
+
*
|
|
2429
|
+
* Multiple expectations can be combined and will all be validated.
|
|
2430
|
+
*
|
|
2431
|
+
* @example
|
|
2432
|
+
* ```json
|
|
2433
|
+
* {
|
|
2434
|
+
* "id": "weather-london",
|
|
2435
|
+
* "toolName": "get_weather",
|
|
2436
|
+
* "args": { "city": "London" },
|
|
2437
|
+
* "expect": {
|
|
2438
|
+
* "containsText": ["temperature", "conditions"],
|
|
2439
|
+
* "schema": "WeatherResponse",
|
|
2440
|
+
* "responseSize": { "maxBytes": 10000 },
|
|
2441
|
+
* "isError": false
|
|
2442
|
+
* }
|
|
2443
|
+
* }
|
|
2444
|
+
* ```
|
|
2445
|
+
*/
|
|
2446
|
+
expect?: EvalExpectBlock;
|
|
2447
|
+
}
|
|
2448
|
+
/**
|
|
2449
|
+
* Unified expectation block for eval cases
|
|
2450
|
+
*
|
|
2451
|
+
* Mirrors the Playwright matcher API for consistency.
|
|
2452
|
+
*/
|
|
2453
|
+
interface EvalExpectBlock {
|
|
2454
|
+
/**
|
|
2455
|
+
* Exact response match (toMatchToolResponse)
|
|
2456
|
+
*/
|
|
2457
|
+
response?: unknown;
|
|
2458
|
+
/**
|
|
2459
|
+
* Name of schema to validate against (toMatchToolSchema)
|
|
2460
|
+
*/
|
|
2461
|
+
schema?: string;
|
|
2462
|
+
/**
|
|
2463
|
+
* Text substring(s) that must be present (toContainToolText)
|
|
2464
|
+
*/
|
|
2465
|
+
containsText?: string | string[];
|
|
2466
|
+
/**
|
|
2467
|
+
* Regex pattern(s) that must match (toMatchToolPattern)
|
|
2468
|
+
*/
|
|
2469
|
+
matchesPattern?: string | string[];
|
|
2470
|
+
/**
|
|
2471
|
+
* Snapshot name for comparison (toMatchToolSnapshot)
|
|
2472
|
+
*/
|
|
2473
|
+
snapshot?: string;
|
|
2474
|
+
/**
|
|
2475
|
+
* Snapshot sanitizers to apply
|
|
2476
|
+
*/
|
|
2477
|
+
snapshotSanitizers?: SnapshotSanitizer[];
|
|
2478
|
+
/**
|
|
2479
|
+
* Error expectation (toBeToolError)
|
|
2480
|
+
* - true: expects any error
|
|
2481
|
+
* - false: expects no error
|
|
2482
|
+
* - string | string[]: expects error containing the given message(s)
|
|
2483
|
+
*/
|
|
2484
|
+
isError?: boolean | string | string[];
|
|
2485
|
+
/**
|
|
2486
|
+
* LLM-as-judge evaluation (toPassToolJudge)
|
|
2487
|
+
*/
|
|
2488
|
+
passesJudge?: {
|
|
2489
|
+
/** Evaluation rubric/criteria */
|
|
2490
|
+
rubric: string;
|
|
2491
|
+
/** Reference response to compare against */
|
|
2492
|
+
reference?: unknown;
|
|
2493
|
+
/** Score threshold for passing (0-1, default: 0.7) */
|
|
2494
|
+
threshold?: number;
|
|
2495
|
+
/** Judge configuration ID */
|
|
2496
|
+
configId?: string;
|
|
2497
|
+
};
|
|
2498
|
+
/**
|
|
2499
|
+
* Response size validation (toHaveToolResponseSize)
|
|
2500
|
+
*/
|
|
2501
|
+
responseSize?: {
|
|
2502
|
+
/** Maximum allowed size in bytes */
|
|
2503
|
+
maxBytes?: number;
|
|
2504
|
+
/** Minimum required size in bytes */
|
|
2505
|
+
minBytes?: number;
|
|
2506
|
+
};
|
|
2507
|
+
}
|
|
2508
|
+
/**
|
|
2509
|
+
* A complete eval dataset containing multiple test cases
|
|
2510
|
+
*/
|
|
2511
|
+
interface EvalDataset {
|
|
2512
|
+
/**
|
|
2513
|
+
* Dataset name
|
|
2514
|
+
*/
|
|
2515
|
+
name: string;
|
|
2516
|
+
/**
|
|
2517
|
+
* Dataset description
|
|
2518
|
+
*/
|
|
2519
|
+
description?: string;
|
|
2520
|
+
/**
|
|
2521
|
+
* Test cases in this dataset
|
|
2522
|
+
*/
|
|
2523
|
+
cases: Array<EvalCase>;
|
|
2524
|
+
/**
|
|
2525
|
+
* Optional schema definitions referenced by test cases
|
|
2526
|
+
*/
|
|
2527
|
+
schemas?: Record<string, z.ZodSchema>;
|
|
2528
|
+
/**
|
|
2529
|
+
* Additional dataset metadata
|
|
2530
|
+
*/
|
|
2531
|
+
metadata?: Record<string, unknown>;
|
|
2532
|
+
}
|
|
2533
|
+
/**
|
|
2534
|
+
* Zod schema for EvalCase
|
|
2535
|
+
*
|
|
2536
|
+
* toolName and args are optional at the schema level, since llm_host mode uses scenario instead
|
|
2537
|
+
*/
|
|
2538
|
+
declare const EvalCaseSchema: z.ZodObject<{
|
|
2539
|
+
id: z.ZodString;
|
|
2540
|
+
description: z.ZodOptional<z.ZodString>;
|
|
2541
|
+
mode: z.ZodOptional<z.ZodEnum<["direct", "llm_host"]>>;
|
|
2542
|
+
toolName: z.ZodOptional<z.ZodString>;
|
|
2543
|
+
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2544
|
+
scenario: z.ZodOptional<z.ZodString>;
|
|
2545
|
+
llmHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2546
|
+
provider: z.ZodEnum<["openai", "anthropic"]>;
|
|
2547
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2548
|
+
model: z.ZodOptional<z.ZodString>;
|
|
2549
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2550
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2551
|
+
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
2552
|
+
}, "strip", z.ZodTypeAny, {
|
|
2553
|
+
provider: "anthropic" | "openai";
|
|
2554
|
+
model?: string | undefined;
|
|
2555
|
+
maxTokens?: number | undefined;
|
|
2556
|
+
apiKeyEnvVar?: string | undefined;
|
|
2557
|
+
temperature?: number | undefined;
|
|
2558
|
+
maxToolCalls?: number | undefined;
|
|
2559
|
+
}, {
|
|
2560
|
+
provider: "anthropic" | "openai";
|
|
2561
|
+
model?: string | undefined;
|
|
2562
|
+
maxTokens?: number | undefined;
|
|
2563
|
+
apiKeyEnvVar?: string | undefined;
|
|
2564
|
+
temperature?: number | undefined;
|
|
2565
|
+
maxToolCalls?: number | undefined;
|
|
2566
|
+
}>>;
|
|
2567
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2568
|
+
expect: z.ZodOptional<z.ZodObject<{
|
|
2569
|
+
response: z.ZodOptional<z.ZodUnknown>;
|
|
2570
|
+
schema: z.ZodOptional<z.ZodString>;
|
|
2571
|
+
containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2572
|
+
matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2573
|
+
snapshot: z.ZodOptional<z.ZodString>;
|
|
2574
|
+
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["timestamp", "uuid", "iso-date", "objectId", "jwt"]>, z.ZodObject<{
|
|
2575
|
+
pattern: z.ZodString;
|
|
2576
|
+
replacement: z.ZodOptional<z.ZodString>;
|
|
2577
|
+
}, "strip", z.ZodTypeAny, {
|
|
2578
|
+
pattern: string;
|
|
2579
|
+
replacement?: string | undefined;
|
|
2580
|
+
}, {
|
|
2581
|
+
pattern: string;
|
|
2582
|
+
replacement?: string | undefined;
|
|
2583
|
+
}>, z.ZodObject<{
|
|
2584
|
+
remove: z.ZodArray<z.ZodString, "many">;
|
|
2585
|
+
}, "strip", z.ZodTypeAny, {
|
|
2586
|
+
remove: string[];
|
|
2587
|
+
}, {
|
|
2588
|
+
remove: string[];
|
|
2589
|
+
}>]>, "many">>;
|
|
2590
|
+
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2591
|
+
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2592
|
+
rubric: z.ZodString;
|
|
2593
|
+
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2594
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2595
|
+
configId: z.ZodOptional<z.ZodString>;
|
|
2596
|
+
}, "strip", z.ZodTypeAny, {
|
|
2597
|
+
rubric: string;
|
|
2598
|
+
reference?: unknown;
|
|
2599
|
+
threshold?: number | undefined;
|
|
2600
|
+
configId?: string | undefined;
|
|
2601
|
+
}, {
|
|
2602
|
+
rubric: string;
|
|
2603
|
+
reference?: unknown;
|
|
2604
|
+
threshold?: number | undefined;
|
|
2605
|
+
configId?: string | undefined;
|
|
2606
|
+
}>>;
|
|
2607
|
+
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2608
|
+
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
2609
|
+
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
2610
|
+
}, "strip", z.ZodTypeAny, {
|
|
2611
|
+
maxBytes?: number | undefined;
|
|
2612
|
+
minBytes?: number | undefined;
|
|
2613
|
+
}, {
|
|
2614
|
+
maxBytes?: number | undefined;
|
|
2615
|
+
minBytes?: number | undefined;
|
|
2616
|
+
}>>;
|
|
2617
|
+
}, "strip", z.ZodTypeAny, {
|
|
2618
|
+
isError?: string | boolean | string[] | undefined;
|
|
2619
|
+
schema?: string | undefined;
|
|
2620
|
+
snapshot?: string | undefined;
|
|
2621
|
+
response?: unknown;
|
|
2622
|
+
containsText?: string | string[] | undefined;
|
|
2623
|
+
matchesPattern?: string | string[] | undefined;
|
|
2624
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2625
|
+
pattern: string;
|
|
2626
|
+
replacement?: string | undefined;
|
|
2627
|
+
} | {
|
|
2628
|
+
remove: string[];
|
|
2629
|
+
})[] | undefined;
|
|
2630
|
+
passesJudge?: {
|
|
2631
|
+
rubric: string;
|
|
2632
|
+
reference?: unknown;
|
|
2633
|
+
threshold?: number | undefined;
|
|
2634
|
+
configId?: string | undefined;
|
|
2635
|
+
} | undefined;
|
|
2636
|
+
responseSize?: {
|
|
2637
|
+
maxBytes?: number | undefined;
|
|
2638
|
+
minBytes?: number | undefined;
|
|
2639
|
+
} | undefined;
|
|
2640
|
+
}, {
|
|
2641
|
+
isError?: string | boolean | string[] | undefined;
|
|
2642
|
+
schema?: string | undefined;
|
|
2643
|
+
snapshot?: string | undefined;
|
|
2644
|
+
response?: unknown;
|
|
2645
|
+
containsText?: string | string[] | undefined;
|
|
2646
|
+
matchesPattern?: string | string[] | undefined;
|
|
2647
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2648
|
+
pattern: string;
|
|
2649
|
+
replacement?: string | undefined;
|
|
2650
|
+
} | {
|
|
2651
|
+
remove: string[];
|
|
2652
|
+
})[] | undefined;
|
|
2653
|
+
passesJudge?: {
|
|
2654
|
+
rubric: string;
|
|
2655
|
+
reference?: unknown;
|
|
2656
|
+
threshold?: number | undefined;
|
|
2657
|
+
configId?: string | undefined;
|
|
2658
|
+
} | undefined;
|
|
2659
|
+
responseSize?: {
|
|
2660
|
+
maxBytes?: number | undefined;
|
|
2661
|
+
minBytes?: number | undefined;
|
|
2662
|
+
} | undefined;
|
|
2663
|
+
}>>;
|
|
2664
|
+
}, "strip", z.ZodTypeAny, {
|
|
2665
|
+
id: string;
|
|
2666
|
+
args?: Record<string, unknown> | undefined;
|
|
2667
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2668
|
+
mode?: "direct" | "llm_host" | undefined;
|
|
2669
|
+
description?: string | undefined;
|
|
2670
|
+
toolName?: string | undefined;
|
|
2671
|
+
scenario?: string | undefined;
|
|
2672
|
+
llmHostConfig?: {
|
|
2673
|
+
provider: "anthropic" | "openai";
|
|
2674
|
+
model?: string | undefined;
|
|
2675
|
+
maxTokens?: number | undefined;
|
|
2676
|
+
apiKeyEnvVar?: string | undefined;
|
|
2677
|
+
temperature?: number | undefined;
|
|
2678
|
+
maxToolCalls?: number | undefined;
|
|
2679
|
+
} | undefined;
|
|
2680
|
+
expect?: {
|
|
2681
|
+
isError?: string | boolean | string[] | undefined;
|
|
2682
|
+
schema?: string | undefined;
|
|
2683
|
+
snapshot?: string | undefined;
|
|
2684
|
+
response?: unknown;
|
|
2685
|
+
containsText?: string | string[] | undefined;
|
|
2686
|
+
matchesPattern?: string | string[] | undefined;
|
|
2687
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2688
|
+
pattern: string;
|
|
2689
|
+
replacement?: string | undefined;
|
|
2690
|
+
} | {
|
|
2691
|
+
remove: string[];
|
|
2692
|
+
})[] | undefined;
|
|
2693
|
+
passesJudge?: {
|
|
2694
|
+
rubric: string;
|
|
2695
|
+
reference?: unknown;
|
|
2696
|
+
threshold?: number | undefined;
|
|
2697
|
+
configId?: string | undefined;
|
|
2698
|
+
} | undefined;
|
|
2699
|
+
responseSize?: {
|
|
2700
|
+
maxBytes?: number | undefined;
|
|
2701
|
+
minBytes?: number | undefined;
|
|
2702
|
+
} | undefined;
|
|
2703
|
+
} | undefined;
|
|
2704
|
+
}, {
|
|
2705
|
+
id: string;
|
|
2706
|
+
args?: Record<string, unknown> | undefined;
|
|
2707
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2708
|
+
mode?: "direct" | "llm_host" | undefined;
|
|
2709
|
+
description?: string | undefined;
|
|
2710
|
+
toolName?: string | undefined;
|
|
2711
|
+
scenario?: string | undefined;
|
|
2712
|
+
llmHostConfig?: {
|
|
2713
|
+
provider: "anthropic" | "openai";
|
|
2714
|
+
model?: string | undefined;
|
|
2715
|
+
maxTokens?: number | undefined;
|
|
2716
|
+
apiKeyEnvVar?: string | undefined;
|
|
2717
|
+
temperature?: number | undefined;
|
|
2718
|
+
maxToolCalls?: number | undefined;
|
|
2719
|
+
} | undefined;
|
|
2720
|
+
expect?: {
|
|
2721
|
+
isError?: string | boolean | string[] | undefined;
|
|
2722
|
+
schema?: string | undefined;
|
|
2723
|
+
snapshot?: string | undefined;
|
|
2724
|
+
response?: unknown;
|
|
2725
|
+
containsText?: string | string[] | undefined;
|
|
2726
|
+
matchesPattern?: string | string[] | undefined;
|
|
2727
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2728
|
+
pattern: string;
|
|
2729
|
+
replacement?: string | undefined;
|
|
2730
|
+
} | {
|
|
2731
|
+
remove: string[];
|
|
2732
|
+
})[] | undefined;
|
|
2733
|
+
passesJudge?: {
|
|
2734
|
+
rubric: string;
|
|
2735
|
+
reference?: unknown;
|
|
2736
|
+
threshold?: number | undefined;
|
|
2737
|
+
configId?: string | undefined;
|
|
2738
|
+
} | undefined;
|
|
2739
|
+
responseSize?: {
|
|
2740
|
+
maxBytes?: number | undefined;
|
|
2741
|
+
minBytes?: number | undefined;
|
|
2742
|
+
} | undefined;
|
|
2743
|
+
} | undefined;
|
|
2744
|
+
}>;
|
|
2745
|
+
/**
|
|
2746
|
+
* Zod schema for EvalDataset (without schemas field, as schemas aren't serializable)
|
|
2747
|
+
*/
|
|
2748
|
+
declare const EvalDatasetSchema: z.ZodObject<{
|
|
2749
|
+
name: z.ZodString;
|
|
2750
|
+
description: z.ZodOptional<z.ZodString>;
|
|
2751
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
2752
|
+
id: z.ZodString;
|
|
2753
|
+
description: z.ZodOptional<z.ZodString>;
|
|
2754
|
+
mode: z.ZodOptional<z.ZodEnum<["direct", "llm_host"]>>;
|
|
2755
|
+
toolName: z.ZodOptional<z.ZodString>;
|
|
2756
|
+
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2757
|
+
scenario: z.ZodOptional<z.ZodString>;
|
|
2758
|
+
llmHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2759
|
+
provider: z.ZodEnum<["openai", "anthropic"]>;
|
|
2760
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2761
|
+
model: z.ZodOptional<z.ZodString>;
|
|
2762
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2763
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2764
|
+
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
2765
|
+
}, "strip", z.ZodTypeAny, {
|
|
2766
|
+
provider: "anthropic" | "openai";
|
|
2767
|
+
model?: string | undefined;
|
|
2768
|
+
maxTokens?: number | undefined;
|
|
2769
|
+
apiKeyEnvVar?: string | undefined;
|
|
2770
|
+
temperature?: number | undefined;
|
|
2771
|
+
maxToolCalls?: number | undefined;
|
|
2772
|
+
}, {
|
|
2773
|
+
provider: "anthropic" | "openai";
|
|
2774
|
+
model?: string | undefined;
|
|
2775
|
+
maxTokens?: number | undefined;
|
|
2776
|
+
apiKeyEnvVar?: string | undefined;
|
|
2777
|
+
temperature?: number | undefined;
|
|
2778
|
+
maxToolCalls?: number | undefined;
|
|
2779
|
+
}>>;
|
|
2780
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2781
|
+
expect: z.ZodOptional<z.ZodObject<{
|
|
2782
|
+
response: z.ZodOptional<z.ZodUnknown>;
|
|
2783
|
+
schema: z.ZodOptional<z.ZodString>;
|
|
2784
|
+
containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2785
|
+
matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2786
|
+
snapshot: z.ZodOptional<z.ZodString>;
|
|
2787
|
+
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["timestamp", "uuid", "iso-date", "objectId", "jwt"]>, z.ZodObject<{
|
|
2788
|
+
pattern: z.ZodString;
|
|
2789
|
+
replacement: z.ZodOptional<z.ZodString>;
|
|
2790
|
+
}, "strip", z.ZodTypeAny, {
|
|
2791
|
+
pattern: string;
|
|
2792
|
+
replacement?: string | undefined;
|
|
2793
|
+
}, {
|
|
2794
|
+
pattern: string;
|
|
2795
|
+
replacement?: string | undefined;
|
|
2796
|
+
}>, z.ZodObject<{
|
|
2797
|
+
remove: z.ZodArray<z.ZodString, "many">;
|
|
2798
|
+
}, "strip", z.ZodTypeAny, {
|
|
2799
|
+
remove: string[];
|
|
2800
|
+
}, {
|
|
2801
|
+
remove: string[];
|
|
2802
|
+
}>]>, "many">>;
|
|
2803
|
+
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2804
|
+
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2805
|
+
rubric: z.ZodString;
|
|
2806
|
+
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2807
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2808
|
+
configId: z.ZodOptional<z.ZodString>;
|
|
2809
|
+
}, "strip", z.ZodTypeAny, {
|
|
2810
|
+
rubric: string;
|
|
2811
|
+
reference?: unknown;
|
|
2812
|
+
threshold?: number | undefined;
|
|
2813
|
+
configId?: string | undefined;
|
|
2814
|
+
}, {
|
|
2815
|
+
rubric: string;
|
|
2816
|
+
reference?: unknown;
|
|
2817
|
+
threshold?: number | undefined;
|
|
2818
|
+
configId?: string | undefined;
|
|
2819
|
+
}>>;
|
|
2820
|
+
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2821
|
+
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
2822
|
+
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
2823
|
+
}, "strip", z.ZodTypeAny, {
|
|
2824
|
+
maxBytes?: number | undefined;
|
|
2825
|
+
minBytes?: number | undefined;
|
|
2826
|
+
}, {
|
|
2827
|
+
maxBytes?: number | undefined;
|
|
2828
|
+
minBytes?: number | undefined;
|
|
2829
|
+
}>>;
|
|
2830
|
+
}, "strip", z.ZodTypeAny, {
|
|
2831
|
+
isError?: string | boolean | string[] | undefined;
|
|
2832
|
+
schema?: string | undefined;
|
|
2833
|
+
snapshot?: string | undefined;
|
|
2834
|
+
response?: unknown;
|
|
2835
|
+
containsText?: string | string[] | undefined;
|
|
2836
|
+
matchesPattern?: string | string[] | undefined;
|
|
2837
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2838
|
+
pattern: string;
|
|
2839
|
+
replacement?: string | undefined;
|
|
2840
|
+
} | {
|
|
2841
|
+
remove: string[];
|
|
2842
|
+
})[] | undefined;
|
|
2843
|
+
passesJudge?: {
|
|
2844
|
+
rubric: string;
|
|
2845
|
+
reference?: unknown;
|
|
2846
|
+
threshold?: number | undefined;
|
|
2847
|
+
configId?: string | undefined;
|
|
2848
|
+
} | undefined;
|
|
2849
|
+
responseSize?: {
|
|
2850
|
+
maxBytes?: number | undefined;
|
|
2851
|
+
minBytes?: number | undefined;
|
|
2852
|
+
} | undefined;
|
|
2853
|
+
}, {
|
|
2854
|
+
isError?: string | boolean | string[] | undefined;
|
|
2855
|
+
schema?: string | undefined;
|
|
2856
|
+
snapshot?: string | undefined;
|
|
2857
|
+
response?: unknown;
|
|
2858
|
+
containsText?: string | string[] | undefined;
|
|
2859
|
+
matchesPattern?: string | string[] | undefined;
|
|
2860
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2861
|
+
pattern: string;
|
|
2862
|
+
replacement?: string | undefined;
|
|
2863
|
+
} | {
|
|
2864
|
+
remove: string[];
|
|
2865
|
+
})[] | undefined;
|
|
2866
|
+
passesJudge?: {
|
|
2867
|
+
rubric: string;
|
|
2868
|
+
reference?: unknown;
|
|
2869
|
+
threshold?: number | undefined;
|
|
2870
|
+
configId?: string | undefined;
|
|
2871
|
+
} | undefined;
|
|
2872
|
+
responseSize?: {
|
|
2873
|
+
maxBytes?: number | undefined;
|
|
2874
|
+
minBytes?: number | undefined;
|
|
2875
|
+
} | undefined;
|
|
2876
|
+
}>>;
|
|
2877
|
+
}, "strip", z.ZodTypeAny, {
|
|
2878
|
+
id: string;
|
|
2879
|
+
args?: Record<string, unknown> | undefined;
|
|
2880
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2881
|
+
mode?: "direct" | "llm_host" | undefined;
|
|
2882
|
+
description?: string | undefined;
|
|
2883
|
+
toolName?: string | undefined;
|
|
2884
|
+
scenario?: string | undefined;
|
|
2885
|
+
llmHostConfig?: {
|
|
2886
|
+
provider: "anthropic" | "openai";
|
|
2887
|
+
model?: string | undefined;
|
|
2888
|
+
maxTokens?: number | undefined;
|
|
2889
|
+
apiKeyEnvVar?: string | undefined;
|
|
2890
|
+
temperature?: number | undefined;
|
|
2891
|
+
maxToolCalls?: number | undefined;
|
|
2892
|
+
} | undefined;
|
|
2893
|
+
expect?: {
|
|
2894
|
+
isError?: string | boolean | string[] | undefined;
|
|
2895
|
+
schema?: string | undefined;
|
|
2896
|
+
snapshot?: string | undefined;
|
|
2897
|
+
response?: unknown;
|
|
2898
|
+
containsText?: string | string[] | undefined;
|
|
2899
|
+
matchesPattern?: string | string[] | undefined;
|
|
2900
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2901
|
+
pattern: string;
|
|
2902
|
+
replacement?: string | undefined;
|
|
2903
|
+
} | {
|
|
2904
|
+
remove: string[];
|
|
2905
|
+
})[] | undefined;
|
|
2906
|
+
passesJudge?: {
|
|
2907
|
+
rubric: string;
|
|
2908
|
+
reference?: unknown;
|
|
2909
|
+
threshold?: number | undefined;
|
|
2910
|
+
configId?: string | undefined;
|
|
2911
|
+
} | undefined;
|
|
2912
|
+
responseSize?: {
|
|
2913
|
+
maxBytes?: number | undefined;
|
|
2914
|
+
minBytes?: number | undefined;
|
|
2915
|
+
} | undefined;
|
|
2916
|
+
} | undefined;
|
|
2917
|
+
}, {
|
|
2918
|
+
id: string;
|
|
2919
|
+
args?: Record<string, unknown> | undefined;
|
|
2920
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2921
|
+
mode?: "direct" | "llm_host" | undefined;
|
|
2922
|
+
description?: string | undefined;
|
|
2923
|
+
toolName?: string | undefined;
|
|
2924
|
+
scenario?: string | undefined;
|
|
2925
|
+
llmHostConfig?: {
|
|
2926
|
+
provider: "anthropic" | "openai";
|
|
2927
|
+
model?: string | undefined;
|
|
2928
|
+
maxTokens?: number | undefined;
|
|
2929
|
+
apiKeyEnvVar?: string | undefined;
|
|
2930
|
+
temperature?: number | undefined;
|
|
2931
|
+
maxToolCalls?: number | undefined;
|
|
2932
|
+
} | undefined;
|
|
2933
|
+
expect?: {
|
|
2934
|
+
isError?: string | boolean | string[] | undefined;
|
|
2935
|
+
schema?: string | undefined;
|
|
2936
|
+
snapshot?: string | undefined;
|
|
2937
|
+
response?: unknown;
|
|
2938
|
+
containsText?: string | string[] | undefined;
|
|
2939
|
+
matchesPattern?: string | string[] | undefined;
|
|
2940
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2941
|
+
pattern: string;
|
|
2942
|
+
replacement?: string | undefined;
|
|
2943
|
+
} | {
|
|
2944
|
+
remove: string[];
|
|
2945
|
+
})[] | undefined;
|
|
2946
|
+
passesJudge?: {
|
|
2947
|
+
rubric: string;
|
|
2948
|
+
reference?: unknown;
|
|
2949
|
+
threshold?: number | undefined;
|
|
2950
|
+
configId?: string | undefined;
|
|
2951
|
+
} | undefined;
|
|
2952
|
+
responseSize?: {
|
|
2953
|
+
maxBytes?: number | undefined;
|
|
2954
|
+
minBytes?: number | undefined;
|
|
2955
|
+
} | undefined;
|
|
2956
|
+
} | undefined;
|
|
2957
|
+
}>, "many">;
|
|
2958
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2959
|
+
}, "strip", z.ZodTypeAny, {
|
|
2960
|
+
name: string;
|
|
2961
|
+
cases: {
|
|
2962
|
+
id: string;
|
|
2963
|
+
args?: Record<string, unknown> | undefined;
|
|
2964
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2965
|
+
mode?: "direct" | "llm_host" | undefined;
|
|
2966
|
+
description?: string | undefined;
|
|
2967
|
+
toolName?: string | undefined;
|
|
2968
|
+
scenario?: string | undefined;
|
|
2969
|
+
llmHostConfig?: {
|
|
2970
|
+
provider: "anthropic" | "openai";
|
|
2971
|
+
model?: string | undefined;
|
|
2972
|
+
maxTokens?: number | undefined;
|
|
2973
|
+
apiKeyEnvVar?: string | undefined;
|
|
2974
|
+
temperature?: number | undefined;
|
|
2975
|
+
maxToolCalls?: number | undefined;
|
|
2976
|
+
} | undefined;
|
|
2977
|
+
expect?: {
|
|
2978
|
+
isError?: string | boolean | string[] | undefined;
|
|
2979
|
+
schema?: string | undefined;
|
|
2980
|
+
snapshot?: string | undefined;
|
|
2981
|
+
response?: unknown;
|
|
2982
|
+
containsText?: string | string[] | undefined;
|
|
2983
|
+
matchesPattern?: string | string[] | undefined;
|
|
2984
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
2985
|
+
pattern: string;
|
|
2986
|
+
replacement?: string | undefined;
|
|
2987
|
+
} | {
|
|
2988
|
+
remove: string[];
|
|
2989
|
+
})[] | undefined;
|
|
2990
|
+
passesJudge?: {
|
|
2991
|
+
rubric: string;
|
|
2992
|
+
reference?: unknown;
|
|
2993
|
+
threshold?: number | undefined;
|
|
2994
|
+
configId?: string | undefined;
|
|
2995
|
+
} | undefined;
|
|
2996
|
+
responseSize?: {
|
|
2997
|
+
maxBytes?: number | undefined;
|
|
2998
|
+
minBytes?: number | undefined;
|
|
2999
|
+
} | undefined;
|
|
3000
|
+
} | undefined;
|
|
3001
|
+
}[];
|
|
3002
|
+
metadata?: Record<string, unknown> | undefined;
|
|
3003
|
+
description?: string | undefined;
|
|
3004
|
+
}, {
|
|
3005
|
+
name: string;
|
|
3006
|
+
cases: {
|
|
3007
|
+
id: string;
|
|
3008
|
+
args?: Record<string, unknown> | undefined;
|
|
3009
|
+
metadata?: Record<string, unknown> | undefined;
|
|
3010
|
+
mode?: "direct" | "llm_host" | undefined;
|
|
3011
|
+
description?: string | undefined;
|
|
3012
|
+
toolName?: string | undefined;
|
|
3013
|
+
scenario?: string | undefined;
|
|
3014
|
+
llmHostConfig?: {
|
|
3015
|
+
provider: "anthropic" | "openai";
|
|
3016
|
+
model?: string | undefined;
|
|
3017
|
+
maxTokens?: number | undefined;
|
|
3018
|
+
apiKeyEnvVar?: string | undefined;
|
|
3019
|
+
temperature?: number | undefined;
|
|
3020
|
+
maxToolCalls?: number | undefined;
|
|
3021
|
+
} | undefined;
|
|
3022
|
+
expect?: {
|
|
3023
|
+
isError?: string | boolean | string[] | undefined;
|
|
3024
|
+
schema?: string | undefined;
|
|
3025
|
+
snapshot?: string | undefined;
|
|
3026
|
+
response?: unknown;
|
|
3027
|
+
containsText?: string | string[] | undefined;
|
|
3028
|
+
matchesPattern?: string | string[] | undefined;
|
|
3029
|
+
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
3030
|
+
pattern: string;
|
|
3031
|
+
replacement?: string | undefined;
|
|
3032
|
+
} | {
|
|
3033
|
+
remove: string[];
|
|
3034
|
+
})[] | undefined;
|
|
3035
|
+
passesJudge?: {
|
|
3036
|
+
rubric: string;
|
|
3037
|
+
reference?: unknown;
|
|
3038
|
+
threshold?: number | undefined;
|
|
3039
|
+
configId?: string | undefined;
|
|
3040
|
+
} | undefined;
|
|
3041
|
+
responseSize?: {
|
|
3042
|
+
maxBytes?: number | undefined;
|
|
3043
|
+
minBytes?: number | undefined;
|
|
3044
|
+
} | undefined;
|
|
3045
|
+
} | undefined;
|
|
3046
|
+
}[];
|
|
3047
|
+
metadata?: Record<string, unknown> | undefined;
|
|
3048
|
+
description?: string | undefined;
|
|
3049
|
+
}>;
|
|
3050
|
+
/**
|
|
3051
|
+
* Type for serialized eval dataset (without Zod schemas)
|
|
3052
|
+
*/
|
|
3053
|
+
type SerializedEvalDataset = z.infer<typeof EvalDatasetSchema>;
|
|
3054
|
+
/**
|
|
3055
|
+
* Validates an eval case
|
|
3056
|
+
*
|
|
3057
|
+
* @param evalCase - The eval case to validate
|
|
3058
|
+
* @returns The validated eval case
|
|
3059
|
+
* @throws {z.ZodError} If validation fails
|
|
3060
|
+
*/
|
|
3061
|
+
declare function validateEvalCase(evalCase: unknown): EvalCase;
|
|
3062
|
+
/**
|
|
3063
|
+
* Validates a serialized eval dataset
|
|
3064
|
+
*
|
|
3065
|
+
* @param dataset - The dataset to validate
|
|
3066
|
+
* @returns The validated dataset
|
|
3067
|
+
* @throws {z.ZodError} If validation fails
|
|
3068
|
+
*/
|
|
3069
|
+
declare function validateEvalDataset(dataset: unknown): SerializedEvalDataset;
|
|
3070
|
+
|
|
3071
|
+
/**
|
|
3072
|
+
* Options for loading an eval dataset
|
|
3073
|
+
*/
|
|
3074
|
+
interface LoadDatasetOptions {
|
|
3075
|
+
/**
|
|
3076
|
+
* Optional schema definitions to attach to the dataset
|
|
3077
|
+
*
|
|
3078
|
+
* Keys should match the schema names referenced by expect.schema in eval cases
|
|
3079
|
+
*/
|
|
3080
|
+
schemas?: Record<string, z.ZodSchema>;
|
|
3081
|
+
/**
|
|
3082
|
+
* Whether to validate the loaded dataset
|
|
3083
|
+
* @default true
|
|
3084
|
+
*/
|
|
3085
|
+
validate?: boolean;
|
|
3086
|
+
}
|
|
3087
|
+
/**
|
|
3088
|
+
* Loads an eval dataset from a JSON file
|
|
3089
|
+
*
|
|
3090
|
+
* @param filePath - Absolute path to the JSON file
|
|
3091
|
+
* @param options - Load options
|
|
3092
|
+
* @returns The loaded and validated dataset
|
|
3093
|
+
* @throws {Error} If file cannot be read or JSON is invalid
|
|
3094
|
+
* @throws {z.ZodError} If validation fails
|
|
3095
|
+
*
|
|
3096
|
+
* @example
|
|
3097
|
+
* const dataset = await loadEvalDataset('./data/my-evals.json', {
|
|
3098
|
+
* schemas: {
|
|
3099
|
+
* 'weather-response': WeatherResponseSchema,
|
|
3100
|
+
* },
|
|
3101
|
+
* });
|
|
3102
|
+
*/
|
|
3103
|
+
declare function loadEvalDataset(filePath: string, options?: LoadDatasetOptions): Promise<EvalDataset>;
|
|
3104
|
+
/**
|
|
3105
|
+
* Loads an eval dataset from a plain object
|
|
3106
|
+
*
|
|
3107
|
+
* Useful for programmatically creating datasets in tests
|
|
3108
|
+
*
|
|
3109
|
+
* @param data - The dataset data
|
|
3110
|
+
* @param options - Load options
|
|
3111
|
+
* @returns The loaded and validated dataset
|
|
3112
|
+
* @throws {z.ZodError} If validation fails
|
|
3113
|
+
*
|
|
3114
|
+
* @example
|
|
3115
|
+
* const dataset = loadEvalDatasetFromObject({
|
|
3116
|
+
* name: 'my-test-dataset',
|
|
3117
|
+
* cases: [
|
|
3118
|
+
* {
|
|
3119
|
+
* id: 'case-1',
|
|
3120
|
+
* toolName: 'get_weather',
|
|
3121
|
+
* args: { city: 'London' },
|
|
3122
|
+
* },
|
|
3123
|
+
* ],
|
|
3124
|
+
* });
|
|
3125
|
+
*/
|
|
3126
|
+
declare function loadEvalDatasetFromObject(data: unknown, options?: LoadDatasetOptions): EvalDataset;
|
|
3127
|
+
|
|
3128
|
+
/**
|
|
3129
|
+
* Context passed to the eval runner
|
|
3130
|
+
*/
|
|
3131
|
+
interface EvalContext {
|
|
3132
|
+
/**
|
|
3133
|
+
* MCP fixture API for interacting with the server
|
|
3134
|
+
*/
|
|
3135
|
+
mcp: MCPFixtureApi;
|
|
3136
|
+
/**
|
|
3137
|
+
* Optional Playwright TestInfo for reporter integration
|
|
3138
|
+
* When provided, eval results will be attached to the test for the MCP reporter
|
|
3139
|
+
*/
|
|
3140
|
+
testInfo?: TestInfo;
|
|
3141
|
+
/**
|
|
3142
|
+
* Optional Playwright expect function for snapshot testing
|
|
3143
|
+
* Required for snapshot expectations to work properly
|
|
3144
|
+
*/
|
|
3145
|
+
expect?: Expect;
|
|
3146
|
+
}
|
|
3147
|
+
|
|
3148
|
+
/**
|
|
3149
|
+
* Result of a single eval case
|
|
3150
|
+
*/
|
|
3151
|
+
interface EvalCaseResult$1 {
|
|
3152
|
+
/**
|
|
3153
|
+
* Case ID
|
|
3154
|
+
*/
|
|
3155
|
+
id: string;
|
|
3156
|
+
/**
|
|
3157
|
+
* Dataset name this case belongs to
|
|
3158
|
+
*/
|
|
3159
|
+
datasetName: string;
|
|
3160
|
+
/**
|
|
3161
|
+
* MCP tool name that was called
|
|
3162
|
+
*/
|
|
3163
|
+
toolName: string;
|
|
3164
|
+
/**
|
|
3165
|
+
* Evaluation mode (direct or llm_host)
|
|
3166
|
+
* @deprecated Mode is inferred from test context, not displayed in reports
|
|
3167
|
+
*/
|
|
3168
|
+
mode?: 'direct' | 'llm_host';
|
|
3169
|
+
/**
|
|
3170
|
+
* Source of this result
|
|
3171
|
+
* - 'eval': From runEvalDataset() using JSON eval datasets
|
|
3172
|
+
* - 'test': From direct API test tracking (MCP fixture calls)
|
|
3173
|
+
*/
|
|
3174
|
+
source: ResultSource;
|
|
3175
|
+
/**
|
|
3176
|
+
* Overall pass/fail status
|
|
3177
|
+
*/
|
|
3178
|
+
pass: boolean;
|
|
3179
|
+
/**
|
|
3180
|
+
* Tool response
|
|
3181
|
+
*/
|
|
3182
|
+
response?: unknown;
|
|
3183
|
+
/**
|
|
3184
|
+
* Error if tool call failed
|
|
3185
|
+
*/
|
|
3186
|
+
error?: string;
|
|
3187
|
+
/**
|
|
3188
|
+
* Expectation results
|
|
3189
|
+
*/
|
|
3190
|
+
expectations: Partial<Record<ExpectationType, EvalExpectationResult>>;
|
|
3191
|
+
/**
|
|
3192
|
+
* Authentication type used for this test
|
|
3193
|
+
*/
|
|
3194
|
+
authType?: AuthType;
|
|
3195
|
+
/**
|
|
3196
|
+
* Playwright project name this test belongs to
|
|
3197
|
+
* Used for filtering/grouping results by project in the reporter
|
|
3198
|
+
*/
|
|
3199
|
+
project?: string;
|
|
3200
|
+
/**
|
|
3201
|
+
* Execution time in milliseconds
|
|
3202
|
+
*/
|
|
3203
|
+
durationMs: number;
|
|
3204
|
+
}
|
|
3205
|
+
/**
|
|
3206
|
+
* Overall result of running an eval dataset
|
|
3207
|
+
*/
|
|
3208
|
+
interface EvalRunnerResult {
|
|
3209
|
+
/**
|
|
3210
|
+
* Total number of cases
|
|
3211
|
+
*/
|
|
3212
|
+
total: number;
|
|
3213
|
+
/**
|
|
3214
|
+
* Number of passing cases
|
|
3215
|
+
*/
|
|
3216
|
+
passed: number;
|
|
3217
|
+
/**
|
|
3218
|
+
* Number of failing cases
|
|
3219
|
+
*/
|
|
3220
|
+
failed: number;
|
|
3221
|
+
/**
|
|
3222
|
+
* Individual case results
|
|
3223
|
+
*/
|
|
3224
|
+
caseResults: Array<EvalCaseResult$1>;
|
|
3225
|
+
/**
|
|
3226
|
+
* Overall execution time in milliseconds
|
|
3227
|
+
*/
|
|
3228
|
+
durationMs: number;
|
|
3229
|
+
}
|
|
3230
|
+
/**
|
|
3231
|
+
* Options for running eval dataset
|
|
3232
|
+
*/
|
|
3233
|
+
interface EvalRunnerOptions {
|
|
3234
|
+
/**
|
|
3235
|
+
* The dataset to run
|
|
3236
|
+
*/
|
|
3237
|
+
dataset: EvalDataset;
|
|
3238
|
+
/**
|
|
3239
|
+
* Schema registry for schema validation by name
|
|
3240
|
+
*
|
|
3241
|
+
* Maps schema names to Zod schemas for use with expect.schema
|
|
3242
|
+
*
|
|
3243
|
+
* @example
|
|
3244
|
+
* ```typescript
|
|
3245
|
+
* {
|
|
3246
|
+
* schemas: {
|
|
3247
|
+
* WeatherResponse: z.object({ temperature: z.number() }),
|
|
3248
|
+
* ErrorResponse: z.object({ error: z.string() }),
|
|
3249
|
+
* }
|
|
3250
|
+
* }
|
|
3251
|
+
* ```
|
|
3252
|
+
*/
|
|
3253
|
+
schemas?: Record<string, ZodType>;
|
|
3254
|
+
/**
|
|
3255
|
+
* Judge configuration registry by ID
|
|
3256
|
+
*
|
|
3257
|
+
* Maps config IDs to JudgeConfig for use with expect.passesJudge.configId
|
|
3258
|
+
*/
|
|
3259
|
+
judgeConfigs?: Record<string, JudgeConfig>;
|
|
3260
|
+
/**
|
|
3261
|
+
* Whether to stop on first failure
|
|
3262
|
+
* @default false
|
|
3263
|
+
*/
|
|
3264
|
+
stopOnFailure?: boolean;
|
|
3265
|
+
/**
|
|
3266
|
+
* Optional callback called after each case
|
|
3267
|
+
*/
|
|
3268
|
+
onCaseComplete?: (result: EvalCaseResult$1) => void | Promise<void>;
|
|
3269
|
+
}
|
|
3270
|
+
/**
 * Options for running a single eval case
 */
interface EvalCaseOptions {
    /**
     * Dataset name for the result (defaults to 'single-case')
     */
    datasetName?: string;
    /**
     * Schema registry for schema validation by name
     */
    schemas?: Record<string, ZodType>;
    /**
     * Judge configuration registry by ID
     */
    judgeConfigs?: Record<string, JudgeConfig>;
}
|
|
3287
|
+
/**
 * Runs a single eval case and returns the result
 *
 * @param evalCase - The eval case to run
 * @param context - Context containing mcp, testInfo, expect
 * @param options - Optional configuration (datasetName, schemas, judgeConfigs)
 * @returns The result of running the eval case
 *
 * @example
 * ```typescript
 * const result = await runEvalCase(
 *   evalCase,
 *   { mcp, testInfo, expect },
 *   { schemas: { WeatherResponse: WeatherSchema } }
 * );
 *
 * expect(result.pass).toBe(true);
 * ```
 */
declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult$1>;
|
|
3307
|
+
/**
 * Runs an eval dataset against an MCP server
 *
 * This function composes runEvalCase() for each case in the dataset,
 * adding dataset-level features like stopOnFailure and callbacks.
 *
 * @param options - Eval runner options (dataset, schemas, judgeConfigs, stopOnFailure, onCaseComplete)
 * @param context - Eval context (mcp fixture, optional testInfo, optional expect)
 * @returns Eval results
 *
 * @example
 * ```typescript
 * // Basic usage
 * const result = await runEvalDataset(
 *   {
 *     dataset,
 *     schemas: { WeatherResponse: WeatherSchema },
 *   },
 *   { mcp }
 * );
 * ```
 *
 * @example
 * ```typescript
 * // With MCP reporter integration
 * test('eval dataset', async ({ mcp }, testInfo) => {
 *   const result = await runEvalDataset(
 *     { dataset },
 *     { mcp, testInfo } // testInfo enables MCP reporter
 *   );
 * });
 * ```
 */
declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
|
|
3337
|
+
|
|
3338
|
+
/**
 * LLM Host Simulation - Main entry point
 *
 * Provides the public API for simulating LLM hosts interacting
 * with MCP servers through actual LLM providers.
 */
|
|
3344
|
+
|
|
3345
|
+
/**
 * Simulates an LLM host interacting with an MCP server
 *
 * This function uses actual LLM providers (OpenAI or Anthropic) to test
 * MCP servers through natural language scenarios. The LLM chooses which
 * tools to call based on their descriptions, testing discoverability and
 * parameter clarity.
 *
 * @param mcp - MCP fixture API
 * @param scenario - Natural language prompt describing what to do
 * @param config - LLM host configuration
 * @returns Simulation result with tool calls and final response
 *
 * @example
 * ```typescript
 * const result = await simulateLLMHost(mcp,
 *   "Get the weather for London",
 *   {
 *     provider: 'openai',
 *     model: 'gpt-4o'
 *   }
 * );
 *
 * expect(result.success).toBe(true);
 * expect(result.toolCalls).toContainEqual({
 *   name: 'get_weather',
 *   arguments: { city: 'London' }
 * });
 * ```
 */
declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
|
|
3376
|
+
/**
 * Checks if the required SDK is available for a given provider
 *
 * This performs a quick check without actually loading the SDK.
 * The actual SDK loading happens in the adapter when simulation runs.
 *
 * @param provider - LLM provider to check
 * @returns true if an adapter is registered for the provider
 */
declare function isProviderAvailable(provider: LLMProvider): boolean;
|
|
3386
|
+
/**
 * Gets a helpful error message for missing dependencies
 *
 * @param provider - LLM provider
 * @returns Error message with installation instructions
 */
declare function getMissingDependencyMessage(provider: LLMProvider): string;
|
|
3393
|
+
|
|
3394
|
+
/**
 * Tool call validator for LLM host mode
 *
 * Validates that the LLM made the expected tool calls with correct arguments
 */
|
|
3399
|
+
|
|
3400
|
+
/**
 * Tool call validation function signature
 *
 * Takes the eval case and the (untyped) response, and resolves to an
 * expectation result.
 */
type ToolCallValidator = (evalCase: EvalCase, response: unknown) => Promise<EvalExpectationResult>;
|
|
3404
|
+
/**
 * Creates a tool call validator for LLM host mode
 *
 * Validates that the LLM made the expected tool calls with correct arguments.
 * Supports partial argument matching and optional calls.
 *
 * @returns Validator function
 *
 * @example
 * ```typescript
 * // In your eval case:
 * {
 *   "id": "weather-london",
 *   "mode": "llm_host",
 *   "scenario": "Get the weather for London",
 *   "expectedToolCalls": [
 *     {
 *       "name": "get_weather",
 *       "arguments": { "city": "London" },
 *       "required": true
 *     }
 *   ]
 * }
 * ```
 */
declare function createToolCallValidator(): ToolCallValidator;
|
|
3430
|
+
|
|
3431
|
+
/**
 * Creates an LLM judge for evaluating tool responses
 *
 * Uses Claude Agent SDK for evaluation with usage metrics tracking.
 *
 * @param config - Judge configuration
 * @returns Judge instance
 * @throws {Error} If provider is unsupported or configuration is invalid
 *
 * @example
 * ```typescript
 * // Default Claude judge
 * const judge = createJudge();
 * ```
 *
 * @example
 * ```typescript
 * // With configuration
 * const judge = createJudge({
 *   model: 'claude-sonnet-4-20250514',
 *   maxToolOutputSize: 50000, // Fail if response > 50KB
 *   maxBudgetUsd: 0.05,
 * });
 *
 * // Evaluate a response
 * const result = await judge.evaluate(
 *   candidateResponse,
 *   referenceResponse,
 *   'Evaluate for accuracy and completeness'
 * );
 *
 * // Access usage metrics
 * console.log('Cost:', result.usage?.totalCostUsd);
 * console.log('Tokens:', result.usage?.inputTokens, result.usage?.outputTokens);
 * ```
 */
declare function createJudge(config?: JudgeConfig): Judge;
|
|
3464
|
+
|
|
3465
|
+
/**
 * Options for conformance checks
 */
interface MCPConformanceOptions {
    /**
     * List of tools that must be present
     */
    requiredTools?: Array<string>;
    /**
     * Whether to validate tool schemas
     * @default true
     */
    validateSchemas?: boolean;
    /**
     * Whether to check server info is present
     * @default true
     */
    checkServerInfo?: boolean;
    /**
     * Whether to check resources capability (if declared by server)
     * @default true
     */
    checkResources?: boolean;
    /**
     * Whether to check prompts capability (if declared by server)
     * @default true
     */
    checkPrompts?: boolean;
}
|
|
3494
|
+
/**
 * Individual check result
 */
interface MCPConformanceCheck$1 {
    /** Check name */
    name: string;
    /** Whether the check passed */
    pass: boolean;
    /** Message describing the result */
    message: string;
}
|
|
3502
|
+
/**
 * Raw MCP responses for snapshotting
 */
interface MCPConformanceRaw {
    /**
     * Server info (name, version)
     * null if not available
     */
    serverInfo: Implementation | null;
    /**
     * Server capabilities
     * null if not available
     */
    capabilities: ServerCapabilities | null;
    /**
     * List of tools from the server
     */
    tools: Tool[];
    /**
     * List of resources from the server
     * null if server doesn't declare resources capability
     */
    resources: Resource[] | null;
    /**
     * List of prompts from the server
     * null if server doesn't declare prompts capability
     */
    prompts: Prompt[] | null;
}
|
|
3531
|
+
/**
 * Result of conformance checks
 */
interface MCPConformanceResult {
    /**
     * Whether all checks passed
     */
    pass: boolean;
    /**
     * List of check results
     */
    checks: MCPConformanceCheck$1[];
    /**
     * Raw MCP responses for snapshotting
     *
     * @example
     * ```typescript
     * const result = await runConformanceChecks(mcp);
     * expect(result.raw.tools).toMatchSnapshot();
     * expect(result.raw.capabilities).toMatchSnapshot();
     * ```
     */
    raw: MCPConformanceRaw;
}
|
|
3555
|
+
/**
 * Runs MCP protocol conformance checks
 *
 * Validates that the MCP server conforms to expected protocol behavior.
 * Returns both assertion results and raw MCP responses for snapshotting.
 *
 * When testInfo is provided, results are automatically attached for the MCP reporter.
 *
 * @param mcp - MCP fixture API
 * @param options - Conformance check options
 * @param testInfo - Optional Playwright TestInfo for reporter integration
 * @returns Conformance check results with raw responses
 *
 * @example
 * ```typescript
 * // Basic usage
 * const result = await runConformanceChecks(mcp, {
 *   requiredTools: ['get_weather', 'search_docs'],
 *   validateSchemas: true,
 * });
 *
 * // Check assertions
 * expect(result.pass).toBe(true);
 *
 * // With reporter integration (recommended in Playwright tests)
 * const result = await runConformanceChecks(mcp, {
 *   requiredTools: ['search'],
 * }, testInfo);
 *
 * // Snapshot raw responses
 * expect(result.raw.tools).toMatchSnapshot();
 * expect(result.raw.capabilities).toMatchSnapshot();
 * ```
 */
declare function runConformanceChecks(mcp: MCPFixtureApi, options?: MCPConformanceOptions, testInfo?: TestInfo): Promise<MCPConformanceResult>;
|
|
3590
|
+
|
|
3591
|
+
/**
 * Reporter-specific type definitions
 *
 * These types are used by the MCP reporter and UI.
 *
 * @packageDocumentation
 */
|
|
3598
|
+
|
|
3599
|
+
/**
 * Individual conformance check result
 */
interface MCPConformanceCheck {
    /**
     * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
     */
    name: string;
    /**
     * Whether the check passed
     */
    pass: boolean;
    /**
     * Human-readable message describing the result
     */
    message: string;
}
|
|
3616
|
+
/**
 * Conformance check result as stored in reporter data
 */
interface MCPConformanceResultData {
    /**
     * Test title where conformance check was run
     */
    testTitle: string;
    /**
     * Whether all checks passed
     */
    pass: boolean;
    /**
     * Individual check results
     */
    checks: MCPConformanceCheck[];
    /**
     * Server info if available
     */
    serverInfo?: {
        name?: string;
        version?: string;
    };
    /**
     * Number of tools discovered
     */
    toolCount: number;
    /**
     * Auth type used for this check
     */
    authType?: AuthType;
    /**
     * Project name
     */
    project?: string;
}
|
|
3652
|
+
/**
 * Server capabilities data from mcp-list-tools attachment
 */
interface MCPServerCapabilitiesData {
    /**
     * Test title where listTools was called
     */
    testTitle: string;
    /**
     * List of tools available on the server
     */
    tools: Array<{
        name: string;
        description?: string;
    }>;
    /**
     * Total number of tools
     */
    toolCount: number;
    /**
     * Auth type used for this test
     */
    authType?: AuthType;
    /**
     * Project name
     */
    project?: string;
}
|
|
3680
|
+
/**
 * Result of a single eval case
 */
interface EvalCaseResult {
    /**
     * Case ID
     */
    id: string;
    /**
     * Dataset name this case belongs to
     */
    datasetName: string;
    /**
     * MCP tool name that was called
     */
    toolName: string;
    /**
     * Source of this result
     */
    source: ResultSource;
    /**
     * Overall pass/fail status
     */
    pass: boolean;
    /**
     * Tool response
     */
    response?: unknown;
    /**
     * Error if tool call failed
     */
    error?: string;
    /**
     * Expectation results, keyed by expectation type.
     * The map is partial: not every expectation type has an entry.
     */
    expectations: Partial<Record<ExpectationType, EvalExpectationResult>>;
    /**
     * Authentication type used for this test
     */
    authType?: AuthType;
    /**
     * Playwright project name this test belongs to
     */
    project?: string;
    /**
     * Execution time in milliseconds
     */
    durationMs: number;
    /**
     * @deprecated Mode is inferred from test context, not displayed in reports
     */
    mode?: 'direct' | 'llm_host';
}
|
|
3733
|
+
/**
 * Aggregated MCP eval run data
 */
interface MCPEvalRunData {
    /**
     * Run timestamp (ISO 8601)
     */
    timestamp: string;
    /**
     * Total duration in milliseconds
     */
    durationMs: number;
    /**
     * Environment info
     */
    environment: {
        ci: boolean;
        node: string;
        platform: string;
    };
    /**
     * Aggregate metrics
     */
    metrics: {
        /**
         * Total number of eval cases
         */
        total: number;
        /**
         * Number of passed cases
         */
        passed: number;
        /**
         * Number of failed cases
         */
        failed: number;
        /**
         * Pass rate (0-1)
         */
        passRate: number;
        /**
         * Dataset breakdown: dataset name -> count
         */
        datasetBreakdown: Record<string, number>;
        /**
         * Expectation type breakdown
         */
        expectationBreakdown: ExpectationBreakdown;
    };
    /**
     * All eval results from this run
     */
    results: EvalCaseResult[];
    /**
     * Conformance check results (optional)
     */
    conformanceChecks?: MCPConformanceResultData[];
    /**
     * Server capabilities discovered via listTools (optional)
     */
    serverCapabilities?: MCPServerCapabilitiesData[];
}
|
|
3795
|
+
/**
 * Historical summary for trend charts
 *
 * NOTE(review): the fields below are undocumented in the original; the
 * per-field notes assume they mirror the same-named fields of
 * MCPEvalRunData / MCPEvalRunData.metrics — confirm.
 */
interface MCPEvalHistoricalSummary {
    /** Run timestamp (presumably ISO 8601 — confirm) */
    timestamp: string;
    /** Total number of eval cases */
    total: number;
    /** Number of passed cases */
    passed: number;
    /** Number of failed cases */
    failed: number;
    /** Pass rate (presumably 0-1 — confirm) */
    passRate: number;
    /** Duration in milliseconds */
    durationMs: number;
}
|
|
3806
|
+
/**
 * Complete data structure passed to UI
 */
interface MCPEvalData {
    /** Aggregated eval run data */
    runData: MCPEvalRunData;
    /** Historical summaries (used for trend charts) */
    historical: MCPEvalHistoricalSummary[];
}
|
|
3813
|
+
|
|
3814
|
+
/**
 * Reporter types - re-exported from canonical source
 *
 * This module re-exports types from the canonical types module for backwards compatibility.
 * All type definitions now live in src/types/.
 *
 * @packageDocumentation
 */
|
|
3822
|
+
|
|
3823
|
+
/**
 * Configuration options for MCP Eval Reporter
 */
interface MCPEvalReporterConfig {
    /**
     * Output directory for reports and historical data
     * @default '.mcp-test-results'
     */
    outputDir?: string;
    /**
     * Auto-open report in browser after test run
     * @default true (disabled in CI)
     */
    autoOpen?: boolean;
    /**
     * Number of historical runs to keep
     * @default 10
     */
    historyLimit?: number;
    /**
     * Suppress console output (report still generated)
     * @default false
     */
    quiet?: boolean;
    /**
     * Include auto-tracked MCP tool calls from tests without explicit eval results.
     * When true, any test using the MCP fixture will have its tool calls
     * included in the report, even without using runEvalCase/runEvalDataset.
     * When false, only tests with explicit eval results are included.
     * @default true
     */
    includeAutoTracking?: boolean;
}
|
|
3856
|
+
|
|
3857
|
+
export { type AuthType, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult$1 as EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type ExpectedToolCall, type FieldRemovalSanitizer, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck$1 as MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type SizeValidatorOptions, type SnapshotSanitizer, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallValidationResult, type ToolCallValidator, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, 
createTokenAuthHeaders, createToolCallValidator, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, extractText as extractTextFromResponse, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, normalizeToolResponse, normalizeWhitespace, performOAuthSetup, performOAuthSetupIfNeeded, runConformanceChecks, runEvalCase, runEvalDataset, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText };
|