@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +354 -37
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +721 -76
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +533 -116
- package/dist/index.d.ts +533 -116
- package/dist/index.js +719 -78
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +11 -6
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/index.js
CHANGED
|
@@ -18,7 +18,7 @@ import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
|
|
|
18
18
|
import { ProxyAgent, Agent } from 'undici';
|
|
19
19
|
import { readFileSync } from 'fs';
|
|
20
20
|
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
21
|
-
import { execFile } from 'child_process';
|
|
21
|
+
import { execFile, spawn } from 'child_process';
|
|
22
22
|
import { promisify } from 'util';
|
|
23
23
|
|
|
24
24
|
var __defProp = Object.defineProperty;
|
|
@@ -3279,7 +3279,11 @@ async function performOAuthSetup(config) {
|
|
|
3279
3279
|
const page = await context.newPage();
|
|
3280
3280
|
page.setDefaultTimeout(timeoutMs);
|
|
3281
3281
|
await page.goto(authorizationUrl.toString());
|
|
3282
|
-
|
|
3282
|
+
if ("customLoginFlow" in config && config.customLoginFlow) {
|
|
3283
|
+
await config.customLoginFlow(page);
|
|
3284
|
+
} else {
|
|
3285
|
+
await completeLoginForm(page, config);
|
|
3286
|
+
}
|
|
3283
3287
|
await page.waitForURL(
|
|
3284
3288
|
(url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
|
|
3285
3289
|
{ timeout: timeoutMs }
|
|
@@ -4380,7 +4384,7 @@ function escapeHtml(text) {
|
|
|
4380
4384
|
|
|
4381
4385
|
// package.json
|
|
4382
4386
|
var package_default = {
|
|
4383
|
-
version: "1.0.0
|
|
4387
|
+
version: "1.0.0"};
|
|
4384
4388
|
|
|
4385
4389
|
// src/mcp/clientFactory.ts
|
|
4386
4390
|
function getRetryAfterDelayMs(err) {
|
|
@@ -4599,6 +4603,17 @@ async function createMCPClientForConfig(config, options) {
|
|
|
4599
4603
|
}
|
|
4600
4604
|
async function closeMCPClient(client) {
|
|
4601
4605
|
try {
|
|
4606
|
+
const transport = client.transport;
|
|
4607
|
+
if (transport instanceof StreamableHTTPClientTransport) {
|
|
4608
|
+
try {
|
|
4609
|
+
await transport.terminateSession();
|
|
4610
|
+
} catch (sessionError) {
|
|
4611
|
+
debugClient(
|
|
4612
|
+
"Error terminating session: %s",
|
|
4613
|
+
sessionError instanceof Error ? sessionError.message : String(sessionError)
|
|
4614
|
+
);
|
|
4615
|
+
}
|
|
4616
|
+
}
|
|
4602
4617
|
await client.close();
|
|
4603
4618
|
} catch (error) {
|
|
4604
4619
|
debugClient(
|
|
@@ -4827,11 +4842,13 @@ function validateSchema(response, schema, options = {}) {
|
|
|
4827
4842
|
} catch (error) {
|
|
4828
4843
|
const zodError = error;
|
|
4829
4844
|
const issues = formatZodIssues(zodError);
|
|
4845
|
+
const text = stringifyResponse(response);
|
|
4830
4846
|
return {
|
|
4831
4847
|
pass: false,
|
|
4832
4848
|
message: `Response does not match schema: ${issues}`,
|
|
4833
4849
|
details: {
|
|
4834
|
-
issues: zodError.issues
|
|
4850
|
+
issues: zodError.issues,
|
|
4851
|
+
textPreview: truncateForDisplay2(text)
|
|
4835
4852
|
}
|
|
4836
4853
|
};
|
|
4837
4854
|
}
|
|
@@ -4884,6 +4901,12 @@ function formatZodIssues(error) {
|
|
|
4884
4901
|
});
|
|
4885
4902
|
return issues.join("; ");
|
|
4886
4903
|
}
|
|
4904
|
+
function truncateForDisplay2(str, maxLength = 200) {
|
|
4905
|
+
if (str.length <= maxLength) {
|
|
4906
|
+
return str;
|
|
4907
|
+
}
|
|
4908
|
+
return str.slice(0, maxLength) + "... (truncated)";
|
|
4909
|
+
}
|
|
4887
4910
|
|
|
4888
4911
|
// src/assertions/validators/text.ts
|
|
4889
4912
|
function validateText(response, expected, options = {}) {
|
|
@@ -4910,11 +4933,11 @@ function validateText(response, expected, options = {}) {
|
|
|
4910
4933
|
details: {
|
|
4911
4934
|
missing,
|
|
4912
4935
|
textLength: text.length,
|
|
4913
|
-
textPreview:
|
|
4936
|
+
textPreview: truncateForDisplay3(text)
|
|
4914
4937
|
}
|
|
4915
4938
|
};
|
|
4916
4939
|
}
|
|
4917
|
-
function
|
|
4940
|
+
function truncateForDisplay3(str, maxLength = 200) {
|
|
4918
4941
|
if (str.length <= maxLength) {
|
|
4919
4942
|
return str;
|
|
4920
4943
|
}
|
|
@@ -4946,7 +4969,7 @@ function validatePattern(response, patterns, options = {}) {
|
|
|
4946
4969
|
details: {
|
|
4947
4970
|
unmatched,
|
|
4948
4971
|
textLength: text.length,
|
|
4949
|
-
textPreview:
|
|
4972
|
+
textPreview: truncateForDisplay4(text)
|
|
4950
4973
|
}
|
|
4951
4974
|
};
|
|
4952
4975
|
}
|
|
@@ -4966,7 +4989,7 @@ function patternToString(pattern) {
|
|
|
4966
4989
|
}
|
|
4967
4990
|
return `/${pattern}/`;
|
|
4968
4991
|
}
|
|
4969
|
-
function
|
|
4992
|
+
function truncateForDisplay4(str, maxLength = 200) {
|
|
4970
4993
|
if (str.length <= maxLength) {
|
|
4971
4994
|
return str;
|
|
4972
4995
|
}
|
|
@@ -4989,7 +5012,7 @@ function validateError(response, expected = true) {
|
|
|
4989
5012
|
pass: false,
|
|
4990
5013
|
message: "Expected an error response but got success",
|
|
4991
5014
|
details: {
|
|
4992
|
-
textPreview:
|
|
5015
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
4993
5016
|
}
|
|
4994
5017
|
};
|
|
4995
5018
|
} else {
|
|
@@ -5001,7 +5024,7 @@ function validateError(response, expected = true) {
|
|
|
5001
5024
|
}
|
|
5002
5025
|
return {
|
|
5003
5026
|
pass: false,
|
|
5004
|
-
message: `Expected a success response but got error: "${
|
|
5027
|
+
message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
|
|
5005
5028
|
details: {
|
|
5006
5029
|
errorMessage
|
|
5007
5030
|
}
|
|
@@ -5014,7 +5037,7 @@ function validateError(response, expected = true) {
|
|
|
5014
5037
|
pass: false,
|
|
5015
5038
|
message: `Expected an error containing "${expectedMessages[0]}" but got success`,
|
|
5016
5039
|
details: {
|
|
5017
|
-
textPreview:
|
|
5040
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5018
5041
|
}
|
|
5019
5042
|
};
|
|
5020
5043
|
}
|
|
@@ -5036,7 +5059,7 @@ function validateError(response, expected = true) {
|
|
|
5036
5059
|
}
|
|
5037
5060
|
};
|
|
5038
5061
|
}
|
|
5039
|
-
function
|
|
5062
|
+
function truncateForDisplay5(str, maxLength = 200) {
|
|
5040
5063
|
if (str.length <= maxLength) {
|
|
5041
5064
|
return str;
|
|
5042
5065
|
}
|
|
@@ -5097,9 +5120,17 @@ function formatBytes(bytes) {
|
|
|
5097
5120
|
function isSimulationResult(value) {
|
|
5098
5121
|
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
5099
5122
|
}
|
|
5123
|
+
function isPatternMatcher(v) {
|
|
5124
|
+
return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
|
|
5125
|
+
}
|
|
5100
5126
|
function partialMatch(actual, expected) {
|
|
5101
5127
|
return Object.entries(expected).every(([k, v]) => {
|
|
5102
5128
|
const actualVal = actual[k];
|
|
5129
|
+
if (isPatternMatcher(v)) {
|
|
5130
|
+
if (typeof actualVal !== "string") return false;
|
|
5131
|
+
const re = new RegExp(v.$pattern, v.$flags);
|
|
5132
|
+
return re.test(actualVal);
|
|
5133
|
+
}
|
|
5103
5134
|
if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
|
|
5104
5135
|
return partialMatch(
|
|
5105
5136
|
actualVal,
|
|
@@ -5146,6 +5177,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5146
5177
|
return {
|
|
5147
5178
|
pass: false,
|
|
5148
5179
|
message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
|
|
5180
|
+
details: {
|
|
5181
|
+
actual: actual.map((c) => c.name),
|
|
5182
|
+
expected: expected.name
|
|
5183
|
+
},
|
|
5149
5184
|
metrics
|
|
5150
5185
|
};
|
|
5151
5186
|
}
|
|
@@ -5162,6 +5197,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5162
5197
|
return {
|
|
5163
5198
|
pass: false,
|
|
5164
5199
|
message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
|
|
5200
|
+
details: {
|
|
5201
|
+
actual: actual.map((c) => c.name),
|
|
5202
|
+
expected: expected.name
|
|
5203
|
+
},
|
|
5165
5204
|
metrics
|
|
5166
5205
|
};
|
|
5167
5206
|
}
|
|
@@ -5174,6 +5213,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5174
5213
|
return {
|
|
5175
5214
|
pass: false,
|
|
5176
5215
|
message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
|
|
5216
|
+
details: {
|
|
5217
|
+
actual: actual.map((c) => c.name),
|
|
5218
|
+
unexpected: unexpected.map((c) => c.name)
|
|
5219
|
+
},
|
|
5177
5220
|
metrics
|
|
5178
5221
|
};
|
|
5179
5222
|
}
|
|
@@ -5192,19 +5235,22 @@ function validateToolCallCount(response, options) {
|
|
|
5192
5235
|
if (exact !== void 0 && count !== exact) {
|
|
5193
5236
|
return {
|
|
5194
5237
|
pass: false,
|
|
5195
|
-
message: `Expected exactly ${exact} tool call(s), but got ${count}
|
|
5238
|
+
message: `Expected exactly ${exact} tool call(s), but got ${count}`,
|
|
5239
|
+
details: { actual: count, expected: exact }
|
|
5196
5240
|
};
|
|
5197
5241
|
}
|
|
5198
5242
|
if (min !== void 0 && count < min) {
|
|
5199
5243
|
return {
|
|
5200
5244
|
pass: false,
|
|
5201
|
-
message: `Expected at least ${min} tool call(s), but got ${count}
|
|
5245
|
+
message: `Expected at least ${min} tool call(s), but got ${count}`,
|
|
5246
|
+
details: { actual: count, min }
|
|
5202
5247
|
};
|
|
5203
5248
|
}
|
|
5204
5249
|
if (max !== void 0 && count > max) {
|
|
5205
5250
|
return {
|
|
5206
5251
|
pass: false,
|
|
5207
|
-
message: `Expected at most ${max} tool call(s), but got ${count}
|
|
5252
|
+
message: `Expected at most ${max} tool call(s), but got ${count}`,
|
|
5253
|
+
details: { actual: count, max }
|
|
5208
5254
|
};
|
|
5209
5255
|
}
|
|
5210
5256
|
return {
|
|
@@ -5238,7 +5284,175 @@ var JudgeResponseSchema = z.object({
|
|
|
5238
5284
|
reasoning: z.string()
|
|
5239
5285
|
});
|
|
5240
5286
|
|
|
5241
|
-
// src/judge/
|
|
5287
|
+
// src/judge/anthropicJudge.ts
|
|
5288
|
+
function createAnthropicJudge(config = {}) {
|
|
5289
|
+
const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
|
|
5290
|
+
const apiKey = process.env[apiKeyEnvVar];
|
|
5291
|
+
if (!apiKey) {
|
|
5292
|
+
throw new Error(
|
|
5293
|
+
`Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
|
|
5294
|
+
);
|
|
5295
|
+
}
|
|
5296
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5297
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
5298
|
+
const temperature = config.temperature ?? 0;
|
|
5299
|
+
return {
|
|
5300
|
+
async evaluate(candidate, reference, rubric) {
|
|
5301
|
+
let anthropicModule;
|
|
5302
|
+
try {
|
|
5303
|
+
anthropicModule = await import('@anthropic-ai/sdk');
|
|
5304
|
+
} catch (err) {
|
|
5305
|
+
throw new Error(
|
|
5306
|
+
`Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
|
|
5307
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
5308
|
+
);
|
|
5309
|
+
}
|
|
5310
|
+
const client = new anthropicModule.default({ apiKey });
|
|
5311
|
+
const prompt = buildJudgePrompt(candidate, reference, rubric);
|
|
5312
|
+
const startTime = Date.now();
|
|
5313
|
+
const response = await client.messages.create({
|
|
5314
|
+
model,
|
|
5315
|
+
max_tokens: maxTokens,
|
|
5316
|
+
temperature,
|
|
5317
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
5318
|
+
messages: [{ role: "user", content: prompt }]
|
|
5319
|
+
});
|
|
5320
|
+
const durationMs = Date.now() - startTime;
|
|
5321
|
+
const textBlock = response.content.find(
|
|
5322
|
+
(b) => b.type === "text"
|
|
5323
|
+
);
|
|
5324
|
+
const text = textBlock?.text ?? "";
|
|
5325
|
+
const parsed = parseJudgeResponse(text);
|
|
5326
|
+
return {
|
|
5327
|
+
pass: parsed.pass,
|
|
5328
|
+
score: parsed.score,
|
|
5329
|
+
reasoning: parsed.reasoning,
|
|
5330
|
+
usage: {
|
|
5331
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
5332
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
5333
|
+
totalCostUsd: 0,
|
|
5334
|
+
durationMs
|
|
5335
|
+
}
|
|
5336
|
+
};
|
|
5337
|
+
}
|
|
5338
|
+
};
|
|
5339
|
+
}
|
|
5340
|
+
function buildJudgePrompt(candidate, reference, rubric) {
|
|
5341
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5342
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5343
|
+
return `Rubric:
|
|
5344
|
+
${rubric}
|
|
5345
|
+
|
|
5346
|
+
<candidate_response>
|
|
5347
|
+
${candidateStr}
|
|
5348
|
+
</candidate_response>
|
|
5349
|
+
|
|
5350
|
+
<reference_answer>
|
|
5351
|
+
${referenceStr ?? "No reference provided."}
|
|
5352
|
+
</reference_answer>
|
|
5353
|
+
|
|
5354
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5355
|
+
}
|
|
5356
|
+
function parseJudgeResponse(text) {
|
|
5357
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5358
|
+
let parsed;
|
|
5359
|
+
try {
|
|
5360
|
+
parsed = JSON.parse(cleaned);
|
|
5361
|
+
} catch {
|
|
5362
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
5363
|
+
}
|
|
5364
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
5365
|
+
if (!result.success) {
|
|
5366
|
+
throw new Error(
|
|
5367
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
5368
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
5369
|
+
);
|
|
5370
|
+
}
|
|
5371
|
+
return result.data;
|
|
5372
|
+
}
|
|
5373
|
+
|
|
5374
|
+
// src/judge/vertexAnthropicJudge.ts
|
|
5375
|
+
function createVertexAnthropicJudge(config = {}) {
|
|
5376
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5377
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
5378
|
+
const temperature = config.temperature ?? 0;
|
|
5379
|
+
return {
|
|
5380
|
+
async evaluate(candidate, reference, rubric) {
|
|
5381
|
+
let vertexModule;
|
|
5382
|
+
try {
|
|
5383
|
+
vertexModule = await import('@anthropic-ai/vertex-sdk');
|
|
5384
|
+
} catch (err) {
|
|
5385
|
+
throw new Error(
|
|
5386
|
+
`Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
|
|
5387
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
5388
|
+
);
|
|
5389
|
+
}
|
|
5390
|
+
const client = new vertexModule.AnthropicVertex({
|
|
5391
|
+
projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
|
|
5392
|
+
region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
|
|
5393
|
+
});
|
|
5394
|
+
const prompt = buildJudgePrompt2(candidate, reference, rubric);
|
|
5395
|
+
const startTime = Date.now();
|
|
5396
|
+
const response = await client.messages.create({
|
|
5397
|
+
model,
|
|
5398
|
+
max_tokens: maxTokens,
|
|
5399
|
+
temperature,
|
|
5400
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
5401
|
+
messages: [{ role: "user", content: prompt }]
|
|
5402
|
+
});
|
|
5403
|
+
const durationMs = Date.now() - startTime;
|
|
5404
|
+
const textBlock = response.content.find(
|
|
5405
|
+
(b) => b.type === "text"
|
|
5406
|
+
);
|
|
5407
|
+
const text = textBlock?.text ?? "";
|
|
5408
|
+
const parsed = parseJudgeResponse2(text);
|
|
5409
|
+
return {
|
|
5410
|
+
pass: parsed.pass,
|
|
5411
|
+
score: parsed.score,
|
|
5412
|
+
reasoning: parsed.reasoning,
|
|
5413
|
+
usage: {
|
|
5414
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
5415
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
5416
|
+
totalCostUsd: 0,
|
|
5417
|
+
durationMs
|
|
5418
|
+
}
|
|
5419
|
+
};
|
|
5420
|
+
}
|
|
5421
|
+
};
|
|
5422
|
+
}
|
|
5423
|
+
function buildJudgePrompt2(candidate, reference, rubric) {
|
|
5424
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5425
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5426
|
+
return `Rubric:
|
|
5427
|
+
${rubric}
|
|
5428
|
+
|
|
5429
|
+
<candidate_response>
|
|
5430
|
+
${candidateStr}
|
|
5431
|
+
</candidate_response>
|
|
5432
|
+
|
|
5433
|
+
<reference_answer>
|
|
5434
|
+
${referenceStr ?? "No reference provided."}
|
|
5435
|
+
</reference_answer>
|
|
5436
|
+
|
|
5437
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5438
|
+
}
|
|
5439
|
+
function parseJudgeResponse2(text) {
|
|
5440
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5441
|
+
let parsed;
|
|
5442
|
+
try {
|
|
5443
|
+
parsed = JSON.parse(cleaned);
|
|
5444
|
+
} catch {
|
|
5445
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
5446
|
+
}
|
|
5447
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
5448
|
+
if (!result.success) {
|
|
5449
|
+
throw new Error(
|
|
5450
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
5451
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
5452
|
+
);
|
|
5453
|
+
}
|
|
5454
|
+
return result.data;
|
|
5455
|
+
}
|
|
5242
5456
|
function createClaudeAgentJudge(config) {
|
|
5243
5457
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5244
5458
|
const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
|
|
@@ -5256,7 +5470,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5256
5470
|
exceedsMaxToolOutputSize: true
|
|
5257
5471
|
};
|
|
5258
5472
|
}
|
|
5259
|
-
const prompt =
|
|
5473
|
+
const prompt = buildJudgePrompt3(candidate, reference, rubric);
|
|
5260
5474
|
try {
|
|
5261
5475
|
let resultMessage;
|
|
5262
5476
|
for await (const message of query({
|
|
@@ -5288,7 +5502,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5288
5502
|
);
|
|
5289
5503
|
}
|
|
5290
5504
|
const responseText = resultMessage.result ?? "";
|
|
5291
|
-
const parsed =
|
|
5505
|
+
const parsed = parseJudgeResponse3(responseText);
|
|
5292
5506
|
const usage = {
|
|
5293
5507
|
inputTokens: resultMessage.usage?.input_tokens ?? 0,
|
|
5294
5508
|
outputTokens: resultMessage.usage?.output_tokens ?? 0,
|
|
@@ -5317,7 +5531,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5317
5531
|
function buildSystemPrompt() {
|
|
5318
5532
|
return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
|
|
5319
5533
|
}
|
|
5320
|
-
function
|
|
5534
|
+
function buildJudgePrompt3(candidate, reference, rubric) {
|
|
5321
5535
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5322
5536
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5323
5537
|
const parts = [];
|
|
@@ -5334,7 +5548,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
|
|
|
5334
5548
|
);
|
|
5335
5549
|
return parts.join("");
|
|
5336
5550
|
}
|
|
5337
|
-
function
|
|
5551
|
+
function parseJudgeResponse3(text) {
|
|
5338
5552
|
let jsonText = text.trim();
|
|
5339
5553
|
if (jsonText.startsWith("```json")) {
|
|
5340
5554
|
jsonText = jsonText.slice(7);
|
|
@@ -5391,7 +5605,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5391
5605
|
);
|
|
5392
5606
|
}
|
|
5393
5607
|
const client = new openaiModule.default({ apiKey });
|
|
5394
|
-
const prompt =
|
|
5608
|
+
const prompt = buildJudgePrompt4(candidate, reference, rubric);
|
|
5395
5609
|
const startTime = Date.now();
|
|
5396
5610
|
const completion = await client.chat.completions.create({
|
|
5397
5611
|
model,
|
|
@@ -5407,7 +5621,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5407
5621
|
});
|
|
5408
5622
|
const durationMs = Date.now() - startTime;
|
|
5409
5623
|
const text = completion.choices[0]?.message.content ?? "";
|
|
5410
|
-
const parsed =
|
|
5624
|
+
const parsed = parseJudgeResponse4(text);
|
|
5411
5625
|
return {
|
|
5412
5626
|
pass: parsed.pass,
|
|
5413
5627
|
score: parsed.score,
|
|
@@ -5422,7 +5636,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5422
5636
|
}
|
|
5423
5637
|
};
|
|
5424
5638
|
}
|
|
5425
|
-
function
|
|
5639
|
+
function buildJudgePrompt4(candidate, reference, rubric) {
|
|
5426
5640
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5427
5641
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5428
5642
|
return `Rubric:
|
|
@@ -5438,7 +5652,7 @@ ${referenceStr ?? "No reference provided."}
|
|
|
5438
5652
|
|
|
5439
5653
|
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5440
5654
|
}
|
|
5441
|
-
function
|
|
5655
|
+
function parseJudgeResponse4(text) {
|
|
5442
5656
|
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5443
5657
|
let parsed;
|
|
5444
5658
|
try {
|
|
@@ -5540,14 +5754,48 @@ function createJudge(config = {}) {
|
|
|
5540
5754
|
const provider = config.provider ?? "anthropic";
|
|
5541
5755
|
switch (provider) {
|
|
5542
5756
|
case "anthropic":
|
|
5757
|
+
return createAnthropicJudge(config);
|
|
5758
|
+
case "vertex-anthropic":
|
|
5759
|
+
return createVertexAnthropicJudge(config);
|
|
5760
|
+
case "anthropic-agent-sdk":
|
|
5543
5761
|
return createClaudeAgentJudge(config);
|
|
5544
5762
|
case "openai":
|
|
5545
5763
|
return createOpenAIJudge(config);
|
|
5546
5764
|
case "google":
|
|
5547
5765
|
return createGoogleJudge(config);
|
|
5548
5766
|
default:
|
|
5549
|
-
throw new Error(
|
|
5767
|
+
throw new Error(
|
|
5768
|
+
`Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
|
|
5769
|
+
);
|
|
5770
|
+
}
|
|
5771
|
+
}
|
|
5772
|
+
|
|
5773
|
+
// src/judge/judgeRegistry.ts
|
|
5774
|
+
var registry = /* @__PURE__ */ new Map();
|
|
5775
|
+
function registerJudge(name15, executor) {
|
|
5776
|
+
const existing = registry.get(name15);
|
|
5777
|
+
if (existing !== void 0) {
|
|
5778
|
+
if (existing === executor) {
|
|
5779
|
+
return;
|
|
5780
|
+
}
|
|
5781
|
+
throw new Error(
|
|
5782
|
+
`Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
|
|
5783
|
+
);
|
|
5784
|
+
}
|
|
5785
|
+
registry.set(name15, executor);
|
|
5786
|
+
}
|
|
5787
|
+
function getRegisteredJudge(name15) {
|
|
5788
|
+
const executor = registry.get(name15);
|
|
5789
|
+
if (!executor) {
|
|
5790
|
+
const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
|
|
5791
|
+
throw new Error(
|
|
5792
|
+
`Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
|
|
5793
|
+
);
|
|
5550
5794
|
}
|
|
5795
|
+
return executor;
|
|
5796
|
+
}
|
|
5797
|
+
function clearJudgeRegistry() {
|
|
5798
|
+
registry.clear();
|
|
5551
5799
|
}
|
|
5552
5800
|
|
|
5553
5801
|
// src/assertions/validators/judge.ts
|
|
@@ -5558,6 +5806,7 @@ function computeStdDev(scores, mean) {
|
|
|
5558
5806
|
}
|
|
5559
5807
|
async function validateJudge(response, config) {
|
|
5560
5808
|
const {
|
|
5809
|
+
judge: judgeName,
|
|
5561
5810
|
rubric,
|
|
5562
5811
|
reference,
|
|
5563
5812
|
threshold = 0.7,
|
|
@@ -5570,6 +5819,29 @@ async function validateJudge(response, config) {
|
|
|
5570
5819
|
maxBudgetUsd,
|
|
5571
5820
|
maxToolOutputSize
|
|
5572
5821
|
} = config;
|
|
5822
|
+
if (judgeName !== void 0) {
|
|
5823
|
+
try {
|
|
5824
|
+
const executor = getRegisteredJudge(judgeName);
|
|
5825
|
+
const judgeResult = await executor(response, reference ?? void 0);
|
|
5826
|
+
const score = judgeResult.score;
|
|
5827
|
+
const passed = score >= threshold;
|
|
5828
|
+
return {
|
|
5829
|
+
pass: passed,
|
|
5830
|
+
message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
|
|
5831
|
+
};
|
|
5832
|
+
} catch (err) {
|
|
5833
|
+
return {
|
|
5834
|
+
pass: false,
|
|
5835
|
+
message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
|
|
5836
|
+
};
|
|
5837
|
+
}
|
|
5838
|
+
}
|
|
5839
|
+
if (rubric === void 0) {
|
|
5840
|
+
return {
|
|
5841
|
+
pass: false,
|
|
5842
|
+
message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
|
|
5843
|
+
};
|
|
5844
|
+
}
|
|
5573
5845
|
const resolvedRubric = resolveRubric(rubric);
|
|
5574
5846
|
const judgeConfig = {
|
|
5575
5847
|
...provider !== void 0 && { provider },
|
|
@@ -5616,11 +5888,17 @@ async function validateJudge(response, config) {
|
|
|
5616
5888
|
return {
|
|
5617
5889
|
pass: passed,
|
|
5618
5890
|
message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
|
|
5619
|
-
details:
|
|
5620
|
-
|
|
5621
|
-
|
|
5622
|
-
|
|
5623
|
-
|
|
5891
|
+
details: {
|
|
5892
|
+
score: meanScore,
|
|
5893
|
+
reasoning: lastReasoning,
|
|
5894
|
+
judgeProvider: provider ?? "anthropic",
|
|
5895
|
+
judgeModel: model,
|
|
5896
|
+
...reps > 1 && {
|
|
5897
|
+
scores,
|
|
5898
|
+
scoreStdDev: stdDev,
|
|
5899
|
+
highVariance
|
|
5900
|
+
}
|
|
5901
|
+
}
|
|
5624
5902
|
};
|
|
5625
5903
|
} catch (err) {
|
|
5626
5904
|
return {
|
|
@@ -5813,12 +6091,19 @@ function toMatchToolResponse(received, expected) {
|
|
|
5813
6091
|
// src/assertions/matchers/toMatchToolSchema.ts
|
|
5814
6092
|
function toMatchToolSchema(received, schema, options = {}) {
|
|
5815
6093
|
const result = validateSchema(received, schema, options);
|
|
6094
|
+
const preview = result.details?.textPreview;
|
|
5816
6095
|
return {
|
|
5817
6096
|
pass: result.pass,
|
|
5818
6097
|
message: () => {
|
|
5819
6098
|
if (this.isNot) {
|
|
5820
6099
|
return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
|
|
5821
6100
|
}
|
|
6101
|
+
if (!result.pass && preview) {
|
|
6102
|
+
return `${result.message}
|
|
6103
|
+
|
|
6104
|
+
Actual response (truncated):
|
|
6105
|
+
${preview}`;
|
|
6106
|
+
}
|
|
5822
6107
|
return result.message;
|
|
5823
6108
|
}
|
|
5824
6109
|
};
|
|
@@ -5827,6 +6112,7 @@ function toMatchToolSchema(received, schema, options = {}) {
|
|
|
5827
6112
|
// src/assertions/matchers/toContainToolText.ts
|
|
5828
6113
|
function toContainToolText(received, expected, options = {}) {
|
|
5829
6114
|
const result = validateText(received, expected, options);
|
|
6115
|
+
const preview = result.details?.textPreview;
|
|
5830
6116
|
return {
|
|
5831
6117
|
pass: result.pass,
|
|
5832
6118
|
message: () => {
|
|
@@ -5834,6 +6120,12 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
5834
6120
|
const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
|
|
5835
6121
|
return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
|
|
5836
6122
|
}
|
|
6123
|
+
if (!result.pass && preview) {
|
|
6124
|
+
return `${result.message}
|
|
6125
|
+
|
|
6126
|
+
Actual response (truncated):
|
|
6127
|
+
${preview}`;
|
|
6128
|
+
}
|
|
5837
6129
|
return result.message;
|
|
5838
6130
|
}
|
|
5839
6131
|
};
|
|
@@ -5842,12 +6134,19 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
5842
6134
|
// src/assertions/matchers/toMatchToolPattern.ts
|
|
5843
6135
|
function toMatchToolPattern(received, patterns, options = {}) {
|
|
5844
6136
|
const result = validatePattern(received, patterns, options);
|
|
6137
|
+
const preview = result.details?.textPreview;
|
|
5845
6138
|
return {
|
|
5846
6139
|
pass: result.pass,
|
|
5847
6140
|
message: () => {
|
|
5848
6141
|
if (this.isNot) {
|
|
5849
6142
|
return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
|
|
5850
6143
|
}
|
|
6144
|
+
if (!result.pass && preview) {
|
|
6145
|
+
return `${result.message}
|
|
6146
|
+
|
|
6147
|
+
Actual response (truncated):
|
|
6148
|
+
${preview}`;
|
|
6149
|
+
}
|
|
5851
6150
|
return result.message;
|
|
5852
6151
|
}
|
|
5853
6152
|
};
|
|
@@ -5999,31 +6298,68 @@ function toBeToolError(received, expected = true) {
|
|
|
5999
6298
|
|
|
6000
6299
|
// src/assertions/matchers/toPassToolJudge.ts
|
|
6001
6300
|
var DEFAULT_PASSING_THRESHOLD = 0.7;
|
|
6002
|
-
async function
|
|
6301
|
+
async function runSingleJudge(received, rubric, options) {
|
|
6003
6302
|
const {
|
|
6004
6303
|
reference = null,
|
|
6005
6304
|
passingThreshold = DEFAULT_PASSING_THRESHOLD,
|
|
6006
6305
|
reps,
|
|
6007
6306
|
provider,
|
|
6008
|
-
model
|
|
6307
|
+
model,
|
|
6308
|
+
judge
|
|
6009
6309
|
} = options;
|
|
6010
6310
|
const validation = await validateJudge(received, {
|
|
6011
|
-
rubric,
|
|
6311
|
+
...rubric !== void 0 && { rubric },
|
|
6012
6312
|
reference: reference ?? void 0,
|
|
6013
6313
|
threshold: passingThreshold,
|
|
6014
6314
|
...reps !== void 0 && { reps },
|
|
6015
6315
|
...provider !== void 0 && { provider },
|
|
6016
|
-
...model !== void 0 && { model }
|
|
6316
|
+
...model !== void 0 && { model },
|
|
6317
|
+
...judge !== void 0 && { judge }
|
|
6017
6318
|
});
|
|
6319
|
+
return { pass: validation.pass, message: validation.message };
|
|
6320
|
+
}
|
|
6321
|
+
/**
 * Matcher implementation: evaluate `received` with one or several judges.
 *
 * Accepted call shapes:
 *  - `(received, [judgeConfig, ...])`: every entry is run in parallel through
 *    `runSingleJudge`; the matcher passes only when all judges pass.
 *  - `(received, rubric, options?)`: `rubric` is a string or an object with a
 *    `text` property.
 *  - `(received, options)`: options only, no rubric.
 *
 * Must be invoked with the matcher context as `this`, because `this.isNot` is
 * read to produce the negated result.
 *
 * @param {unknown} received - Value under evaluation.
 * @param {string|object|Array<object>} [rubricOrOptions] - Rubric, options object,
 *   or array of per-judge configs (each may carry its own `rubric`).
 * @param {object} [maybeOptions] - Options used when the second argument is a
 *   rubric (or is missing entirely).
 * @returns {Promise<{pass: boolean, message: () => string}>} Matcher result.
 */
async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
  if (Array.isArray(rubricOrOptions)) {
    const results = await Promise.all(
      rubricOrOptions.map(async (judgeConfig) => {
        const { rubric: r, ...opts } = judgeConfig;
        return runSingleJudge(received, r, opts);
      })
    );
    const allPassed = results.every((r) => r.pass);
    const passCount = results.filter((r) => r.pass).length;
    const summary = `${passCount}/${results.length} judges passed`;
    const details = results.map((r) => r.message).join("\n");
    if (this.isNot) {
      return {
        pass: !allPassed,
        message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
      };
    }
    return {
      pass: allPassed,
      message: () => `${summary}\n${details}`
    };
  }

  let rubric;
  let options;
  const isRubric = typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions;
  if (isRubric) {
    rubric = rubricOrOptions;
    options = maybeOptions ?? {};
  } else {
    // A missing second argument must not reach runSingleJudge as `undefined`
    // (it destructures its options); fall back to the third argument, then {}.
    options = rubricOrOptions ?? maybeOptions ?? {};
  }

  const result = await runSingleJudge(received, rubric, options);
  if (this.isNot) {
    return {
      pass: !result.pass,
      message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
    };
  }
  return {
    pass: result.pass,
    message: () => result.message
  };
}
|
|
6029
6365
|
|
|
@@ -6307,6 +6643,7 @@ function getAuthConfigFromEnv() {
|
|
|
6307
6643
|
return void 0;
|
|
6308
6644
|
}
|
|
6309
6645
|
var MCPHostConfigSchema = z.object({
|
|
6646
|
+
hostType: z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
|
|
6310
6647
|
provider: z.enum([
|
|
6311
6648
|
"openai",
|
|
6312
6649
|
"anthropic",
|
|
@@ -6317,12 +6654,18 @@ var MCPHostConfigSchema = z.object({
|
|
|
6317
6654
|
"openrouter",
|
|
6318
6655
|
"xai",
|
|
6319
6656
|
"vertex-anthropic"
|
|
6320
|
-
]),
|
|
6657
|
+
]).optional(),
|
|
6321
6658
|
apiKeyEnvVar: z.string().optional(),
|
|
6322
6659
|
model: z.string().optional(),
|
|
6323
6660
|
maxTokens: z.number().optional(),
|
|
6324
6661
|
temperature: z.number().optional(),
|
|
6325
|
-
maxToolCalls: z.number().optional()
|
|
6662
|
+
maxToolCalls: z.number().optional(),
|
|
6663
|
+
cli: z.object({
|
|
6664
|
+
command: z.string(),
|
|
6665
|
+
args: z.array(z.string()),
|
|
6666
|
+
outputFormat: z.enum(["stream-json", "json"]).optional(),
|
|
6667
|
+
timeout: z.number().optional()
|
|
6668
|
+
}).optional()
|
|
6326
6669
|
});
|
|
6327
6670
|
var SnapshotSanitizerSchema = z.union([
|
|
6328
6671
|
// Built-in sanitizers
|
|
@@ -6337,6 +6680,37 @@ var SnapshotSanitizerSchema = z.union([
|
|
|
6337
6680
|
remove: z.array(z.string())
|
|
6338
6681
|
})
|
|
6339
6682
|
]);
|
|
6683
|
+
// Schema for one `passesJudge` entry. Every field is optional on its own, but
// the final refinement rejects entries that name neither a judge nor a rubric.
var JudgeExpectConfigSchema = z.object({
  // Non-empty judge name.
  judge: z.string().min(1).optional(),
  // Either one of the listed rubric names or an object with non-empty `text`.
  rubric: z.union([
    z.enum(["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]),
    z.object({ text: z.string().min(1) })
  ]).optional(),
  reference: z.unknown().optional(),
  // Number in [0, 1].
  threshold: z.number().min(0).max(1).optional(),
  // Integer >= 1.
  reps: z.number().int().min(1).optional(),
  provider: z.enum(["anthropic", "vertex-anthropic", "anthropic-agent-sdk", "openai", "google"]).optional(),
  model: z.string().optional(),
  apiKeyEnvVar: z.string().optional(),
  maxTokens: z.number().int().positive().optional(),
  temperature: z.number().min(0).max(1).optional(),
  maxBudgetUsd: z.number().positive().optional(),
  maxToolOutputSize: z.number().int().positive().optional()
}).refine((data) => !(data.judge === undefined && data.rubric === undefined), {
  message: 'Either "judge" or "rubric" must be provided in passesJudge'
});
|
|
6340
6714
|
var EvalExpectBlockSchema = z.object({
|
|
6341
6715
|
response: z.unknown().optional(),
|
|
6342
6716
|
schema: z.string().optional(),
|
|
@@ -6345,28 +6719,7 @@ var EvalExpectBlockSchema = z.object({
|
|
|
6345
6719
|
snapshot: z.string().optional(),
|
|
6346
6720
|
snapshotSanitizers: z.array(SnapshotSanitizerSchema).optional(),
|
|
6347
6721
|
isError: z.union([z.boolean(), z.string(), z.array(z.string())]).optional(),
|
|
6348
|
-
passesJudge: z.
|
|
6349
|
-
rubric: z.union([
|
|
6350
|
-
z.enum([
|
|
6351
|
-
"correctness",
|
|
6352
|
-
"completeness",
|
|
6353
|
-
"groundedness",
|
|
6354
|
-
"instruction-following",
|
|
6355
|
-
"conciseness"
|
|
6356
|
-
]),
|
|
6357
|
-
z.object({ text: z.string().min(1) })
|
|
6358
|
-
]),
|
|
6359
|
-
reference: z.unknown().optional(),
|
|
6360
|
-
threshold: z.number().min(0).max(1).optional(),
|
|
6361
|
-
reps: z.number().int().min(1).optional(),
|
|
6362
|
-
provider: z.enum(["anthropic", "openai", "google"]).optional(),
|
|
6363
|
-
model: z.string().optional(),
|
|
6364
|
-
apiKeyEnvVar: z.string().optional(),
|
|
6365
|
-
maxTokens: z.number().int().positive().optional(),
|
|
6366
|
-
temperature: z.number().min(0).max(1).optional(),
|
|
6367
|
-
maxBudgetUsd: z.number().positive().optional(),
|
|
6368
|
-
maxToolOutputSize: z.number().int().positive().optional()
|
|
6369
|
-
}).optional(),
|
|
6722
|
+
passesJudge: z.union([JudgeExpectConfigSchema, z.array(JudgeExpectConfigSchema).min(1)]).optional(),
|
|
6370
6723
|
responseSize: z.object({
|
|
6371
6724
|
maxBytes: z.number().optional(),
|
|
6372
6725
|
minBytes: z.number().optional()
|
|
@@ -6539,6 +6892,9 @@ function createVercelOrchestrator() {
|
|
|
6539
6892
|
try {
|
|
6540
6893
|
const { generateText, stepCountIs } = await import('ai');
|
|
6541
6894
|
const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
|
|
6895
|
+
if (!config.provider) {
|
|
6896
|
+
throw new Error("provider is required for SDK host type");
|
|
6897
|
+
}
|
|
6542
6898
|
const modelId = config.model ?? defaultModel(config.provider);
|
|
6543
6899
|
const model = await loadModel(config.provider, modelId);
|
|
6544
6900
|
const mcpTools = await mcp.listTools();
|
|
@@ -6592,13 +6948,233 @@ function createVercelOrchestrator() {
|
|
|
6592
6948
|
return {
|
|
6593
6949
|
success: false,
|
|
6594
6950
|
toolCalls: [],
|
|
6595
|
-
error: enrichErrorMessage(err, config.provider)
|
|
6951
|
+
error: enrichErrorMessage(err, config.provider ?? "unknown")
|
|
6596
6952
|
};
|
|
6597
6953
|
}
|
|
6598
6954
|
}
|
|
6599
6955
|
};
|
|
6600
6956
|
}
|
|
6601
6957
|
|
|
6958
|
+
// src/evals/mcpHost/adapters/cli/parsers.ts
|
|
6959
|
+
/**
 * Parse newline-delimited JSON ("stream-json") CLI output into a simulation result.
 *
 * Each non-blank line is parsed independently; lines that are not valid JSON, or
 * that parse to something other than an object, are ignored. Recognised events:
 *  - `assistant`: `tool_use` blocks become tool calls, `text` blocks are collected.
 *  - `user`: `tool_result` blocks are appended to the conversation history.
 *  - `result`: its string `result` is used as the response only when no assistant
 *    text was collected; `is_error === true` ends parsing with a failed result.
 *
 * Tool names of the form `mcp__<server>__<tool>` are reduced to `<tool>`. The
 * server part may itself contain single underscores (e.g. `mcp__my_server__search`).
 *
 * @param {string} stdout - Raw standard output of the CLI host.
 * @returns {{success: boolean, toolCalls: Array<{name: string, arguments: object, id: unknown}>,
 *   response?: string, conversationHistory?: Array<{role: string, content: string}>, error?: string}}
 */
function parseStreamJson(stdout) {
  const toolCalls = [];
  const textParts = [];
  const conversationHistory = [];

  for (const line of stdout.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Non-JSON noise (banners, log lines) is skipped on purpose.
      continue;
    }
    // `null`, numbers, strings etc. are valid JSON but carry no event.
    if (event === null || typeof event !== "object") {
      continue;
    }

    // Only array content holds blocks; anything else contributes nothing.
    const content = event.message?.content;
    const blocks = Array.isArray(content) ? content : [];

    if (event.type === "assistant") {
      for (const block of blocks) {
        if (block === null || typeof block !== "object") {
          continue;
        }
        if (block.type === "tool_use" && block.name) {
          const rawName = block.name;
          // Lazy server match: stop at the first "__" so the server name may
          // contain single underscores.
          const mcpMatch = /^mcp__.+?__(.+)$/.exec(rawName);
          toolCalls.push({
            name: mcpMatch ? mcpMatch[1] : rawName,
            arguments: block.input ?? {},
            id: block.id
          });
        }
        if (block.type === "text" && block.text) {
          textParts.push(block.text);
        }
      }
    }

    if (event.type === "user") {
      for (const block of blocks) {
        if (block === null || typeof block !== "object") {
          continue;
        }
        if (block.type === "tool_result") {
          const resultText = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
          conversationHistory.push({ role: "tool", content: resultText });
        }
      }
    }

    if (event.type === "result") {
      const hasTextResult = typeof event.result === "string";
      if (hasTextResult && textParts.length === 0) {
        textParts.push(event.result);
      }
      if (event.is_error === true) {
        return {
          success: false,
          toolCalls,
          error: hasTextResult ? event.result : "CLI host reported an error"
        };
      }
    }
  }

  const response = textParts.join("");
  if (response) {
    conversationHistory.push({ role: "assistant", content: response });
  }
  return {
    success: true,
    toolCalls,
    response: response || void 0,
    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
  };
}
|
|
7019
|
+
/**
 * Build a parser for CLI hosts that print a single JSON document.
 *
 * @param {{toolCalls: string, response: string, success?: string}} paths -
 *   Dot-separated paths locating each field inside the parsed document. When
 *   `paths.success` is not set, the result is always reported as successful.
 * @returns {(stdout: string) => {success: boolean, toolCalls: Array<{name: string, arguments: unknown}>, response: (string|undefined)}}
 *   Parser that throws whatever `JSON.parse` throws on malformed output.
 */
function createJsonParser(paths) {
  // Normalise one raw entry: non-string names become "", and arguments fall
  // back from `arguments` to `args` to an empty object.
  const toToolCall = (entry) => {
    const name = typeof entry.name === "string" ? entry.name : "";
    return { name, arguments: entry.arguments ?? entry.args ?? {} };
  };

  return (stdout) => {
    const document = JSON.parse(stdout);

    const rawCalls = getNestedValue(document, paths.toolCalls);
    let toolCalls = [];
    if (Array.isArray(rawCalls)) {
      toolCalls = rawCalls.map((entry) => toToolCall(entry));
    }

    const rawResponse = getNestedValue(document, paths.response);

    let success = true;
    if (paths.success) {
      success = Boolean(getNestedValue(document, paths.success));
    }

    return {
      success,
      toolCalls,
      response: typeof rawResponse === "string" ? rawResponse : void 0
    };
  };
}
|
|
7036
|
+
/**
 * Resolve a dot-separated path (e.g. "a.b.c") against a value.
 *
 * @param {unknown} obj - Root value to walk.
 * @param {string} path3 - Dot-separated property path.
 * @returns {unknown} The value found at the path, or `undefined` as soon as a
 *   step has to be taken on something that is `null` or not an object.
 */
function getNestedValue(obj, path3) {
  let cursor = obj;
  for (const segment of path3.split(".")) {
    const canDescend = cursor !== null && typeof cursor === "object";
    if (!canDescend) {
      return void 0;
    }
    cursor = cursor[segment];
  }
  return cursor;
}
|
|
7044
|
+
|
|
7045
|
+
// src/evals/mcpHost/adapters/cli/runner.ts
|
|
7046
|
+
// Timeout applied to a CLI host run when the config gives none: 120 000 ms.
var DEFAULT_TIMEOUT = 120000;
// Byte limit on collected child-process output: 10 MiB.
var MAX_BUFFER = 10485760;
|
|
7048
|
+
/**
 * Pick the stdout parser for a CLI output format.
 *
 * @param {("stream-json"|"json"|undefined|null)} format - Output format;
 *   a nullish value selects "stream-json".
 * @returns {Function|undefined} `parseStreamJson` for "stream-json", a JSON
 *   parser reading the top-level `toolCalls`, `response` and `success` fields
 *   for "json", and `undefined` for any other value.
 */
function getParser(format) {
  const selected = format ?? "stream-json";
  if (selected === "stream-json") {
    return parseStreamJson;
  }
  if (selected === "json") {
    const fieldPaths = {
      toolCalls: "toolCalls",
      response: "response",
      success: "success"
    };
    return createJsonParser(fieldPaths);
  }
}
|
|
7060
|
+
/**
 * Substitute every `{{scenario}}` placeholder in a list of CLI arguments.
 *
 * The scenario is inserted verbatim. A replacer function is used rather than a
 * replacement string, so sequences such as `$$`, `$&` or `$1` inside the
 * scenario are not interpreted as `String.prototype.replace` patterns.
 *
 * @param {string[]} args - Argument templates.
 * @param {string} scenario - Text to insert for each placeholder.
 * @returns {string[]} New array with placeholders replaced; `args` is not mutated.
 */
function interpolateArgs(args, scenario) {
  return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, () => scenario));
}
|
|
7063
|
+
/**
 * Run a CLI-based MCP host for one scenario and turn its output into a
 * simulation result. Never throws for process, parse or validation failures;
 * those are reported as `{ success: false, toolCalls: [], error }`.
 *
 * @param {{command: string, args: string[], outputFormat?: string, timeout?: number}} cliConfig -
 *   CLI host configuration; `timeout` defaults to `DEFAULT_TIMEOUT`.
 * @param {string} scenario - Scenario text substituted for `{{scenario}}` in `args`.
 * @returns {Promise<object>} The parsed result, or a failure result with `error` set.
 */
async function runCLIHost(cliConfig, scenario) {
  const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
  const args = interpolateArgs(cliConfig.args, scenario);
  const startTime = Date.now();

  let stdout;
  try {
    const result2 = await spawnProcess(cliConfig.command, args, { timeout });
    stdout = result2.stdout;
  } catch (err) {
    const elapsed = Date.now() - startTime;
    const message = err instanceof Error ? err.message : String(err);
    // A non-zero exit embeds the child's stderr in the message. That text may
    // mention "timed out" itself (e.g. a network timeout inside the CLI), which
    // must not be mistaken for the host exceeding its own time limit.
    const exitedWithFailure = message.startsWith("Command failed with exit code");
    const timedOut = !exitedWithFailure && (message.includes("TIMEOUT") || message.includes("timed out"));
    if (timedOut) {
      return {
        success: false,
        toolCalls: [],
        error: `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
      };
    }
    return {
      success: false,
      toolCalls: [],
      error: `CLI host process failed: ${message}`
    };
  }

  const parse = getParser(cliConfig.outputFormat);
  let result;
  try {
    result = parse(stdout);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      success: false,
      toolCalls: [],
      error: `Failed to parse CLI host output: ${reason}\nstdout (first 500 chars): ${stdout.slice(0, 500)}`
    };
  }

  const validationError = validateSimulationResult(result);
  if (validationError) {
    return {
      success: false,
      toolCalls: [],
      error: `CLI host returned invalid result: ${validationError}`
    };
  }
  return result;
}
|
|
7109
|
+
/**
 * Check that a parsed CLI host result has the minimal shape callers rely on:
 * an object with a boolean `success` and an array `toolCalls`, where every
 * tool call is an object with a string `name` and an object `arguments`.
 *
 * @param {unknown} result - Value returned by an output parser.
 * @returns {string|null} A description of the first problem found, or `null`
 *   when the value is acceptable. Never throws, including for `null` or
 *   primitive tool-call entries.
 */
function validateSimulationResult(result) {
  // `typeof null` is "object"; report it as "null" so messages are not misleading.
  const kindOf = (value) => value === null ? "null" : typeof value;

  if (result === null || typeof result !== "object") {
    return `Expected object, got ${kindOf(result)}`;
  }
  if (typeof result.success !== "boolean") {
    return `"success" must be a boolean, got ${kindOf(result.success)}`;
  }
  if (!Array.isArray(result.toolCalls)) {
    return `"toolCalls" must be an array, got ${kindOf(result.toolCalls)}`;
  }
  for (let i = 0; i < result.toolCalls.length; i++) {
    const tc = result.toolCalls[i];
    // Guard before property access: a null/undefined entry would otherwise throw.
    if (tc === null || typeof tc !== "object") {
      return `toolCalls[${i}] must be an object, got ${kindOf(tc)}`;
    }
    if (typeof tc.name !== "string") {
      return `toolCalls[${i}].name must be a string, got ${kindOf(tc.name)}`;
    }
    if (typeof tc.arguments !== "object" || tc.arguments === null) {
      return `toolCalls[${i}].arguments must be an object, got ${kindOf(tc.arguments)}`;
    }
  }
  return null;
}
|
|
7131
|
+
/**
 * Spawn a command without a shell, close its stdin immediately, and collect
 * stdout and stderr as UTF-8 strings.
 *
 * Each stream is buffered up to `MAX_BUFFER` bytes independently, so output on
 * stderr cannot cause stdout data to be dropped. Chunks arriving after a
 * stream has exceeded its limit are discarded.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{timeout: number}} options - `timeout` in milliseconds; when it
 *   elapses the child is sent SIGTERM and the promise rejects.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves when the child
 *   closes with exit code 0. Rejects with an `Error` on spawn failure, on
 *   timeout ("Process timed out after ...ms"), or on any other exit
 *   ("Command failed with exit code ...", followed by stderr when present).
 */
function spawnProcess(command, args, options) {
  return new Promise((resolve2, reject) => {
    const child = spawn(command, args, {
      stdio: ["pipe", "pipe", "pipe"]
    });
    // The host gets no interactive input.
    child.stdin.end();

    // Attach a collector to a stream and return a function that reads back
    // everything collected so far as a UTF-8 string.
    const collect = (stream) => {
      const chunks = [];
      let receivedBytes = 0;
      stream.on("data", (chunk) => {
        receivedBytes += chunk.length;
        if (receivedBytes <= MAX_BUFFER) {
          chunks.push(chunk);
        }
      });
      return () => Buffer.concat(chunks).toString("utf-8");
    };
    const readStdout = collect(child.stdout);
    const readStderr = collect(child.stderr);

    const timer = setTimeout(() => {
      child.kill("SIGTERM");
      // The later "close" event also calls reject, which is a no-op by then.
      reject(new Error(`Process timed out after ${options.timeout}ms`));
    }, options.timeout);

    child.on("error", (err) => {
      clearTimeout(timer);
      reject(err);
    });

    child.on("close", (code, signal) => {
      clearTimeout(timer);
      const stdout = readStdout();
      const stderr = readStderr();
      if (code !== 0) {
        // `code` is null when the child was terminated by a signal; name the
        // signal so the failure is diagnosable.
        const signalNote = code === null && signal ? ` (signal ${signal})` : "";
        const stderrNote = stderr ? `\nstderr: ${stderr}` : "";
        reject(new Error(`Command failed with exit code ${code ?? "null"}${signalNote}${stderrNote}`));
        return;
      }
      resolve2({ stdout, stderr });
    });
  });
}
|
|
7177
|
+
|
|
6602
7178
|
// src/evals/mcpHost/mcpHostSimulation.ts
|
|
6603
7179
|
var vercelOrchestrator = createVercelOrchestrator();
|
|
6604
7180
|
var allProviders = [
|
|
@@ -6616,6 +7192,25 @@ var simulatorRegistry = new Map(
|
|
|
6616
7192
|
allProviders.map((p) => [p, vercelOrchestrator])
|
|
6617
7193
|
);
|
|
6618
7194
|
async function simulateMCPHost(mcp, scenario, config) {
|
|
7195
|
+
const hostType = config.hostType ?? "sdk";
|
|
7196
|
+
if (hostType === "cli") {
|
|
7197
|
+
if (!config.cli) {
|
|
7198
|
+
throw new Error(
|
|
7199
|
+
`mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
|
|
7200
|
+
);
|
|
7201
|
+
}
|
|
7202
|
+
return runCLIHost(config.cli, scenario);
|
|
7203
|
+
}
|
|
7204
|
+
if (hostType === "browser" || hostType === "desktop") {
|
|
7205
|
+
throw new Error(
|
|
7206
|
+
`Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
|
|
7207
|
+
);
|
|
7208
|
+
}
|
|
7209
|
+
if (!config.provider) {
|
|
7210
|
+
throw new Error(
|
|
7211
|
+
`mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
|
|
7212
|
+
);
|
|
7213
|
+
}
|
|
6619
7214
|
const simulator = simulatorRegistry.get(config.provider);
|
|
6620
7215
|
if (!simulator) {
|
|
6621
7216
|
throw new Error(
|
|
@@ -6807,17 +7402,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6807
7402
|
};
|
|
6808
7403
|
}
|
|
6809
7404
|
if (expectBlock.passesJudge !== void 0) {
|
|
6810
|
-
const
|
|
6811
|
-
const
|
|
6812
|
-
|
|
6813
|
-
|
|
6814
|
-
|
|
6815
|
-
|
|
6816
|
-
|
|
6817
|
-
|
|
6818
|
-
|
|
6819
|
-
|
|
6820
|
-
|
|
7405
|
+
const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
|
|
7406
|
+
const judgeResultEntries = await Promise.all(
|
|
7407
|
+
judgeConfigs.map(async (judgeConfig) => {
|
|
7408
|
+
const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
|
|
7409
|
+
const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
|
|
7410
|
+
const validation = await validateJudge(response, {
|
|
7411
|
+
...judgeConfig,
|
|
7412
|
+
reference: effectiveReference,
|
|
7413
|
+
reps: effectiveReps
|
|
7414
|
+
});
|
|
7415
|
+
const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
|
|
7416
|
+
return {
|
|
7417
|
+
pass: validation.pass,
|
|
7418
|
+
details: validation.message,
|
|
7419
|
+
score: validation.details?.score,
|
|
7420
|
+
reasoning: validation.details?.reasoning,
|
|
7421
|
+
judgeName,
|
|
7422
|
+
judgeProvider: validation.details?.judgeProvider,
|
|
7423
|
+
judgeModel: validation.details?.judgeModel
|
|
7424
|
+
};
|
|
7425
|
+
})
|
|
7426
|
+
);
|
|
7427
|
+
if (judgeResultEntries.length === 1) {
|
|
7428
|
+
results.judge = judgeResultEntries[0];
|
|
7429
|
+
} else {
|
|
7430
|
+
const allPassed = judgeResultEntries.every((r) => r.pass);
|
|
7431
|
+
const passCount = judgeResultEntries.filter((r) => r.pass).length;
|
|
7432
|
+
results.judge = {
|
|
7433
|
+
pass: allPassed,
|
|
7434
|
+
details: `${passCount}/${judgeResultEntries.length} judges passed`,
|
|
7435
|
+
judgeResults: judgeResultEntries
|
|
7436
|
+
};
|
|
7437
|
+
}
|
|
6821
7438
|
}
|
|
6822
7439
|
if (expectBlock.snapshot !== void 0) {
|
|
6823
7440
|
if (!config.playwrightExpect) {
|
|
@@ -6846,6 +7463,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6846
7463
|
}
|
|
6847
7464
|
return { expectations: results, toolPrecision, toolRecall };
|
|
6848
7465
|
}
|
|
7466
|
+
/**
 * Summarise the input of an eval case for inclusion in its result record.
 *
 * @param {object} evalCase - Eval case. `description` is copied when truthy.
 *   For `mode === "mcp_host"` the truthy `scenario` and a reduced
 *   `mcpHostConfig` (`provider`, plus `model` when defined) are copied; for any
 *   other mode the truthy `args` are copied.
 * @returns {object} A new request summary object.
 */
function buildRequest(evalCase) {
  const { description, mode, scenario, mcpHostConfig: hostConfig, args } = evalCase;
  const request = {};

  if (description) {
    request.description = description;
  }

  if (mode !== "mcp_host") {
    if (args) {
      request.args = args;
    }
    return request;
  }

  if (scenario) {
    request.scenario = scenario;
  }
  if (hostConfig) {
    // `provider` is always present as a key, even when undefined.
    const hostSummary = { provider: hostConfig.provider };
    if (hostConfig.model !== undefined) {
      hostSummary.model = hostConfig.model;
    }
    request.mcpHostConfig = hostSummary;
  }
  return request;
}
|
|
6849
7484
|
/**
 * Type guard for host simulation results.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` when `value` is a non-null object that has both a
 *   `success` and a `toolCalls` property and `toolCalls` is an array.
 */
function isMCPHostSimulationResult(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  const hasRequiredKeys = "success" in value && "toolCalls" in value;
  if (!hasRequiredKeys) {
    return false;
  }
  return Array.isArray(value.toolCalls);
}
|
|
@@ -6894,6 +7529,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6894
7529
|
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6895
7530
|
source: "eval",
|
|
6896
7531
|
pass: didCasePass(error, expectationResults),
|
|
7532
|
+
request: buildRequest(evalCase),
|
|
6897
7533
|
response,
|
|
6898
7534
|
error,
|
|
6899
7535
|
expectations: expectationResults,
|
|
@@ -6919,7 +7555,7 @@ function isInfrastructureError(err) {
|
|
|
6919
7555
|
} else {
|
|
6920
7556
|
return false;
|
|
6921
7557
|
}
|
|
6922
|
-
return name15 === "
|
|
7558
|
+
return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
|
|
6923
7559
|
msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
6924
7560
|
}
|
|
6925
7561
|
async function runEvalCase(evalCase, context, options = {}) {
|
|
@@ -7036,8 +7672,13 @@ async function runEvalDataset(options, context) {
|
|
|
7036
7672
|
const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
|
|
7037
7673
|
const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
|
|
7038
7674
|
const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
|
|
7039
|
-
|
|
7040
|
-
|
|
7675
|
+
if (c.expect?.passesJudge == null) return sum;
|
|
7676
|
+
const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
|
|
7677
|
+
const totalReps = judges.reduce(
|
|
7678
|
+
(r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
|
|
7679
|
+
0
|
|
7680
|
+
);
|
|
7681
|
+
return sum + effectiveIterations * totalReps;
|
|
7041
7682
|
}, 0);
|
|
7042
7683
|
if (estimatedJudgeCalls > 50) {
|
|
7043
7684
|
debugEval(
|
|
@@ -7385,6 +8026,6 @@ function formatCapabilities(capabilities) {
|
|
|
7385
8026
|
return parts.length > 0 ? parts.join(", ") : "none declared";
|
|
7386
8027
|
}
|
|
7387
8028
|
|
|
7388
|
-
export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
8029
|
+
export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
7389
8030
|
//# sourceMappingURL=index.js.map
|
|
7390
8031
|
//# sourceMappingURL=index.js.map
|