@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +284 -24
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +649 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +504 -115
- package/dist/index.d.ts +504 -115
- package/dist/index.js +648 -64
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +12 -7
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/index.cjs
CHANGED
|
@@ -3306,7 +3306,11 @@ async function performOAuthSetup(config) {
|
|
|
3306
3306
|
const page = await context.newPage();
|
|
3307
3307
|
page.setDefaultTimeout(timeoutMs);
|
|
3308
3308
|
await page.goto(authorizationUrl.toString());
|
|
3309
|
-
|
|
3309
|
+
if ("customLoginFlow" in config && config.customLoginFlow) {
|
|
3310
|
+
await config.customLoginFlow(page);
|
|
3311
|
+
} else {
|
|
3312
|
+
await completeLoginForm(page, config);
|
|
3313
|
+
}
|
|
3310
3314
|
await page.waitForURL(
|
|
3311
3315
|
(url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
|
|
3312
3316
|
{ timeout: timeoutMs }
|
|
@@ -4407,7 +4411,7 @@ function escapeHtml(text) {
|
|
|
4407
4411
|
|
|
4408
4412
|
// package.json
|
|
4409
4413
|
var package_default = {
|
|
4410
|
-
version: "1.0.0-beta.
|
|
4414
|
+
version: "1.0.0-beta.8"};
|
|
4411
4415
|
|
|
4412
4416
|
// src/mcp/clientFactory.ts
|
|
4413
4417
|
function getRetryAfterDelayMs(err) {
|
|
@@ -5124,9 +5128,17 @@ function formatBytes(bytes) {
|
|
|
5124
5128
|
function isSimulationResult(value) {
|
|
5125
5129
|
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
5126
5130
|
}
|
|
5131
|
+
/**
 * Reports whether an expected value is a regex matcher object, i.e. a
 * non-null object exposing a string `$pattern` property.
 *
 * @param {unknown} v - Expected value taken from a partial-match spec.
 * @returns {boolean} `true` when `v` looks like `{ $pattern: string, ... }`.
 */
function isPatternMatcher(v) {
  // Guard first: the `in` operator throws on primitives and null.
  if (v === null || typeof v !== "object") {
    return false;
  }
  return "$pattern" in v && typeof v.$pattern === "string";
}
|
|
5127
5134
|
function partialMatch(actual, expected) {
|
|
5128
5135
|
return Object.entries(expected).every(([k, v]) => {
|
|
5129
5136
|
const actualVal = actual[k];
|
|
5137
|
+
if (isPatternMatcher(v)) {
|
|
5138
|
+
if (typeof actualVal !== "string") return false;
|
|
5139
|
+
const re = new RegExp(v.$pattern, v.$flags);
|
|
5140
|
+
return re.test(actualVal);
|
|
5141
|
+
}
|
|
5130
5142
|
if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
|
|
5131
5143
|
return partialMatch(
|
|
5132
5144
|
actualVal,
|
|
@@ -5265,7 +5277,175 @@ var JudgeResponseSchema = zod.z.object({
|
|
|
5265
5277
|
reasoning: zod.z.string()
|
|
5266
5278
|
});
|
|
5267
5279
|
|
|
5268
|
-
// src/judge/
|
|
5280
|
+
// src/judge/anthropicJudge.ts
|
|
5281
|
+
/**
 * Creates a judge that scores a candidate response through the Anthropic
 * Messages API.
 *
 * The API key is read eagerly from the environment, so a missing key fails
 * at creation time. The `@anthropic-ai/sdk` package is loaded lazily on each
 * `evaluate` call.
 *
 * @param {object} [config]
 * @param {string} [config.apiKeyEnvVar="ANTHROPIC_API_KEY"] - Env var holding the API key.
 * @param {string} [config.model="claude-sonnet-4-20250514"] - Model id sent to the API.
 * @param {number} [config.maxTokens=1000] - `max_tokens` for the completion.
 * @param {number} [config.temperature=0] - Sampling temperature.
 * @returns {{ evaluate: (candidate: unknown, reference: unknown, rubric: unknown) => Promise<object> }}
 *   Judge whose `evaluate` resolves to `{ pass, score, reasoning, usage }`.
 * @throws {Error} If the configured environment variable is unset or empty.
 */
function createAnthropicJudge(config = {}) {
  const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
  const apiKey = process.env[apiKeyEnvVar];
  if (!apiKey) {
    throw new Error(
      `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
    );
  }
  const model = config.model ?? "claude-sonnet-4-20250514";
  const maxTokens = config.maxTokens ?? 1e3;
  const temperature = config.temperature ?? 0;
  return {
    /**
     * Sends one judge request and parses the JSON verdict.
     * Rejects if the SDK cannot be imported, the request fails, or the
     * model output is not a valid `{pass, score, reasoning}` object.
     */
    async evaluate(candidate, reference, rubric) {
      let anthropicModule;
      try {
        anthropicModule = await import('@anthropic-ai/sdk');
      } catch (err) {
        // Keep the original error reachable via `cause` so its code and stack are not lost.
        throw new Error(
          `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
Original error: ${err instanceof Error ? err.message : String(err)}`,
          { cause: err }
        );
      }
      const client = new anthropicModule.default({ apiKey });
      const prompt = buildJudgePrompt(candidate, reference, rubric);
      const startTime = Date.now();
      const response = await client.messages.create({
        model,
        max_tokens: maxTokens,
        temperature,
        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
        messages: [{ role: "user", content: prompt }]
      });
      const durationMs = Date.now() - startTime;
      // Only the first text block is parsed; a response without one yields "".
      const textBlock = response.content.find(
        (b) => b.type === "text"
      );
      const text = textBlock?.text ?? "";
      const parsed = parseJudgeResponse(text);
      return {
        pass: parsed.pass,
        score: parsed.score,
        reasoning: parsed.reasoning,
        usage: {
          inputTokens: response.usage?.input_tokens ?? 0,
          outputTokens: response.usage?.output_tokens ?? 0,
          // Cost is not computed by this judge; it is always reported as 0.
          totalCostUsd: 0,
          durationMs
        }
      };
    }
  };
}
|
|
5333
|
+
/**
 * Assembles the user prompt for the judge: the rubric, the candidate inside
 * `<candidate_response>` tags, the reference inside `<reference_answer>`
 * tags, and a closing instruction to answer with JSON.
 *
 * Non-string candidate/reference values are pretty-printed as JSON. A
 * null/undefined reference (or one JSON cannot represent) is rendered as
 * "No reference provided.".
 *
 * @param {unknown} candidate - Response under evaluation.
 * @param {unknown} reference - Optional reference answer.
 * @param {unknown} rubric - Rubric text interpolated at the top of the prompt.
 * @returns {string} The complete prompt.
 */
function buildJudgePrompt(candidate, reference, rubric) {
  const render = (value) => {
    if (typeof value === "string") {
      return value;
    }
    return JSON.stringify(value, null, 2);
  };
  const candidateText = render(candidate);
  let referenceText = null;
  if (reference !== null && reference !== void 0) {
    referenceText = render(reference);
  }
  // Template pieces (not bare values) so undefined renders as "undefined", as in a template literal.
  const sections = [
    "Rubric:",
    `${rubric}`,
    "",
    "<candidate_response>",
    `${candidateText}`,
    "</candidate_response>",
    "",
    "<reference_answer>",
    `${referenceText ?? "No reference provided."}`,
    "</reference_answer>",
    "",
    'Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}'
  ];
  return sections.join("\n");
}
|
|
5349
|
+
/**
 * Parses the judge model's raw text into a validated verdict.
 *
 * Markdown code fences (```json / ```) are stripped before parsing, then the
 * result is validated against `JudgeResponseSchema`.
 *
 * @param {string} text - Raw text returned by the judge model.
 * @returns {object} The schema-validated data (`result.data`).
 * @throws {Error} If the text is not valid JSON (the underlying `SyntaxError`
 *   is attached as `cause`) or does not satisfy the schema.
 */
function parseJudgeResponse(text) {
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch (err) {
    // Preserve the parser's own diagnostic (position, unexpected token) for debugging.
    throw new Error(`Failed to parse judge response as JSON: ${text}`, { cause: err });
  }
  const result = JudgeResponseSchema.safeParse(parsed);
  if (!result.success) {
    // Echo at most 500 characters of the cleaned payload in the message.
    throw new Error(
      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
Validation errors: ${JSON.stringify(result.error.issues)}`
    );
  }
  return result.data;
}
|
|
5366
|
+
|
|
5367
|
+
// src/judge/vertexAnthropicJudge.ts
|
|
5368
|
+
/**
 * Creates a judge that scores a candidate response through Anthropic models
 * on Vertex AI.
 *
 * The `@anthropic-ai/vertex-sdk` package is loaded lazily on each `evaluate`
 * call. The project id comes from `GOOGLE_VERTEX_PROJECT`, falling back to
 * `CLOUD_ML_PROJECT_ID`; the region comes from `GOOGLE_VERTEX_LOCATION`,
 * falling back to "us-east5".
 *
 * @param {object} [config]
 * @param {string} [config.model="claude-sonnet-4-20250514"] - Model id sent to the API.
 * @param {number} [config.maxTokens=1000] - `max_tokens` for the completion.
 * @param {number} [config.temperature=0] - Sampling temperature.
 * @returns {{ evaluate: (candidate: unknown, reference: unknown, rubric: unknown) => Promise<object> }}
 *   Judge whose `evaluate` resolves to `{ pass, score, reasoning, usage }`.
 */
function createVertexAnthropicJudge(config = {}) {
  const model = config.model ?? "claude-sonnet-4-20250514";
  const maxTokens = config.maxTokens ?? 1e3;
  const temperature = config.temperature ?? 0;
  return {
    /**
     * Sends one judge request and parses the JSON verdict.
     * Rejects if the SDK cannot be imported, the request fails, or the
     * model output is not a valid `{pass, score, reasoning}` object.
     */
    async evaluate(candidate, reference, rubric) {
      let vertexModule;
      try {
        vertexModule = await import('@anthropic-ai/vertex-sdk');
      } catch (err) {
        // Keep the original error reachable via `cause` so its code and stack are not lost.
        throw new Error(
          `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
Original error: ${err instanceof Error ? err.message : String(err)}`,
          { cause: err }
        );
      }
      const client = new vertexModule.AnthropicVertex({
        projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
        region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
      });
      const prompt = buildJudgePrompt2(candidate, reference, rubric);
      const startTime = Date.now();
      const response = await client.messages.create({
        model,
        max_tokens: maxTokens,
        temperature,
        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
        messages: [{ role: "user", content: prompt }]
      });
      const durationMs = Date.now() - startTime;
      // Only the first text block is parsed; a response without one yields "".
      const textBlock = response.content.find(
        (b) => b.type === "text"
      );
      const text = textBlock?.text ?? "";
      const parsed = parseJudgeResponse2(text);
      return {
        pass: parsed.pass,
        score: parsed.score,
        reasoning: parsed.reasoning,
        usage: {
          inputTokens: response.usage?.input_tokens ?? 0,
          outputTokens: response.usage?.output_tokens ?? 0,
          // Cost is not computed by this judge; it is always reported as 0.
          totalCostUsd: 0,
          durationMs
        }
      };
    }
  };
}
|
|
5416
|
+
/**
 * Assembles the user prompt for the Vertex judge: the rubric, the candidate
 * inside `<candidate_response>` tags, the reference inside
 * `<reference_answer>` tags, and a closing instruction to answer with JSON.
 *
 * Non-string candidate/reference values are pretty-printed as JSON. A
 * null/undefined reference (or one JSON cannot represent) is rendered as
 * "No reference provided.".
 *
 * @param {unknown} candidate - Response under evaluation.
 * @param {unknown} reference - Optional reference answer.
 * @param {unknown} rubric - Rubric text interpolated at the top of the prompt.
 * @returns {string} The complete prompt.
 */
function buildJudgePrompt2(candidate, reference, rubric) {
  const render = (value) => {
    if (typeof value === "string") {
      return value;
    }
    return JSON.stringify(value, null, 2);
  };
  const candidateText = render(candidate);
  let referenceText = null;
  if (reference !== null && reference !== void 0) {
    referenceText = render(reference);
  }
  // Template pieces (not bare values) so undefined renders as "undefined", as in a template literal.
  const sections = [
    "Rubric:",
    `${rubric}`,
    "",
    "<candidate_response>",
    `${candidateText}`,
    "</candidate_response>",
    "",
    "<reference_answer>",
    `${referenceText ?? "No reference provided."}`,
    "</reference_answer>",
    "",
    'Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}'
  ];
  return sections.join("\n");
}
|
|
5432
|
+
/**
 * Parses the Vertex judge model's raw text into a validated verdict.
 *
 * Markdown code fences (```json / ```) are stripped before parsing, then the
 * result is validated against `JudgeResponseSchema`.
 *
 * @param {string} text - Raw text returned by the judge model.
 * @returns {object} The schema-validated data (`result.data`).
 * @throws {Error} If the text is not valid JSON (the underlying `SyntaxError`
 *   is attached as `cause`) or does not satisfy the schema.
 */
function parseJudgeResponse2(text) {
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch (err) {
    // Preserve the parser's own diagnostic (position, unexpected token) for debugging.
    throw new Error(`Failed to parse judge response as JSON: ${text}`, { cause: err });
  }
  const result = JudgeResponseSchema.safeParse(parsed);
  if (!result.success) {
    // Echo at most 500 characters of the cleaned payload in the message.
    throw new Error(
      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
Validation errors: ${JSON.stringify(result.error.issues)}`
    );
  }
  return result.data;
}
|
|
5269
5449
|
function createClaudeAgentJudge(config) {
|
|
5270
5450
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5271
5451
|
const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
|
|
@@ -5283,7 +5463,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5283
5463
|
exceedsMaxToolOutputSize: true
|
|
5284
5464
|
};
|
|
5285
5465
|
}
|
|
5286
|
-
const prompt =
|
|
5466
|
+
const prompt = buildJudgePrompt3(candidate, reference, rubric);
|
|
5287
5467
|
try {
|
|
5288
5468
|
let resultMessage;
|
|
5289
5469
|
for await (const message of claudeAgentSdk.query({
|
|
@@ -5315,7 +5495,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5315
5495
|
);
|
|
5316
5496
|
}
|
|
5317
5497
|
const responseText = resultMessage.result ?? "";
|
|
5318
|
-
const parsed =
|
|
5498
|
+
const parsed = parseJudgeResponse3(responseText);
|
|
5319
5499
|
const usage = {
|
|
5320
5500
|
inputTokens: resultMessage.usage?.input_tokens ?? 0,
|
|
5321
5501
|
outputTokens: resultMessage.usage?.output_tokens ?? 0,
|
|
@@ -5344,7 +5524,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5344
5524
|
function buildSystemPrompt() {
|
|
5345
5525
|
return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
|
|
5346
5526
|
}
|
|
5347
|
-
function
|
|
5527
|
+
function buildJudgePrompt3(candidate, reference, rubric) {
|
|
5348
5528
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5349
5529
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5350
5530
|
const parts = [];
|
|
@@ -5361,7 +5541,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
|
|
|
5361
5541
|
);
|
|
5362
5542
|
return parts.join("");
|
|
5363
5543
|
}
|
|
5364
|
-
function
|
|
5544
|
+
function parseJudgeResponse3(text) {
|
|
5365
5545
|
let jsonText = text.trim();
|
|
5366
5546
|
if (jsonText.startsWith("```json")) {
|
|
5367
5547
|
jsonText = jsonText.slice(7);
|
|
@@ -5418,7 +5598,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5418
5598
|
);
|
|
5419
5599
|
}
|
|
5420
5600
|
const client = new openaiModule.default({ apiKey });
|
|
5421
|
-
const prompt =
|
|
5601
|
+
const prompt = buildJudgePrompt4(candidate, reference, rubric);
|
|
5422
5602
|
const startTime = Date.now();
|
|
5423
5603
|
const completion = await client.chat.completions.create({
|
|
5424
5604
|
model,
|
|
@@ -5434,7 +5614,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5434
5614
|
});
|
|
5435
5615
|
const durationMs = Date.now() - startTime;
|
|
5436
5616
|
const text = completion.choices[0]?.message.content ?? "";
|
|
5437
|
-
const parsed =
|
|
5617
|
+
const parsed = parseJudgeResponse4(text);
|
|
5438
5618
|
return {
|
|
5439
5619
|
pass: parsed.pass,
|
|
5440
5620
|
score: parsed.score,
|
|
@@ -5449,7 +5629,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5449
5629
|
}
|
|
5450
5630
|
};
|
|
5451
5631
|
}
|
|
5452
|
-
function
|
|
5632
|
+
function buildJudgePrompt4(candidate, reference, rubric) {
|
|
5453
5633
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5454
5634
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5455
5635
|
return `Rubric:
|
|
@@ -5465,7 +5645,7 @@ ${referenceStr ?? "No reference provided."}
|
|
|
5465
5645
|
|
|
5466
5646
|
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5467
5647
|
}
|
|
5468
|
-
function
|
|
5648
|
+
function parseJudgeResponse4(text) {
|
|
5469
5649
|
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5470
5650
|
let parsed;
|
|
5471
5651
|
try {
|
|
@@ -5567,6 +5747,10 @@ function createJudge(config = {}) {
|
|
|
5567
5747
|
const provider = config.provider ?? "anthropic";
|
|
5568
5748
|
switch (provider) {
|
|
5569
5749
|
case "anthropic":
|
|
5750
|
+
return createAnthropicJudge(config);
|
|
5751
|
+
case "vertex-anthropic":
|
|
5752
|
+
return createVertexAnthropicJudge(config);
|
|
5753
|
+
case "anthropic-agent-sdk":
|
|
5570
5754
|
return createClaudeAgentJudge(config);
|
|
5571
5755
|
case "openai":
|
|
5572
5756
|
return createOpenAIJudge(config);
|
|
@@ -5577,6 +5761,34 @@ function createJudge(config = {}) {
|
|
|
5577
5761
|
}
|
|
5578
5762
|
}
|
|
5579
5763
|
|
|
5764
|
+
// src/judge/judgeRegistry.ts
|
|
5765
|
+
// Module-level registry mapping a judge name to its executor function.
var registry = /* @__PURE__ */ new Map();

/**
 * Registers a custom judge executor under a name.
 *
 * Re-registering the same executor under the same name is a no-op.
 * Registering a different executor under an existing name is rejected.
 *
 * @param {string} name15 - Name the judge is looked up by.
 * @param {Function} executor - Judge implementation to store.
 * @throws {TypeError} If `executor` is not a function. Storing `undefined`
 *   would leave a name that is listed as available yet reported as missing.
 * @throws {Error} If the name is already bound to a different executor.
 */
function registerJudge(name15, executor) {
  if (typeof executor !== "function") {
    const received = executor === null ? "null" : typeof executor;
    throw new TypeError(
      `Judge "${name15}" executor must be a function, received ${received}.`
    );
  }
  const existing = registry.get(name15);
  if (existing === executor) {
    return;
  }
  if (existing !== void 0) {
    throw new Error(
      `Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
    );
  }
  registry.set(name15, executor);
}

/**
 * Looks up a previously registered judge executor.
 *
 * @param {string} name15 - Name passed to `registerJudge`.
 * @returns {Function} The registered executor.
 * @throws {Error} If no judge is registered under that name; the message
 *   lists the names that are registered.
 */
function getRegisteredJudge(name15) {
  const executor = registry.get(name15);
  if (!executor) {
    const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
    throw new Error(
      `Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
    );
  }
  return executor;
}

/** Removes every registered judge. */
function clearJudgeRegistry() {
  registry.clear();
}
|
|
5791
|
+
|
|
5580
5792
|
// src/assertions/validators/judge.ts
|
|
5581
5793
|
function computeStdDev(scores, mean) {
|
|
5582
5794
|
if (scores.length <= 1) return 0;
|
|
@@ -5585,6 +5797,7 @@ function computeStdDev(scores, mean) {
|
|
|
5585
5797
|
}
|
|
5586
5798
|
async function validateJudge(response, config) {
|
|
5587
5799
|
const {
|
|
5800
|
+
judge: judgeName,
|
|
5588
5801
|
rubric,
|
|
5589
5802
|
reference,
|
|
5590
5803
|
threshold = 0.7,
|
|
@@ -5597,6 +5810,29 @@ async function validateJudge(response, config) {
|
|
|
5597
5810
|
maxBudgetUsd,
|
|
5598
5811
|
maxToolOutputSize
|
|
5599
5812
|
} = config;
|
|
5813
|
+
if (judgeName !== void 0) {
|
|
5814
|
+
try {
|
|
5815
|
+
const executor = getRegisteredJudge(judgeName);
|
|
5816
|
+
const judgeResult = await executor(response, reference ?? void 0);
|
|
5817
|
+
const score = judgeResult.score;
|
|
5818
|
+
const passed = score >= threshold;
|
|
5819
|
+
return {
|
|
5820
|
+
pass: passed,
|
|
5821
|
+
message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
|
|
5822
|
+
};
|
|
5823
|
+
} catch (err) {
|
|
5824
|
+
return {
|
|
5825
|
+
pass: false,
|
|
5826
|
+
message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
|
|
5827
|
+
};
|
|
5828
|
+
}
|
|
5829
|
+
}
|
|
5830
|
+
if (rubric === void 0) {
|
|
5831
|
+
return {
|
|
5832
|
+
pass: false,
|
|
5833
|
+
message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
|
|
5834
|
+
};
|
|
5835
|
+
}
|
|
5600
5836
|
const resolvedRubric = resolveRubric(rubric);
|
|
5601
5837
|
const judgeConfig = {
|
|
5602
5838
|
...provider !== void 0 && { provider },
|
|
@@ -5643,11 +5879,17 @@ async function validateJudge(response, config) {
|
|
|
5643
5879
|
return {
|
|
5644
5880
|
pass: passed,
|
|
5645
5881
|
message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
|
|
5646
|
-
details:
|
|
5647
|
-
|
|
5648
|
-
|
|
5649
|
-
|
|
5650
|
-
|
|
5882
|
+
details: {
|
|
5883
|
+
score: meanScore,
|
|
5884
|
+
reasoning: lastReasoning,
|
|
5885
|
+
judgeProvider: provider ?? "anthropic",
|
|
5886
|
+
judgeModel: model,
|
|
5887
|
+
...reps > 1 && {
|
|
5888
|
+
scores,
|
|
5889
|
+
scoreStdDev: stdDev,
|
|
5890
|
+
highVariance
|
|
5891
|
+
}
|
|
5892
|
+
}
|
|
5651
5893
|
};
|
|
5652
5894
|
} catch (err) {
|
|
5653
5895
|
return {
|
|
@@ -6026,31 +6268,68 @@ function toBeToolError(received, expected = true) {
|
|
|
6026
6268
|
|
|
6027
6269
|
// src/assertions/matchers/toPassToolJudge.ts
|
|
6028
6270
|
var DEFAULT_PASSING_THRESHOLD = 0.7;
|
|
6029
|
-
async function
|
|
6271
|
+
/**
 * Runs one judge evaluation through `validateJudge` and reduces the outcome
 * to `{ pass, message }`.
 *
 * Optional settings (`rubric`, `reps`, `provider`, `model`, `judge`) are only
 * forwarded when defined. `reference` is always forwarded, with null
 * normalised to undefined, and `passingThreshold` is forwarded as
 * `threshold`, defaulting to `DEFAULT_PASSING_THRESHOLD`.
 *
 * @param {unknown} received - Value under evaluation.
 * @param {unknown} rubric - Rubric, or undefined when a named judge is used.
 * @param {object} options - Matcher options.
 * @returns {Promise<{ pass: boolean, message: string }>}
 */
async function runSingleJudge(received, rubric, options) {
  const judgeConfig = {};
  if (rubric !== void 0) {
    judgeConfig.rubric = rubric;
  }
  judgeConfig.reference = options.reference ?? void 0;
  judgeConfig.threshold = options.passingThreshold === void 0 ? DEFAULT_PASSING_THRESHOLD : options.passingThreshold;
  // Forward the remaining optional settings in a fixed order, skipping undefined ones.
  for (const key of ["reps", "provider", "model", "judge"]) {
    const value = options[key];
    if (value !== void 0) {
      judgeConfig[key] = value;
    }
  }
  const outcome = await validateJudge(received, judgeConfig);
  return { pass: outcome.pass, message: outcome.message };
}
|
|
6291
|
+
/**
 * Custom matcher that passes when the received value satisfies one or more
 * LLM judges.
 *
 * Call shapes handled:
 *  - `(received, judgeConfigs[])` runs every config in parallel and requires
 *    all of them to pass;
 *  - `(received, rubric, options?)` where `rubric` is a string or an object
 *    with a `text` property;
 *  - `(received, options)` where the options carry the rubric or judge.
 *
 * Reads `this.isNot` to produce the negated result and message.
 *
 * @param {unknown} received - Value under evaluation.
 * @param {unknown} rubricOrOptions - Rubric, options object, or array of judge configs.
 * @param {object} [maybeOptions] - Options when the second argument is a rubric.
 * @returns {Promise<{ pass: boolean, message: () => string }>}
 * @throws {Error} If an empty array of judge configs is supplied.
 */
async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
  if (Array.isArray(rubricOrOptions)) {
    // `[].every(...)` is vacuously true, so an empty list used to "pass" with
    // "0/0 judges passed"; the config schema already requires at least one.
    if (rubricOrOptions.length === 0) {
      throw new Error("toPassToolJudge requires at least one judge config when given an array");
    }
    const results = await Promise.all(
      rubricOrOptions.map(async (judgeConfig) => {
        const { rubric: r, ...opts } = judgeConfig;
        return runSingleJudge(received, r, opts);
      })
    );
    const allPassed = results.every((r) => r.pass);
    const passCount = results.filter((r) => r.pass).length;
    const summary = `${passCount}/${results.length} judges passed`;
    const details = results.map((r) => r.message).join("\n");
    if (this.isNot) {
      return {
        pass: !allPassed,
        message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
      };
    }
    return {
      pass: allPassed,
      message: () => `${summary}
${details}`
    };
  }
  let rubric;
  let options;
  if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
    rubric = rubricOrOptions;
    options = maybeOptions ?? {};
  } else {
    // A nullish first argument previously crashed on destructuring inside
    // runSingleJudge; fall back to the trailing options or an empty object.
    options = rubricOrOptions ?? maybeOptions ?? {};
  }
  const result = await runSingleJudge(received, rubric, options);
  if (this.isNot) {
    return {
      pass: !result.pass,
      message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
    };
  }
  return {
    pass: result.pass,
    message: () => result.message
  };
}
|
|
6056
6335
|
|
|
@@ -6334,6 +6613,7 @@ function getAuthConfigFromEnv() {
|
|
|
6334
6613
|
return void 0;
|
|
6335
6614
|
}
|
|
6336
6615
|
var MCPHostConfigSchema = zod.z.object({
|
|
6616
|
+
hostType: zod.z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
|
|
6337
6617
|
provider: zod.z.enum([
|
|
6338
6618
|
"openai",
|
|
6339
6619
|
"anthropic",
|
|
@@ -6344,12 +6624,18 @@ var MCPHostConfigSchema = zod.z.object({
|
|
|
6344
6624
|
"openrouter",
|
|
6345
6625
|
"xai",
|
|
6346
6626
|
"vertex-anthropic"
|
|
6347
|
-
]),
|
|
6627
|
+
]).optional(),
|
|
6348
6628
|
apiKeyEnvVar: zod.z.string().optional(),
|
|
6349
6629
|
model: zod.z.string().optional(),
|
|
6350
6630
|
maxTokens: zod.z.number().optional(),
|
|
6351
6631
|
temperature: zod.z.number().optional(),
|
|
6352
|
-
maxToolCalls: zod.z.number().optional()
|
|
6632
|
+
maxToolCalls: zod.z.number().optional(),
|
|
6633
|
+
cli: zod.z.object({
|
|
6634
|
+
command: zod.z.string(),
|
|
6635
|
+
args: zod.z.array(zod.z.string()),
|
|
6636
|
+
outputFormat: zod.z.enum(["stream-json", "json"]).optional(),
|
|
6637
|
+
timeout: zod.z.number().optional()
|
|
6638
|
+
}).optional()
|
|
6353
6639
|
});
|
|
6354
6640
|
var SnapshotSanitizerSchema = zod.z.union([
|
|
6355
6641
|
// Built-in sanitizers
|
|
@@ -6364,6 +6650,37 @@ var SnapshotSanitizerSchema = zod.z.union([
|
|
|
6364
6650
|
remove: zod.z.array(zod.z.string())
|
|
6365
6651
|
})
|
|
6366
6652
|
]);
|
|
6653
|
+
// Schema for one `passesJudge` entry. Every field is optional individually,
// but the refinement below requires `judge` or `rubric` to be present.
var JudgeExpectConfigSchema = zod.z.object({
  // Judge name; must be non-empty when given.
  judge: zod.z.string().min(1).optional(),
  // Either one of the named rubrics or a free-text rubric object.
  rubric: zod.z.union([
    zod.z.enum([
      "correctness",
      "completeness",
      "groundedness",
      "instruction-following",
      "conciseness"
    ]),
    zod.z.object({ text: zod.z.string().min(1) })
  ]).optional(),
  reference: zod.z.unknown().optional(),
  // Threshold is constrained to [0, 1].
  threshold: zod.z.number().min(0).max(1).optional(),
  reps: zod.z.number().int().min(1).optional(),
  provider: zod.z.enum([
    "anthropic",
    "vertex-anthropic",
    "anthropic-agent-sdk",
    "openai",
    "google"
  ]).optional(),
  model: zod.z.string().optional(),
  apiKeyEnvVar: zod.z.string().optional(),
  maxTokens: zod.z.number().int().positive().optional(),
  temperature: zod.z.number().min(0).max(1).optional(),
  maxBudgetUsd: zod.z.number().positive().optional(),
  maxToolOutputSize: zod.z.number().int().positive().optional()
}).refine(
  ({ judge, rubric }) => !(judge === void 0 && rubric === void 0),
  { message: 'Either "judge" or "rubric" must be provided in passesJudge' }
);
|
|
6367
6684
|
var EvalExpectBlockSchema = zod.z.object({
|
|
6368
6685
|
response: zod.z.unknown().optional(),
|
|
6369
6686
|
schema: zod.z.string().optional(),
|
|
@@ -6372,28 +6689,7 @@ var EvalExpectBlockSchema = zod.z.object({
|
|
|
6372
6689
|
snapshot: zod.z.string().optional(),
|
|
6373
6690
|
snapshotSanitizers: zod.z.array(SnapshotSanitizerSchema).optional(),
|
|
6374
6691
|
isError: zod.z.union([zod.z.boolean(), zod.z.string(), zod.z.array(zod.z.string())]).optional(),
|
|
6375
|
-
passesJudge: zod.z.
|
|
6376
|
-
rubric: zod.z.union([
|
|
6377
|
-
zod.z.enum([
|
|
6378
|
-
"correctness",
|
|
6379
|
-
"completeness",
|
|
6380
|
-
"groundedness",
|
|
6381
|
-
"instruction-following",
|
|
6382
|
-
"conciseness"
|
|
6383
|
-
]),
|
|
6384
|
-
zod.z.object({ text: zod.z.string().min(1) })
|
|
6385
|
-
]),
|
|
6386
|
-
reference: zod.z.unknown().optional(),
|
|
6387
|
-
threshold: zod.z.number().min(0).max(1).optional(),
|
|
6388
|
-
reps: zod.z.number().int().min(1).optional(),
|
|
6389
|
-
provider: zod.z.enum(["anthropic", "openai", "google"]).optional(),
|
|
6390
|
-
model: zod.z.string().optional(),
|
|
6391
|
-
apiKeyEnvVar: zod.z.string().optional(),
|
|
6392
|
-
maxTokens: zod.z.number().int().positive().optional(),
|
|
6393
|
-
temperature: zod.z.number().min(0).max(1).optional(),
|
|
6394
|
-
maxBudgetUsd: zod.z.number().positive().optional(),
|
|
6395
|
-
maxToolOutputSize: zod.z.number().int().positive().optional()
|
|
6396
|
-
}).optional(),
|
|
6692
|
+
passesJudge: zod.z.union([JudgeExpectConfigSchema, zod.z.array(JudgeExpectConfigSchema).min(1)]).optional(),
|
|
6397
6693
|
responseSize: zod.z.object({
|
|
6398
6694
|
maxBytes: zod.z.number().optional(),
|
|
6399
6695
|
minBytes: zod.z.number().optional()
|
|
@@ -6566,6 +6862,9 @@ function createVercelOrchestrator() {
|
|
|
6566
6862
|
try {
|
|
6567
6863
|
const { generateText, stepCountIs } = await import('ai');
|
|
6568
6864
|
const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
|
|
6865
|
+
if (!config.provider) {
|
|
6866
|
+
throw new Error("provider is required for SDK host type");
|
|
6867
|
+
}
|
|
6569
6868
|
const modelId = config.model ?? defaultModel(config.provider);
|
|
6570
6869
|
const model = await loadModel(config.provider, modelId);
|
|
6571
6870
|
const mcpTools = await mcp.listTools();
|
|
@@ -6619,13 +6918,233 @@ function createVercelOrchestrator() {
|
|
|
6619
6918
|
return {
|
|
6620
6919
|
success: false,
|
|
6621
6920
|
toolCalls: [],
|
|
6622
|
-
error: enrichErrorMessage(err, config.provider)
|
|
6921
|
+
error: enrichErrorMessage(err, config.provider ?? "unknown")
|
|
6623
6922
|
};
|
|
6624
6923
|
}
|
|
6625
6924
|
}
|
|
6626
6925
|
};
|
|
6627
6926
|
}
|
|
6628
6927
|
|
|
6928
|
+
// src/evals/mcpHost/adapters/cli/parsers.ts
|
|
6929
|
+
/**
 * Parses newline-delimited JSON events emitted by a CLI host into a
 * simulation result.
 *
 * Handling per event `type`:
 *  - "assistant": `tool_use` blocks become tool calls (an `mcp__<server>__`
 *    prefix is stripped from the name); `text` blocks are collected.
 *  - "user": `tool_result` blocks are appended to the conversation history
 *    with role "tool".
 *  - "result": a string `result` is used as the response only when no
 *    assistant text was collected; `is_error === true` ends parsing with a
 *    failure result.
 *
 * Blank lines, lines that are not valid JSON, and lines that parse to a
 * non-object are skipped.
 *
 * @param {string} stdout - Raw stdout of the CLI host.
 * @returns {{ success: boolean, toolCalls: object[], response?: string,
 *   conversationHistory?: object[], error?: string }}
 */
function parseStreamJson(stdout) {
  // Lazy server segment: server names may themselves contain single
  // underscores (e.g. "mcp__my_server__search"), which `[^_]+` rejected.
  const mcpToolName = /^mcp__.+?__(.+)$/;
  const isRecord = (value) => value !== null && typeof value === "object";
  // Content may be absent or a plain string; only arrays carry blocks, and
  // null entries are dropped so property reads below cannot throw.
  const blocksOf = (event) => {
    const content = event.message?.content;
    return Array.isArray(content) ? content.filter(isRecord) : [];
  };

  const toolCalls = [];
  const textParts = [];
  const conversationHistory = [];

  for (const line of stdout.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Deliberate best-effort: non-JSON lines in the stream are skipped.
      continue;
    }
    if (!isRecord(event)) {
      // e.g. a line containing `null` or a bare number.
      continue;
    }

    if (event.type === "assistant") {
      for (const block of blocksOf(event)) {
        if (block.type === "tool_use" && block.name) {
          const rawName = block.name;
          const mcpMatch = mcpToolName.exec(rawName);
          toolCalls.push({
            name: mcpMatch ? mcpMatch[1] : rawName,
            arguments: block.input ?? {},
            id: block.id
          });
        }
        if (block.type === "text" && block.text) {
          textParts.push(block.text);
        }
      }
    }

    if (event.type === "user") {
      for (const block of blocksOf(event)) {
        if (block.type === "tool_result") {
          const content = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
          conversationHistory.push({ role: "tool", content });
        }
      }
    }

    if (event.type === "result") {
      const hasTextResult = typeof event.result === "string";
      // The final result text is only a fallback for missing assistant text.
      if (hasTextResult && textParts.length === 0) {
        textParts.push(event.result);
      }
      if (event.is_error === true) {
        return {
          success: false,
          toolCalls,
          error: hasTextResult ? event.result : "CLI host reported an error"
        };
      }
    }
  }

  const response = textParts.join("");
  if (response) {
    conversationHistory.push({ role: "assistant", content: response });
  }
  return {
    success: true,
    toolCalls,
    response: response || void 0,
    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
  };
}
|
|
6989
|
+
/**
 * Builds a parser for CLI hosts that print a single JSON document.
 *
 * @param {object} paths - Dot-separated paths into the parsed document.
 * @param {string} paths.toolCalls - Path to the array of tool calls.
 * @param {string} paths.response - Path to the response text.
 * @param {string} [paths.success] - Path to a success flag; when omitted the
 *   result is always reported as successful.
 * @returns {(stdout: string) => { success: boolean, toolCalls: object[], response?: string }}
 *   Parser that throws if `stdout` is not valid JSON.
 */
function createJsonParser(paths) {
  return (stdout) => {
    const data = JSON.parse(stdout);
    const rawToolCalls = getNestedValue(data, paths.toolCalls);
    const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls.map((tc) => {
      // A null/undefined entry used to throw a TypeError here; treat it like
      // any other malformed entry (empty name, empty arguments).
      const entry = tc ?? {};
      return {
        name: typeof entry.name === "string" ? entry.name : "",
        arguments: entry.arguments ?? entry.args ?? {}
      };
    }) : [];
    const response = getNestedValue(data, paths.response);
    const success = paths.success ? Boolean(getNestedValue(data, paths.success)) : true;
    return {
      success,
      toolCalls,
      // Non-string responses are dropped rather than coerced.
      response: typeof response === "string" ? response : void 0
    };
  };
}

/**
 * Resolves a dot-separated path (e.g. "result.items.0") against a value.
 *
 * @param {unknown} obj - Root value to traverse.
 * @param {string} path3 - Dot-separated property path.
 * @returns {unknown} The value at the path, or undefined as soon as a
 *   segment is reached through a non-object (including null).
 */
function getNestedValue(obj, path3) {
  let current = obj;
  for (const key of path3.split(".")) {
    if (current === null || typeof current !== "object") {
      return void 0;
    }
    current = current[key];
  }
  return current;
}
|
|
7014
|
+
|
|
7015
|
+
// src/evals/mcpHost/adapters/cli/runner.ts
|
|
7016
|
+
// Fallback limit (120000, in milliseconds) applied by runCLIHost when
// cliConfig.timeout is not set.
var DEFAULT_TIMEOUT = 12e4;
// 10 * 1024 * 1024 = 10485760.
// NOTE(review): not referenced in the visible code; presumably the stdout
// buffer cap used when spawning the CLI process. Confirm at spawnProcess.
var MAX_BUFFER = 10 * 1024 * 1024;
|
|
7018
|
+
/**
 * Selects the stdout parser for a CLI host output format.
 *
 * @param {string} [format] - "stream-json" (the default when nullish) or "json".
 * @returns {(stdout: string) => object} Parser for the format. For an
 *   unrecognised format the returned parser throws a descriptive error when
 *   invoked, rather than this function returning `undefined`. That keeps the
 *   failure inside the caller's parse try/catch.
 */
function getParser(format) {
  const resolved = format ?? "stream-json";
  switch (resolved) {
    case "stream-json":
      return parseStreamJson;
    case "json":
      // The plain-JSON format reads top-level `toolCalls`, `response`, `success`.
      return createJsonParser({
        toolCalls: "toolCalls",
        response: "response",
        success: "success"
      });
    default:
      return () => {
        throw new Error(
          `Unsupported CLI output format "${String(resolved)}". Expected "stream-json" or "json".`
        );
      };
  }
}
|
|
7030
|
+
/**
 * Substitutes every `{{scenario}}` placeholder in the CLI arguments.
 *
 * @param {string[]} args - Argument templates.
 * @param {string} scenario - Text inserted verbatim for each placeholder.
 * @returns {string[]} New array with placeholders replaced; `args` is not mutated.
 */
function interpolateArgs(args, scenario) {
  // A replacer function is used instead of a replacement string so that
  // sequences such as "$$", "$&" or "$1" inside the scenario are inserted
  // literally rather than being interpreted as replacement patterns.
  return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, () => scenario));
}
|
|
7033
|
+
/**
 * Runs an external CLI host for one scenario and converts its stdout into a
 * simulation result.
 *
 * Process, parse and validation failures are reported as
 * `{ success: false, toolCalls: [], error }` rather than thrown.
 *
 * @param {{command: string, args?: string[], timeout?: number, outputFormat?: string}} cliConfig
 *   CLI settings; `args` may contain `{{scenario}}` placeholders and defaults
 *   to no arguments, `timeout` (ms) defaults to DEFAULT_TIMEOUT.
 * @param {string} scenario - Scenario text substituted into the arguments.
 * @returns {Promise<object>} The parsed result, or a failure result with `error` set.
 */
async function runCLIHost(cliConfig, scenario) {
  const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
  // Tolerate a config that only supplies `command`.
  const args = interpolateArgs(cliConfig.args ?? [], scenario);
  const startTime = Date.now();
  let stdout;
  try {
    const result2 = await spawnProcess(cliConfig.command, args, { timeout });
    stdout = result2.stdout;
  } catch (err) {
    const elapsed = Date.now() - startTime;
    const message = err instanceof Error ? err.message : String(err);
    // Only the first line is the error headline; later lines carry the child's
    // stderr, which may itself mention "timed out" without this being a host
    // timeout.
    const headline = message.split("\n", 1)[0];
    if (headline.includes("TIMEOUT") || headline.includes("timed out")) {
      return {
        success: false,
        toolCalls: [],
        error: `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
      };
    }
    return {
      success: false,
      toolCalls: [],
      error: `CLI host process failed: ${message}`
    };
  }
  const parse = getParser(cliConfig.outputFormat);
  let result;
  try {
    result = parse(stdout);
  } catch (err) {
    return {
      success: false,
      toolCalls: [],
      // Include the start of stdout so the malformed output can be diagnosed.
      error: `Failed to parse CLI host output: ${err instanceof Error ? err.message : String(err)}
stdout (first 500 chars): ${stdout.slice(0, 500)}`
    };
  }
  const validationError = validateSimulationResult(result);
  if (validationError) {
    return {
      success: false,
      toolCalls: [],
      error: `CLI host returned invalid result: ${validationError}`
    };
  }
  return result;
}
|
|
7079
|
+
/**
 * Checks that a parsed CLI host result has the expected shape.
 *
 * @param {*} result - Value produced by an output parser.
 * @returns {(string|null)} A description of the first problem found, or null
 *   when `success` is a boolean, `toolCalls` is an array, and every tool call
 *   is an object with a string `name` and a non-null object `arguments`.
 */
function validateSimulationResult(result) {
  // `typeof null` is "object", which would make the messages misleading.
  const describe = (value) => value === null ? "null" : typeof value;
  if (result === null || typeof result !== "object") {
    return `Expected object, got ${describe(result)}`;
  }
  const obj = result;
  if (typeof obj.success !== "boolean") {
    return `"success" must be a boolean, got ${describe(obj.success)}`;
  }
  if (!Array.isArray(obj.toolCalls)) {
    return `"toolCalls" must be an array, got ${describe(obj.toolCalls)}`;
  }
  for (let i = 0; i < obj.toolCalls.length; i++) {
    const tc = obj.toolCalls[i];
    // Guard before touching properties: a null/undefined entry used to throw
    // a TypeError here instead of producing a validation message.
    if (tc === null || typeof tc !== "object") {
      return `toolCalls[${i}] must be an object, got ${describe(tc)}`;
    }
    if (typeof tc.name !== "string") {
      return `toolCalls[${i}].name must be a string, got ${describe(tc.name)}`;
    }
    if (typeof tc.arguments !== "object" || tc.arguments === null) {
      return `toolCalls[${i}].arguments must be an object, got ${describe(tc.arguments)}`;
    }
  }
  return null;
}
|
|
7101
|
+
/**
 * Spawns a command (no shell), closes its stdin, and collects its output.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{timeout: number}} options - `timeout` is the time limit in milliseconds.
 * @returns {Promise<{stdout: string, stderr: string}>} UTF-8 decoded output
 *   when the process exits with code 0.
 * @throws {Error} (as a rejection) when the process cannot be spawned, exits
 *   with a non-zero code or by signal, exceeds the time limit, or produces
 *   more than MAX_BUFFER bytes of combined stdout and stderr.
 */
function spawnProcess(command, args, options) {
  // How long a child gets to honour SIGTERM before SIGKILL is sent.
  const KILL_GRACE_MS = 2e3;
  return new Promise((resolve2, reject) => {
    const child = child_process.spawn(command, args, {
      stdio: ["pipe", "pipe", "pipe"]
    });
    // Nothing is written to the child; closing stdin gives it an immediate EOF.
    child.stdin.end();
    const stdoutChunks = [];
    const stderrChunks = [];
    let totalBytes = 0;
    let settled = false;
    let killTimer;
    const fail = (err) => {
      if (settled) return;
      settled = true;
      reject(err);
    };
    // Ask the child to stop, then force it if it ignores the request so a
    // stuck process is not left running after the promise has rejected.
    const terminate = () => {
      child.kill("SIGTERM");
      killTimer = setTimeout(() => child.kill("SIGKILL"), KILL_GRACE_MS);
      killTimer.unref();
    };
    const timer = setTimeout(() => {
      terminate();
      fail(new Error(`Process timed out after ${options.timeout}ms`));
    }, options.timeout);
    const collectInto = (chunks) => (chunk) => {
      // Once the outcome is decided, keep draining the pipe but store nothing.
      if (settled) return;
      totalBytes += chunk.length;
      if (totalBytes <= MAX_BUFFER) {
        chunks.push(chunk);
        return;
      }
      // Truncated output would parse incorrectly or not at all, so fail
      // explicitly instead of silently dropping the excess.
      clearTimeout(timer);
      terminate();
      fail(new Error(`Process output exceeded ${MAX_BUFFER} bytes`));
    };
    child.stdout.on("data", collectInto(stdoutChunks));
    child.stderr.on("data", collectInto(stderrChunks));
    child.on("error", (err) => {
      clearTimeout(timer);
      clearTimeout(killTimer);
      fail(err);
    });
    child.on("close", (code, signal) => {
      clearTimeout(timer);
      clearTimeout(killTimer);
      if (settled) return;
      settled = true;
      const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
      const stderr = Buffer.concat(stderrChunks).toString("utf-8");
      if (code !== 0) {
        // `code` is null when the child was ended by a signal; name the signal.
        const signalNote = code === null && signal ? ` (signal ${signal})` : "";
        reject(
          new Error(
            `Command failed with exit code ${code ?? "null"}${signalNote}` + (stderr ? `
stderr: ${stderr}` : "")
          )
        );
        return;
      }
      resolve2({ stdout, stderr });
    });
  });
}
|
|
7147
|
+
|
|
6629
7148
|
// src/evals/mcpHost/mcpHostSimulation.ts
|
|
6630
7149
|
var vercelOrchestrator = createVercelOrchestrator();
|
|
6631
7150
|
var allProviders = [
|
|
@@ -6643,6 +7162,25 @@ var simulatorRegistry = new Map(
|
|
|
6643
7162
|
allProviders.map((p) => [p, vercelOrchestrator])
|
|
6644
7163
|
);
|
|
6645
7164
|
async function simulateMCPHost(mcp, scenario, config) {
|
|
7165
|
+
const hostType = config.hostType ?? "sdk";
|
|
7166
|
+
if (hostType === "cli") {
|
|
7167
|
+
if (!config.cli) {
|
|
7168
|
+
throw new Error(
|
|
7169
|
+
`mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
|
|
7170
|
+
);
|
|
7171
|
+
}
|
|
7172
|
+
return runCLIHost(config.cli, scenario);
|
|
7173
|
+
}
|
|
7174
|
+
if (hostType === "browser" || hostType === "desktop") {
|
|
7175
|
+
throw new Error(
|
|
7176
|
+
`Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
|
|
7177
|
+
);
|
|
7178
|
+
}
|
|
7179
|
+
if (!config.provider) {
|
|
7180
|
+
throw new Error(
|
|
7181
|
+
`mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
|
|
7182
|
+
);
|
|
7183
|
+
}
|
|
6646
7184
|
const simulator = simulatorRegistry.get(config.provider);
|
|
6647
7185
|
if (!simulator) {
|
|
6648
7186
|
throw new Error(
|
|
@@ -6834,17 +7372,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6834
7372
|
};
|
|
6835
7373
|
}
|
|
6836
7374
|
if (expectBlock.passesJudge !== void 0) {
|
|
6837
|
-
const
|
|
6838
|
-
const
|
|
6839
|
-
|
|
6840
|
-
|
|
6841
|
-
|
|
6842
|
-
|
|
6843
|
-
|
|
6844
|
-
|
|
6845
|
-
|
|
6846
|
-
|
|
6847
|
-
|
|
7375
|
+
const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
|
|
7376
|
+
const judgeResultEntries = await Promise.all(
|
|
7377
|
+
judgeConfigs.map(async (judgeConfig) => {
|
|
7378
|
+
const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
|
|
7379
|
+
const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
|
|
7380
|
+
const validation = await validateJudge(response, {
|
|
7381
|
+
...judgeConfig,
|
|
7382
|
+
reference: effectiveReference,
|
|
7383
|
+
reps: effectiveReps
|
|
7384
|
+
});
|
|
7385
|
+
const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
|
|
7386
|
+
return {
|
|
7387
|
+
pass: validation.pass,
|
|
7388
|
+
details: validation.message,
|
|
7389
|
+
score: validation.details?.score,
|
|
7390
|
+
reasoning: validation.details?.reasoning,
|
|
7391
|
+
judgeName,
|
|
7392
|
+
judgeProvider: validation.details?.judgeProvider,
|
|
7393
|
+
judgeModel: validation.details?.judgeModel
|
|
7394
|
+
};
|
|
7395
|
+
})
|
|
7396
|
+
);
|
|
7397
|
+
if (judgeResultEntries.length === 1) {
|
|
7398
|
+
results.judge = judgeResultEntries[0];
|
|
7399
|
+
} else {
|
|
7400
|
+
const allPassed = judgeResultEntries.every((r) => r.pass);
|
|
7401
|
+
const passCount = judgeResultEntries.filter((r) => r.pass).length;
|
|
7402
|
+
results.judge = {
|
|
7403
|
+
pass: allPassed,
|
|
7404
|
+
details: `${passCount}/${judgeResultEntries.length} judges passed`,
|
|
7405
|
+
judgeResults: judgeResultEntries
|
|
7406
|
+
};
|
|
7407
|
+
}
|
|
6848
7408
|
}
|
|
6849
7409
|
if (expectBlock.snapshot !== void 0) {
|
|
6850
7410
|
if (!config.playwrightExpect) {
|
|
@@ -6873,6 +7433,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6873
7433
|
}
|
|
6874
7434
|
return { expectations: results, toolPrecision, toolRecall };
|
|
6875
7435
|
}
|
|
7436
|
+
/**
 * Produces the request summary stored with an eval result.
 *
 * For "mcp_host" cases the summary holds the scenario and only the `provider`
 * (plus `model` when defined) from the host configuration; for every other
 * mode it holds the tool `args`. Falsy fields are left out.
 *
 * @param {object} evalCase - Eval case being run.
 * @returns {object} Summary containing only the fields that were present.
 */
function buildRequest(evalCase) {
  const request = {};
  if (evalCase.description) {
    request.description = evalCase.description;
  }
  if (evalCase.mode !== "mcp_host") {
    if (evalCase.args) {
      request.args = evalCase.args;
    }
    return request;
  }
  if (evalCase.scenario) {
    request.scenario = evalCase.scenario;
  }
  if (evalCase.mcpHostConfig) {
    // `provider` is always recorded, even when undefined; `model` only when set.
    const hostSummary = { provider: evalCase.mcpHostConfig.provider };
    if (evalCase.mcpHostConfig.model !== void 0) {
      hostSummary.model = evalCase.mcpHostConfig.model;
    }
    request.mcpHostConfig = hostSummary;
  }
  return request;
}
|
|
6876
7454
|
/**
 * Tells whether a value looks like an MCP host simulation result.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True for a non-null object that has a `success` property
 *   and a `toolCalls` property holding an array.
 */
function isMCPHostSimulationResult(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!("success" in value) || !("toolCalls" in value)) {
    return false;
  }
  return Array.isArray(value.toolCalls);
}
|
|
@@ -6921,6 +7499,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6921
7499
|
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6922
7500
|
source: "eval",
|
|
6923
7501
|
pass: didCasePass(error, expectationResults),
|
|
7502
|
+
request: buildRequest(evalCase),
|
|
6924
7503
|
response,
|
|
6925
7504
|
error,
|
|
6926
7505
|
expectations: expectationResults,
|
|
@@ -7063,8 +7642,13 @@ async function runEvalDataset(options, context) {
|
|
|
7063
7642
|
const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
|
|
7064
7643
|
const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
|
|
7065
7644
|
const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
|
|
7066
|
-
|
|
7067
|
-
|
|
7645
|
+
if (c.expect?.passesJudge == null) return sum;
|
|
7646
|
+
const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
|
|
7647
|
+
const totalReps = judges.reduce(
|
|
7648
|
+
(r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
|
|
7649
|
+
0
|
|
7650
|
+
);
|
|
7651
|
+
return sum + effectiveIterations * totalReps;
|
|
7068
7652
|
}, 0);
|
|
7069
7653
|
if (estimatedJudgeCalls > 50) {
|
|
7070
7654
|
debugEval(
|
|
@@ -7421,6 +8005,7 @@ exports.EvalDatasetSchema = EvalDatasetSchema;
|
|
|
7421
8005
|
exports.MCPConfigSchema = MCPConfigSchema;
|
|
7422
8006
|
exports.MCP_PROTOCOL_VERSION = MCP_PROTOCOL_VERSION;
|
|
7423
8007
|
exports.SnapshotSanitizers = SnapshotSanitizers;
|
|
8008
|
+
exports.clearJudgeRegistry = clearJudgeRegistry;
|
|
7424
8009
|
exports.closeMCPClient = closeMCPClient;
|
|
7425
8010
|
exports.createJudge = createJudge;
|
|
7426
8011
|
exports.createMCPClientForConfig = createMCPClientForConfig;
|
|
@@ -7431,6 +8016,7 @@ exports.discoverProtectedResource = discoverProtectedResource;
|
|
|
7431
8016
|
exports.expect = expect;
|
|
7432
8017
|
exports.extractText = extractText;
|
|
7433
8018
|
exports.getMissingDependencyMessage = getMissingDependencyMessage;
|
|
8019
|
+
exports.getRegisteredJudge = getRegisteredJudge;
|
|
7434
8020
|
exports.getResponseSizeBytes = getResponseSizeBytes;
|
|
7435
8021
|
exports.hasValidTokens = hasValidTokens;
|
|
7436
8022
|
exports.injectTokens = injectTokens;
|
|
@@ -7451,6 +8037,7 @@ exports.normalizeWhitespace = normalizeWhitespace;
|
|
|
7451
8037
|
exports.performClientCredentialsFlow = performClientCredentialsFlow;
|
|
7452
8038
|
exports.performOAuthSetup = performOAuthSetup;
|
|
7453
8039
|
exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
|
|
8040
|
+
exports.registerJudge = registerJudge;
|
|
7454
8041
|
exports.resolveRubric = resolveRubric;
|
|
7455
8042
|
exports.runConformanceChecks = runConformanceChecks;
|
|
7456
8043
|
exports.runEvalCase = runEvalCase;
|