@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +284 -24
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +649 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +504 -115
- package/dist/index.d.ts +504 -115
- package/dist/index.js +648 -64
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +12 -7
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/index.js
CHANGED
|
@@ -18,7 +18,7 @@ import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
|
|
|
18
18
|
import { ProxyAgent, Agent } from 'undici';
|
|
19
19
|
import { readFileSync } from 'fs';
|
|
20
20
|
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
21
|
-
import { execFile } from 'child_process';
|
|
21
|
+
import { execFile, spawn } from 'child_process';
|
|
22
22
|
import { promisify } from 'util';
|
|
23
23
|
|
|
24
24
|
var __defProp = Object.defineProperty;
|
|
@@ -3279,7 +3279,11 @@ async function performOAuthSetup(config) {
|
|
|
3279
3279
|
const page = await context.newPage();
|
|
3280
3280
|
page.setDefaultTimeout(timeoutMs);
|
|
3281
3281
|
await page.goto(authorizationUrl.toString());
|
|
3282
|
-
|
|
3282
|
+
if ("customLoginFlow" in config && config.customLoginFlow) {
|
|
3283
|
+
await config.customLoginFlow(page);
|
|
3284
|
+
} else {
|
|
3285
|
+
await completeLoginForm(page, config);
|
|
3286
|
+
}
|
|
3283
3287
|
await page.waitForURL(
|
|
3284
3288
|
(url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
|
|
3285
3289
|
{ timeout: timeoutMs }
|
|
@@ -4380,7 +4384,7 @@ function escapeHtml(text) {
|
|
|
4380
4384
|
|
|
4381
4385
|
// package.json
|
|
4382
4386
|
var package_default = {
|
|
4383
|
-
version: "1.0.0-beta.6"};
|
|
4387
|
+
version: "1.0.0-beta.8"};
|
|
4384
4388
|
|
|
4385
4389
|
// src/mcp/clientFactory.ts
|
|
4386
4390
|
function getRetryAfterDelayMs(err) {
|
|
@@ -5097,9 +5101,17 @@ function formatBytes(bytes) {
|
|
|
5097
5101
|
function isSimulationResult(value) {
|
|
5098
5102
|
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
5099
5103
|
}
|
|
5104
|
+
function isPatternMatcher(v) {
|
|
5105
|
+
return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
|
|
5106
|
+
}
|
|
5100
5107
|
function partialMatch(actual, expected) {
|
|
5101
5108
|
return Object.entries(expected).every(([k, v]) => {
|
|
5102
5109
|
const actualVal = actual[k];
|
|
5110
|
+
if (isPatternMatcher(v)) {
|
|
5111
|
+
if (typeof actualVal !== "string") return false;
|
|
5112
|
+
const re = new RegExp(v.$pattern, v.$flags);
|
|
5113
|
+
return re.test(actualVal);
|
|
5114
|
+
}
|
|
5103
5115
|
if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
|
|
5104
5116
|
return partialMatch(
|
|
5105
5117
|
actualVal,
|
|
@@ -5238,7 +5250,175 @@ var JudgeResponseSchema = z.object({
|
|
|
5238
5250
|
reasoning: z.string()
|
|
5239
5251
|
});
|
|
5240
5252
|
|
|
5241
|
-
// src/judge/
|
|
5253
|
+
// src/judge/anthropicJudge.ts
|
|
5254
|
+
function createAnthropicJudge(config = {}) {
|
|
5255
|
+
const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
|
|
5256
|
+
const apiKey = process.env[apiKeyEnvVar];
|
|
5257
|
+
if (!apiKey) {
|
|
5258
|
+
throw new Error(
|
|
5259
|
+
`Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
|
|
5260
|
+
);
|
|
5261
|
+
}
|
|
5262
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5263
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
5264
|
+
const temperature = config.temperature ?? 0;
|
|
5265
|
+
return {
|
|
5266
|
+
async evaluate(candidate, reference, rubric) {
|
|
5267
|
+
let anthropicModule;
|
|
5268
|
+
try {
|
|
5269
|
+
anthropicModule = await import('@anthropic-ai/sdk');
|
|
5270
|
+
} catch (err) {
|
|
5271
|
+
throw new Error(
|
|
5272
|
+
`Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
|
|
5273
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
5274
|
+
);
|
|
5275
|
+
}
|
|
5276
|
+
const client = new anthropicModule.default({ apiKey });
|
|
5277
|
+
const prompt = buildJudgePrompt(candidate, reference, rubric);
|
|
5278
|
+
const startTime = Date.now();
|
|
5279
|
+
const response = await client.messages.create({
|
|
5280
|
+
model,
|
|
5281
|
+
max_tokens: maxTokens,
|
|
5282
|
+
temperature,
|
|
5283
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
5284
|
+
messages: [{ role: "user", content: prompt }]
|
|
5285
|
+
});
|
|
5286
|
+
const durationMs = Date.now() - startTime;
|
|
5287
|
+
const textBlock = response.content.find(
|
|
5288
|
+
(b) => b.type === "text"
|
|
5289
|
+
);
|
|
5290
|
+
const text = textBlock?.text ?? "";
|
|
5291
|
+
const parsed = parseJudgeResponse(text);
|
|
5292
|
+
return {
|
|
5293
|
+
pass: parsed.pass,
|
|
5294
|
+
score: parsed.score,
|
|
5295
|
+
reasoning: parsed.reasoning,
|
|
5296
|
+
usage: {
|
|
5297
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
5298
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
5299
|
+
totalCostUsd: 0,
|
|
5300
|
+
durationMs
|
|
5301
|
+
}
|
|
5302
|
+
};
|
|
5303
|
+
}
|
|
5304
|
+
};
|
|
5305
|
+
}
|
|
5306
|
+
function buildJudgePrompt(candidate, reference, rubric) {
|
|
5307
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5308
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5309
|
+
return `Rubric:
|
|
5310
|
+
${rubric}
|
|
5311
|
+
|
|
5312
|
+
<candidate_response>
|
|
5313
|
+
${candidateStr}
|
|
5314
|
+
</candidate_response>
|
|
5315
|
+
|
|
5316
|
+
<reference_answer>
|
|
5317
|
+
${referenceStr ?? "No reference provided."}
|
|
5318
|
+
</reference_answer>
|
|
5319
|
+
|
|
5320
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5321
|
+
}
|
|
5322
|
+
function parseJudgeResponse(text) {
|
|
5323
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5324
|
+
let parsed;
|
|
5325
|
+
try {
|
|
5326
|
+
parsed = JSON.parse(cleaned);
|
|
5327
|
+
} catch {
|
|
5328
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
5329
|
+
}
|
|
5330
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
5331
|
+
if (!result.success) {
|
|
5332
|
+
throw new Error(
|
|
5333
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
5334
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
5335
|
+
);
|
|
5336
|
+
}
|
|
5337
|
+
return result.data;
|
|
5338
|
+
}
|
|
5339
|
+
|
|
5340
|
+
// src/judge/vertexAnthropicJudge.ts
|
|
5341
|
+
function createVertexAnthropicJudge(config = {}) {
|
|
5342
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5343
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
5344
|
+
const temperature = config.temperature ?? 0;
|
|
5345
|
+
return {
|
|
5346
|
+
async evaluate(candidate, reference, rubric) {
|
|
5347
|
+
let vertexModule;
|
|
5348
|
+
try {
|
|
5349
|
+
vertexModule = await import('@anthropic-ai/vertex-sdk');
|
|
5350
|
+
} catch (err) {
|
|
5351
|
+
throw new Error(
|
|
5352
|
+
`Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
|
|
5353
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
5354
|
+
);
|
|
5355
|
+
}
|
|
5356
|
+
const client = new vertexModule.AnthropicVertex({
|
|
5357
|
+
projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
|
|
5358
|
+
region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
|
|
5359
|
+
});
|
|
5360
|
+
const prompt = buildJudgePrompt2(candidate, reference, rubric);
|
|
5361
|
+
const startTime = Date.now();
|
|
5362
|
+
const response = await client.messages.create({
|
|
5363
|
+
model,
|
|
5364
|
+
max_tokens: maxTokens,
|
|
5365
|
+
temperature,
|
|
5366
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
5367
|
+
messages: [{ role: "user", content: prompt }]
|
|
5368
|
+
});
|
|
5369
|
+
const durationMs = Date.now() - startTime;
|
|
5370
|
+
const textBlock = response.content.find(
|
|
5371
|
+
(b) => b.type === "text"
|
|
5372
|
+
);
|
|
5373
|
+
const text = textBlock?.text ?? "";
|
|
5374
|
+
const parsed = parseJudgeResponse2(text);
|
|
5375
|
+
return {
|
|
5376
|
+
pass: parsed.pass,
|
|
5377
|
+
score: parsed.score,
|
|
5378
|
+
reasoning: parsed.reasoning,
|
|
5379
|
+
usage: {
|
|
5380
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
5381
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
5382
|
+
totalCostUsd: 0,
|
|
5383
|
+
durationMs
|
|
5384
|
+
}
|
|
5385
|
+
};
|
|
5386
|
+
}
|
|
5387
|
+
};
|
|
5388
|
+
}
|
|
5389
|
+
function buildJudgePrompt2(candidate, reference, rubric) {
|
|
5390
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5391
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5392
|
+
return `Rubric:
|
|
5393
|
+
${rubric}
|
|
5394
|
+
|
|
5395
|
+
<candidate_response>
|
|
5396
|
+
${candidateStr}
|
|
5397
|
+
</candidate_response>
|
|
5398
|
+
|
|
5399
|
+
<reference_answer>
|
|
5400
|
+
${referenceStr ?? "No reference provided."}
|
|
5401
|
+
</reference_answer>
|
|
5402
|
+
|
|
5403
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5404
|
+
}
|
|
5405
|
+
function parseJudgeResponse2(text) {
|
|
5406
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5407
|
+
let parsed;
|
|
5408
|
+
try {
|
|
5409
|
+
parsed = JSON.parse(cleaned);
|
|
5410
|
+
} catch {
|
|
5411
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
5412
|
+
}
|
|
5413
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
5414
|
+
if (!result.success) {
|
|
5415
|
+
throw new Error(
|
|
5416
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
5417
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
5418
|
+
);
|
|
5419
|
+
}
|
|
5420
|
+
return result.data;
|
|
5421
|
+
}
|
|
5242
5422
|
function createClaudeAgentJudge(config) {
|
|
5243
5423
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5244
5424
|
const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
|
|
@@ -5256,7 +5436,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5256
5436
|
exceedsMaxToolOutputSize: true
|
|
5257
5437
|
};
|
|
5258
5438
|
}
|
|
5259
|
-
const prompt = buildJudgePrompt(candidate, reference, rubric);
|
|
5439
|
+
const prompt = buildJudgePrompt3(candidate, reference, rubric);
|
|
5260
5440
|
try {
|
|
5261
5441
|
let resultMessage;
|
|
5262
5442
|
for await (const message of query({
|
|
@@ -5288,7 +5468,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5288
5468
|
);
|
|
5289
5469
|
}
|
|
5290
5470
|
const responseText = resultMessage.result ?? "";
|
|
5291
|
-
const parsed =
|
|
5471
|
+
const parsed = parseJudgeResponse3(responseText);
|
|
5292
5472
|
const usage = {
|
|
5293
5473
|
inputTokens: resultMessage.usage?.input_tokens ?? 0,
|
|
5294
5474
|
outputTokens: resultMessage.usage?.output_tokens ?? 0,
|
|
@@ -5317,7 +5497,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5317
5497
|
function buildSystemPrompt() {
|
|
5318
5498
|
return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
|
|
5319
5499
|
}
|
|
5320
|
-
function buildJudgePrompt(candidate, reference, rubric) {
|
|
5500
|
+
function buildJudgePrompt3(candidate, reference, rubric) {
|
|
5321
5501
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5322
5502
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5323
5503
|
const parts = [];
|
|
@@ -5334,7 +5514,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
|
|
|
5334
5514
|
);
|
|
5335
5515
|
return parts.join("");
|
|
5336
5516
|
}
|
|
5337
|
-
function
|
|
5517
|
+
function parseJudgeResponse3(text) {
|
|
5338
5518
|
let jsonText = text.trim();
|
|
5339
5519
|
if (jsonText.startsWith("```json")) {
|
|
5340
5520
|
jsonText = jsonText.slice(7);
|
|
@@ -5391,7 +5571,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5391
5571
|
);
|
|
5392
5572
|
}
|
|
5393
5573
|
const client = new openaiModule.default({ apiKey });
|
|
5394
|
-
const prompt =
|
|
5574
|
+
const prompt = buildJudgePrompt4(candidate, reference, rubric);
|
|
5395
5575
|
const startTime = Date.now();
|
|
5396
5576
|
const completion = await client.chat.completions.create({
|
|
5397
5577
|
model,
|
|
@@ -5407,7 +5587,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5407
5587
|
});
|
|
5408
5588
|
const durationMs = Date.now() - startTime;
|
|
5409
5589
|
const text = completion.choices[0]?.message.content ?? "";
|
|
5410
|
-
const parsed =
|
|
5590
|
+
const parsed = parseJudgeResponse4(text);
|
|
5411
5591
|
return {
|
|
5412
5592
|
pass: parsed.pass,
|
|
5413
5593
|
score: parsed.score,
|
|
@@ -5422,7 +5602,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5422
5602
|
}
|
|
5423
5603
|
};
|
|
5424
5604
|
}
|
|
5425
|
-
function
|
|
5605
|
+
function buildJudgePrompt4(candidate, reference, rubric) {
|
|
5426
5606
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5427
5607
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5428
5608
|
return `Rubric:
|
|
@@ -5438,7 +5618,7 @@ ${referenceStr ?? "No reference provided."}
|
|
|
5438
5618
|
|
|
5439
5619
|
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5440
5620
|
}
|
|
5441
|
-
function
|
|
5621
|
+
function parseJudgeResponse4(text) {
|
|
5442
5622
|
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5443
5623
|
let parsed;
|
|
5444
5624
|
try {
|
|
@@ -5540,6 +5720,10 @@ function createJudge(config = {}) {
|
|
|
5540
5720
|
const provider = config.provider ?? "anthropic";
|
|
5541
5721
|
switch (provider) {
|
|
5542
5722
|
case "anthropic":
|
|
5723
|
+
return createAnthropicJudge(config);
|
|
5724
|
+
case "vertex-anthropic":
|
|
5725
|
+
return createVertexAnthropicJudge(config);
|
|
5726
|
+
case "anthropic-agent-sdk":
|
|
5543
5727
|
return createClaudeAgentJudge(config);
|
|
5544
5728
|
case "openai":
|
|
5545
5729
|
return createOpenAIJudge(config);
|
|
@@ -5550,6 +5734,34 @@ function createJudge(config = {}) {
|
|
|
5550
5734
|
}
|
|
5551
5735
|
}
|
|
5552
5736
|
|
|
5737
|
+
// src/judge/judgeRegistry.ts
|
|
5738
|
+
var registry = /* @__PURE__ */ new Map();
|
|
5739
|
+
function registerJudge(name15, executor) {
|
|
5740
|
+
const existing = registry.get(name15);
|
|
5741
|
+
if (existing !== void 0) {
|
|
5742
|
+
if (existing === executor) {
|
|
5743
|
+
return;
|
|
5744
|
+
}
|
|
5745
|
+
throw new Error(
|
|
5746
|
+
`Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
|
|
5747
|
+
);
|
|
5748
|
+
}
|
|
5749
|
+
registry.set(name15, executor);
|
|
5750
|
+
}
|
|
5751
|
+
function getRegisteredJudge(name15) {
|
|
5752
|
+
const executor = registry.get(name15);
|
|
5753
|
+
if (!executor) {
|
|
5754
|
+
const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
|
|
5755
|
+
throw new Error(
|
|
5756
|
+
`Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
|
|
5757
|
+
);
|
|
5758
|
+
}
|
|
5759
|
+
return executor;
|
|
5760
|
+
}
|
|
5761
|
+
function clearJudgeRegistry() {
|
|
5762
|
+
registry.clear();
|
|
5763
|
+
}
|
|
5764
|
+
|
|
5553
5765
|
// src/assertions/validators/judge.ts
|
|
5554
5766
|
function computeStdDev(scores, mean) {
|
|
5555
5767
|
if (scores.length <= 1) return 0;
|
|
@@ -5558,6 +5770,7 @@ function computeStdDev(scores, mean) {
|
|
|
5558
5770
|
}
|
|
5559
5771
|
async function validateJudge(response, config) {
|
|
5560
5772
|
const {
|
|
5773
|
+
judge: judgeName,
|
|
5561
5774
|
rubric,
|
|
5562
5775
|
reference,
|
|
5563
5776
|
threshold = 0.7,
|
|
@@ -5570,6 +5783,29 @@ async function validateJudge(response, config) {
|
|
|
5570
5783
|
maxBudgetUsd,
|
|
5571
5784
|
maxToolOutputSize
|
|
5572
5785
|
} = config;
|
|
5786
|
+
if (judgeName !== void 0) {
|
|
5787
|
+
try {
|
|
5788
|
+
const executor = getRegisteredJudge(judgeName);
|
|
5789
|
+
const judgeResult = await executor(response, reference ?? void 0);
|
|
5790
|
+
const score = judgeResult.score;
|
|
5791
|
+
const passed = score >= threshold;
|
|
5792
|
+
return {
|
|
5793
|
+
pass: passed,
|
|
5794
|
+
message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
|
|
5795
|
+
};
|
|
5796
|
+
} catch (err) {
|
|
5797
|
+
return {
|
|
5798
|
+
pass: false,
|
|
5799
|
+
message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
|
|
5800
|
+
};
|
|
5801
|
+
}
|
|
5802
|
+
}
|
|
5803
|
+
if (rubric === void 0) {
|
|
5804
|
+
return {
|
|
5805
|
+
pass: false,
|
|
5806
|
+
message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
|
|
5807
|
+
};
|
|
5808
|
+
}
|
|
5573
5809
|
const resolvedRubric = resolveRubric(rubric);
|
|
5574
5810
|
const judgeConfig = {
|
|
5575
5811
|
...provider !== void 0 && { provider },
|
|
@@ -5616,11 +5852,17 @@ async function validateJudge(response, config) {
|
|
|
5616
5852
|
return {
|
|
5617
5853
|
pass: passed,
|
|
5618
5854
|
message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
|
|
5619
|
-
details:
|
|
5620
|
-
|
|
5621
|
-
|
|
5622
|
-
|
|
5623
|
-
|
|
5855
|
+
details: {
|
|
5856
|
+
score: meanScore,
|
|
5857
|
+
reasoning: lastReasoning,
|
|
5858
|
+
judgeProvider: provider ?? "anthropic",
|
|
5859
|
+
judgeModel: model,
|
|
5860
|
+
...reps > 1 && {
|
|
5861
|
+
scores,
|
|
5862
|
+
scoreStdDev: stdDev,
|
|
5863
|
+
highVariance
|
|
5864
|
+
}
|
|
5865
|
+
}
|
|
5624
5866
|
};
|
|
5625
5867
|
} catch (err) {
|
|
5626
5868
|
return {
|
|
@@ -5999,31 +6241,68 @@ function toBeToolError(received, expected = true) {
|
|
|
5999
6241
|
|
|
6000
6242
|
// src/assertions/matchers/toPassToolJudge.ts
|
|
6001
6243
|
var DEFAULT_PASSING_THRESHOLD = 0.7;
|
|
6002
|
-
async function
|
|
6244
|
+
async function runSingleJudge(received, rubric, options) {
|
|
6003
6245
|
const {
|
|
6004
6246
|
reference = null,
|
|
6005
6247
|
passingThreshold = DEFAULT_PASSING_THRESHOLD,
|
|
6006
6248
|
reps,
|
|
6007
6249
|
provider,
|
|
6008
|
-
model
|
|
6250
|
+
model,
|
|
6251
|
+
judge
|
|
6009
6252
|
} = options;
|
|
6010
6253
|
const validation = await validateJudge(received, {
|
|
6011
|
-
rubric,
|
|
6254
|
+
...rubric !== void 0 && { rubric },
|
|
6012
6255
|
reference: reference ?? void 0,
|
|
6013
6256
|
threshold: passingThreshold,
|
|
6014
6257
|
...reps !== void 0 && { reps },
|
|
6015
6258
|
...provider !== void 0 && { provider },
|
|
6016
|
-
...model !== void 0 && { model }
|
|
6259
|
+
...model !== void 0 && { model },
|
|
6260
|
+
...judge !== void 0 && { judge }
|
|
6017
6261
|
});
|
|
6262
|
+
return { pass: validation.pass, message: validation.message };
|
|
6263
|
+
}
|
|
6264
|
+
async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
|
|
6265
|
+
if (Array.isArray(rubricOrOptions)) {
|
|
6266
|
+
const results = await Promise.all(
|
|
6267
|
+
rubricOrOptions.map(async (judgeConfig) => {
|
|
6268
|
+
const { rubric: r, ...opts } = judgeConfig;
|
|
6269
|
+
return runSingleJudge(received, r, opts);
|
|
6270
|
+
})
|
|
6271
|
+
);
|
|
6272
|
+
const allPassed = results.every((r) => r.pass);
|
|
6273
|
+
const passCount = results.filter((r) => r.pass).length;
|
|
6274
|
+
const summary = `${passCount}/${results.length} judges passed`;
|
|
6275
|
+
const details = results.map((r) => r.message).join("\n");
|
|
6276
|
+
if (this.isNot) {
|
|
6277
|
+
return {
|
|
6278
|
+
pass: !allPassed,
|
|
6279
|
+
message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
|
|
6280
|
+
};
|
|
6281
|
+
}
|
|
6282
|
+
return {
|
|
6283
|
+
pass: allPassed,
|
|
6284
|
+
message: () => `${summary}
|
|
6285
|
+
${details}`
|
|
6286
|
+
};
|
|
6287
|
+
}
|
|
6288
|
+
let rubric;
|
|
6289
|
+
let options;
|
|
6290
|
+
if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
|
|
6291
|
+
rubric = rubricOrOptions;
|
|
6292
|
+
options = maybeOptions ?? {};
|
|
6293
|
+
} else {
|
|
6294
|
+
options = rubricOrOptions;
|
|
6295
|
+
}
|
|
6296
|
+
const result = await runSingleJudge(received, rubric, options);
|
|
6018
6297
|
if (this.isNot) {
|
|
6019
6298
|
return {
|
|
6020
|
-
pass: !
|
|
6021
|
-
message: () =>
|
|
6299
|
+
pass: !result.pass,
|
|
6300
|
+
message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
|
|
6022
6301
|
};
|
|
6023
6302
|
}
|
|
6024
6303
|
return {
|
|
6025
|
-
pass:
|
|
6026
|
-
message: () =>
|
|
6304
|
+
pass: result.pass,
|
|
6305
|
+
message: () => result.message
|
|
6027
6306
|
};
|
|
6028
6307
|
}
|
|
6029
6308
|
|
|
@@ -6307,6 +6586,7 @@ function getAuthConfigFromEnv() {
|
|
|
6307
6586
|
return void 0;
|
|
6308
6587
|
}
|
|
6309
6588
|
var MCPHostConfigSchema = z.object({
|
|
6589
|
+
hostType: z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
|
|
6310
6590
|
provider: z.enum([
|
|
6311
6591
|
"openai",
|
|
6312
6592
|
"anthropic",
|
|
@@ -6317,12 +6597,18 @@ var MCPHostConfigSchema = z.object({
|
|
|
6317
6597
|
"openrouter",
|
|
6318
6598
|
"xai",
|
|
6319
6599
|
"vertex-anthropic"
|
|
6320
|
-
]),
|
|
6600
|
+
]).optional(),
|
|
6321
6601
|
apiKeyEnvVar: z.string().optional(),
|
|
6322
6602
|
model: z.string().optional(),
|
|
6323
6603
|
maxTokens: z.number().optional(),
|
|
6324
6604
|
temperature: z.number().optional(),
|
|
6325
|
-
maxToolCalls: z.number().optional()
|
|
6605
|
+
maxToolCalls: z.number().optional(),
|
|
6606
|
+
cli: z.object({
|
|
6607
|
+
command: z.string(),
|
|
6608
|
+
args: z.array(z.string()),
|
|
6609
|
+
outputFormat: z.enum(["stream-json", "json"]).optional(),
|
|
6610
|
+
timeout: z.number().optional()
|
|
6611
|
+
}).optional()
|
|
6326
6612
|
});
|
|
6327
6613
|
var SnapshotSanitizerSchema = z.union([
|
|
6328
6614
|
// Built-in sanitizers
|
|
@@ -6337,6 +6623,37 @@ var SnapshotSanitizerSchema = z.union([
|
|
|
6337
6623
|
remove: z.array(z.string())
|
|
6338
6624
|
})
|
|
6339
6625
|
]);
|
|
6626
|
+
var JudgeExpectConfigSchema = z.object({
|
|
6627
|
+
judge: z.string().min(1).optional(),
|
|
6628
|
+
rubric: z.union([
|
|
6629
|
+
z.enum([
|
|
6630
|
+
"correctness",
|
|
6631
|
+
"completeness",
|
|
6632
|
+
"groundedness",
|
|
6633
|
+
"instruction-following",
|
|
6634
|
+
"conciseness"
|
|
6635
|
+
]),
|
|
6636
|
+
z.object({ text: z.string().min(1) })
|
|
6637
|
+
]).optional(),
|
|
6638
|
+
reference: z.unknown().optional(),
|
|
6639
|
+
threshold: z.number().min(0).max(1).optional(),
|
|
6640
|
+
reps: z.number().int().min(1).optional(),
|
|
6641
|
+
provider: z.enum([
|
|
6642
|
+
"anthropic",
|
|
6643
|
+
"vertex-anthropic",
|
|
6644
|
+
"anthropic-agent-sdk",
|
|
6645
|
+
"openai",
|
|
6646
|
+
"google"
|
|
6647
|
+
]).optional(),
|
|
6648
|
+
model: z.string().optional(),
|
|
6649
|
+
apiKeyEnvVar: z.string().optional(),
|
|
6650
|
+
maxTokens: z.number().int().positive().optional(),
|
|
6651
|
+
temperature: z.number().min(0).max(1).optional(),
|
|
6652
|
+
maxBudgetUsd: z.number().positive().optional(),
|
|
6653
|
+
maxToolOutputSize: z.number().int().positive().optional()
|
|
6654
|
+
}).refine((data) => data.judge !== void 0 || data.rubric !== void 0, {
|
|
6655
|
+
message: 'Either "judge" or "rubric" must be provided in passesJudge'
|
|
6656
|
+
});
|
|
6340
6657
|
var EvalExpectBlockSchema = z.object({
|
|
6341
6658
|
response: z.unknown().optional(),
|
|
6342
6659
|
schema: z.string().optional(),
|
|
@@ -6345,28 +6662,7 @@ var EvalExpectBlockSchema = z.object({
|
|
|
6345
6662
|
snapshot: z.string().optional(),
|
|
6346
6663
|
snapshotSanitizers: z.array(SnapshotSanitizerSchema).optional(),
|
|
6347
6664
|
isError: z.union([z.boolean(), z.string(), z.array(z.string())]).optional(),
|
|
6348
|
-
passesJudge: z.object({
|
|
6349
|
-
rubric: z.union([
|
|
6350
|
-
z.enum([
|
|
6351
|
-
"correctness",
|
|
6352
|
-
"completeness",
|
|
6353
|
-
"groundedness",
|
|
6354
|
-
"instruction-following",
|
|
6355
|
-
"conciseness"
|
|
6356
|
-
]),
|
|
6357
|
-
z.object({ text: z.string().min(1) })
|
|
6358
|
-
]),
|
|
6359
|
-
reference: z.unknown().optional(),
|
|
6360
|
-
threshold: z.number().min(0).max(1).optional(),
|
|
6361
|
-
reps: z.number().int().min(1).optional(),
|
|
6362
|
-
provider: z.enum(["anthropic", "openai", "google"]).optional(),
|
|
6363
|
-
model: z.string().optional(),
|
|
6364
|
-
apiKeyEnvVar: z.string().optional(),
|
|
6365
|
-
maxTokens: z.number().int().positive().optional(),
|
|
6366
|
-
temperature: z.number().min(0).max(1).optional(),
|
|
6367
|
-
maxBudgetUsd: z.number().positive().optional(),
|
|
6368
|
-
maxToolOutputSize: z.number().int().positive().optional()
|
|
6369
|
-
}).optional(),
|
|
6665
|
+
passesJudge: z.union([JudgeExpectConfigSchema, z.array(JudgeExpectConfigSchema).min(1)]).optional(),
|
|
6370
6666
|
responseSize: z.object({
|
|
6371
6667
|
maxBytes: z.number().optional(),
|
|
6372
6668
|
minBytes: z.number().optional()
|
|
@@ -6539,6 +6835,9 @@ function createVercelOrchestrator() {
|
|
|
6539
6835
|
try {
|
|
6540
6836
|
const { generateText, stepCountIs } = await import('ai');
|
|
6541
6837
|
const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
|
|
6838
|
+
if (!config.provider) {
|
|
6839
|
+
throw new Error("provider is required for SDK host type");
|
|
6840
|
+
}
|
|
6542
6841
|
const modelId = config.model ?? defaultModel(config.provider);
|
|
6543
6842
|
const model = await loadModel(config.provider, modelId);
|
|
6544
6843
|
const mcpTools = await mcp.listTools();
|
|
@@ -6592,13 +6891,233 @@ function createVercelOrchestrator() {
|
|
|
6592
6891
|
return {
|
|
6593
6892
|
success: false,
|
|
6594
6893
|
toolCalls: [],
|
|
6595
|
-
error: enrichErrorMessage(err, config.provider)
|
|
6894
|
+
error: enrichErrorMessage(err, config.provider ?? "unknown")
|
|
6596
6895
|
};
|
|
6597
6896
|
}
|
|
6598
6897
|
}
|
|
6599
6898
|
};
|
|
6600
6899
|
}
|
|
6601
6900
|
|
|
6901
|
+
// src/evals/mcpHost/adapters/cli/parsers.ts
|
|
6902
|
+
function parseStreamJson(stdout) {
|
|
6903
|
+
const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
|
|
6904
|
+
const toolCalls = [];
|
|
6905
|
+
const textParts = [];
|
|
6906
|
+
const conversationHistory = [];
|
|
6907
|
+
for (const line of lines) {
|
|
6908
|
+
let event;
|
|
6909
|
+
try {
|
|
6910
|
+
event = JSON.parse(line);
|
|
6911
|
+
} catch {
|
|
6912
|
+
continue;
|
|
6913
|
+
}
|
|
6914
|
+
if (event.type === "assistant" && event.message?.content) {
|
|
6915
|
+
for (const block of event.message.content) {
|
|
6916
|
+
if (block.type === "tool_use" && block.name) {
|
|
6917
|
+
const rawName = block.name;
|
|
6918
|
+
const mcpMatch = /^mcp__[^_]+__(.+)$/.exec(rawName);
|
|
6919
|
+
toolCalls.push({
|
|
6920
|
+
name: mcpMatch ? mcpMatch[1] : rawName,
|
|
6921
|
+
arguments: block.input ?? {},
|
|
6922
|
+
id: block.id
|
|
6923
|
+
});
|
|
6924
|
+
}
|
|
6925
|
+
if (block.type === "text" && block.text) {
|
|
6926
|
+
textParts.push(block.text);
|
|
6927
|
+
}
|
|
6928
|
+
}
|
|
6929
|
+
}
|
|
6930
|
+
if (event.type === "user" && event.message?.content) {
|
|
6931
|
+
for (const block of event.message.content) {
|
|
6932
|
+
if (block.type === "tool_result") {
|
|
6933
|
+
const content = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
|
|
6934
|
+
conversationHistory.push({ role: "tool", content });
|
|
6935
|
+
}
|
|
6936
|
+
}
|
|
6937
|
+
}
|
|
6938
|
+
if (event.type === "result" && typeof event.result === "string") {
|
|
6939
|
+
if (textParts.length === 0) {
|
|
6940
|
+
textParts.push(event.result);
|
|
6941
|
+
}
|
|
6942
|
+
}
|
|
6943
|
+
if (event.type === "result" && event.is_error === true) {
|
|
6944
|
+
return {
|
|
6945
|
+
success: false,
|
|
6946
|
+
toolCalls,
|
|
6947
|
+
error: typeof event.result === "string" ? event.result : "CLI host reported an error"
|
|
6948
|
+
};
|
|
6949
|
+
}
|
|
6950
|
+
}
|
|
6951
|
+
const response = textParts.join("");
|
|
6952
|
+
if (response) {
|
|
6953
|
+
conversationHistory.push({ role: "assistant", content: response });
|
|
6954
|
+
}
|
|
6955
|
+
return {
|
|
6956
|
+
success: true,
|
|
6957
|
+
toolCalls,
|
|
6958
|
+
response: response || void 0,
|
|
6959
|
+
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
|
|
6960
|
+
};
|
|
6961
|
+
}
|
|
6962
|
+
function createJsonParser(paths) {
|
|
6963
|
+
return (stdout) => {
|
|
6964
|
+
const data = JSON.parse(stdout);
|
|
6965
|
+
const rawToolCalls = getNestedValue(data, paths.toolCalls);
|
|
6966
|
+
const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls.map((tc) => ({
|
|
6967
|
+
name: typeof tc.name === "string" ? tc.name : "",
|
|
6968
|
+
arguments: tc.arguments ?? tc.args ?? {}
|
|
6969
|
+
})) : [];
|
|
6970
|
+
const response = getNestedValue(data, paths.response);
|
|
6971
|
+
const success = paths.success ? Boolean(getNestedValue(data, paths.success)) : true;
|
|
6972
|
+
return {
|
|
6973
|
+
success,
|
|
6974
|
+
toolCalls,
|
|
6975
|
+
response: typeof response === "string" ? response : void 0
|
|
6976
|
+
};
|
|
6977
|
+
};
|
|
6978
|
+
}
|
|
6979
|
+
function getNestedValue(obj, path3) {
|
|
6980
|
+
return path3.split(".").reduce((current, key) => {
|
|
6981
|
+
if (current !== null && typeof current === "object") {
|
|
6982
|
+
return current[key];
|
|
6983
|
+
}
|
|
6984
|
+
return void 0;
|
|
6985
|
+
}, obj);
|
|
6986
|
+
}
|
|
6987
|
+
|
|
6988
|
+
// src/evals/mcpHost/adapters/cli/runner.ts
|
|
6989
|
+
var DEFAULT_TIMEOUT = 12e4;
|
|
6990
|
+
var MAX_BUFFER = 10 * 1024 * 1024;
|
|
6991
|
+
/**
 * Selects the stdout parser for a CLI host output format.
 *
 * @param {string | null | undefined} format - "stream-json" (the default when
 *   nullish) or "json".
 * @returns {(stdout: string) => unknown} The parser for the format. For an
 *   unrecognized format the returned parser throws a descriptive error when
 *   invoked, so the failure is reported at parse time with the real cause
 *   instead of surfacing as "parse is not a function".
 */
function getParser(format) {
  const resolved = format ?? "stream-json";
  switch (resolved) {
    case "stream-json":
      return parseStreamJson;
    case "json":
      return createJsonParser({
        toolCalls: "toolCalls",
        response: "response",
        success: "success"
      });
    default:
      // Deferred throw: the parser is invoked inside the caller's try/catch,
      // which turns the error into a regular failure result.
      return () => {
        throw new Error(
          `Unsupported CLI output format "${String(resolved)}". Supported formats: "stream-json", "json".`
        );
      };
  }
}
|
|
7003
|
+
/**
 * Substitutes every `{{scenario}}` placeholder in the CLI arguments.
 *
 * @param {string[]} args - Argument templates.
 * @param {string} scenario - Scenario text inserted verbatim.
 * @returns {string[]} A new array with the placeholders replaced.
 */
function interpolateArgs(args, scenario) {
  // A replacer function is used instead of a replacement string so that
  // sequences such as "$$", "$&" or "$1" inside the scenario are inserted
  // literally rather than being interpreted as replacement patterns.
  return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, () => scenario));
}
|
|
7006
|
+
/**
 * Runs a scenario through an external CLI host and converts its stdout into a
 * simulation result.
 *
 * Process failures, timeouts, unparseable output and structurally invalid
 * results are all reported as `{ success: false, toolCalls: [], error }`
 * instead of being thrown.
 *
 * @param {{ command: string, args?: string[], timeout?: number, outputFormat?: string }} cliConfig
 *   CLI host settings; `{{scenario}}` placeholders in `args` are substituted.
 * @param {string} scenario - Scenario text handed to the CLI.
 * @returns {Promise<object>} The parsed simulation result, or a failure result.
 */
async function runCLIHost(cliConfig, scenario) {
  const fail = (error) => ({ success: false, toolCalls: [], error });
  const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
  // A config that only supplies `command` must not crash on `undefined.map`.
  const args = interpolateArgs(cliConfig.args ?? [], scenario);
  const startTime = Date.now();
  let stdout;
  try {
    ({ stdout } = await spawnProcess(cliConfig.command, args, { timeout }));
  } catch (err) {
    const elapsed = Date.now() - startTime;
    const message = err instanceof Error ? err.message : String(err);
    // Only the first line describes the failure itself; the lines after it
    // carry the child's stderr, which may mention "timed out" for unrelated
    // reasons and must not be mistaken for a host timeout.
    const headline = message.split("\n", 1)[0];
    if (headline.includes("TIMEOUT") || headline.includes("timed out")) {
      return fail(
        `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
      );
    }
    return fail(`CLI host process failed: ${message}`);
  }
  let result;
  try {
    // Parser selection is inside the try so that a bad outputFormat is
    // reported as a failure result like any other parse problem.
    const parse = getParser(cliConfig.outputFormat);
    if (typeof parse !== "function") {
      throw new Error(`Unsupported outputFormat: ${String(cliConfig.outputFormat)}`);
    }
    result = parse(stdout);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return fail(
      `Failed to parse CLI host output: ${reason}\nstdout (first 500 chars): ${stdout.slice(0, 500)}`
    );
  }
  const validationError = validateSimulationResult(result);
  if (validationError) {
    return fail(`CLI host returned invalid result: ${validationError}`);
  }
  return result;
}
|
|
7052
|
+
/**
 * Checks that a parsed CLI host result has the structure the rest of the
 * pipeline relies on: a boolean `success` and a `toolCalls` array whose
 * entries each have a string `name` and a non-null object `arguments`.
 *
 * @param {unknown} result - Value produced by an output parser.
 * @returns {string | null} A description of the first problem found, or
 *   `null` when the result is structurally valid.
 */
function validateSimulationResult(result) {
  // `typeof null` is "object", which would make messages like
  // "must be an object, got object" useless; name null and arrays explicitly.
  const describeType = (value) => {
    if (value === null) return "null";
    return Array.isArray(value) ? "array" : typeof value;
  };
  if (result === null || typeof result !== "object") {
    return `Expected object, got ${describeType(result)}`;
  }
  const obj = result;
  if (typeof obj.success !== "boolean") {
    return `"success" must be a boolean, got ${describeType(obj.success)}`;
  }
  if (!Array.isArray(obj.toolCalls)) {
    return `"toolCalls" must be an array, got ${describeType(obj.toolCalls)}`;
  }
  for (let i = 0; i < obj.toolCalls.length; i++) {
    const tc = obj.toolCalls[i];
    // Guard the entry itself: reading `.name` off null/undefined would throw
    // instead of producing a validation message.
    if (tc === null || typeof tc !== "object") {
      return `toolCalls[${i}] must be an object, got ${describeType(tc)}`;
    }
    if (typeof tc.name !== "string") {
      return `toolCalls[${i}].name must be a string, got ${describeType(tc.name)}`;
    }
    if (typeof tc.arguments !== "object" || tc.arguments === null) {
      return `toolCalls[${i}].arguments must be an object, got ${describeType(tc.arguments)}`;
    }
  }
  return null;
}
|
|
7074
|
+
/**
 * Spawns a command (no shell), closes its stdin immediately and collects its
 * stdout and stderr as UTF-8 strings.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{ timeout: number }} options - `timeout` is the limit in
 *   milliseconds after which the child is sent SIGTERM.
 * @returns {Promise<{ stdout: string, stderr: string }>} Resolves when the
 *   child closes with exit code 0. Rejects with an Error when the process
 *   cannot be spawned, exits with a non-zero (or null) code, exceeds the
 *   timeout, or writes more than MAX_BUFFER bytes to stdout.
 */
function spawnProcess(command, args, options) {
  return new Promise((resolve2, reject) => {
    let settled = false;
    let timer;
    // Several events (timeout, error, close, overflow) can race; only the
    // first one decides the outcome and the timer is always released.
    const settle = (finish, value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      finish(value);
    };
    const child = spawn(command, args, {
      stdio: ["pipe", "pipe", "pipe"]
    });
    // Registered before touching stdio so a spawn failure is always observed
    // here rather than becoming an unhandled "error" event.
    child.on("error", (err) => settle(reject, err));
    // The stdio streams may be missing when the process could not be spawned.
    child.stdin?.end();
    const stdoutChunks = [];
    const stderrChunks = [];
    let stdoutBytes = 0;
    let stderrBytes = 0;
    child.stdout?.on("data", (chunk) => {
      if (settled) return;
      stdoutBytes += chunk.length;
      if (stdoutBytes > MAX_BUFFER) {
        // Truncated stdout could still parse and yield a silently wrong
        // result, so overflowing is an explicit failure.
        child.kill("SIGTERM");
        settle(reject, new Error(`Process stdout exceeded the ${MAX_BUFFER}-byte buffer limit`));
        return;
      }
      stdoutChunks.push(chunk);
    });
    child.stderr?.on("data", (chunk) => {
      // stderr is only used for diagnostics: keep the first MAX_BUFFER bytes
      // and drop the rest, independently of how much stdout was produced.
      const room = MAX_BUFFER - stderrBytes;
      if (room <= 0) return;
      const kept = chunk.length > room ? chunk.subarray(0, room) : chunk;
      stderrBytes += kept.length;
      stderrChunks.push(kept);
    });
    timer = setTimeout(() => {
      child.kill("SIGTERM");
      settle(reject, new Error(`Process timed out after ${options.timeout}ms`));
    }, options.timeout);
    child.on("close", (code) => {
      const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
      const stderr = Buffer.concat(stderrChunks).toString("utf-8");
      if (code !== 0) {
        settle(
          reject,
          new Error(
            `Command failed with exit code ${code ?? "null"}` + (stderr ? `\nstderr: ${stderr}` : "")
          )
        );
        return;
      }
      settle(resolve2, { stdout, stderr });
    });
  });
}
|
|
7120
|
+
|
|
6602
7121
|
// src/evals/mcpHost/mcpHostSimulation.ts
|
|
6603
7122
|
var vercelOrchestrator = createVercelOrchestrator();
|
|
6604
7123
|
var allProviders = [
|
|
@@ -6616,6 +7135,25 @@ var simulatorRegistry = new Map(
|
|
|
6616
7135
|
allProviders.map((p) => [p, vercelOrchestrator])
|
|
6617
7136
|
);
|
|
6618
7137
|
async function simulateMCPHost(mcp, scenario, config) {
|
|
7138
|
+
const hostType = config.hostType ?? "sdk";
|
|
7139
|
+
if (hostType === "cli") {
|
|
7140
|
+
if (!config.cli) {
|
|
7141
|
+
throw new Error(
|
|
7142
|
+
`mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
|
|
7143
|
+
);
|
|
7144
|
+
}
|
|
7145
|
+
return runCLIHost(config.cli, scenario);
|
|
7146
|
+
}
|
|
7147
|
+
if (hostType === "browser" || hostType === "desktop") {
|
|
7148
|
+
throw new Error(
|
|
7149
|
+
`Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
|
|
7150
|
+
);
|
|
7151
|
+
}
|
|
7152
|
+
if (!config.provider) {
|
|
7153
|
+
throw new Error(
|
|
7154
|
+
`mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
|
|
7155
|
+
);
|
|
7156
|
+
}
|
|
6619
7157
|
const simulator = simulatorRegistry.get(config.provider);
|
|
6620
7158
|
if (!simulator) {
|
|
6621
7159
|
throw new Error(
|
|
@@ -6807,17 +7345,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6807
7345
|
};
|
|
6808
7346
|
}
|
|
6809
7347
|
if (expectBlock.passesJudge !== void 0) {
|
|
6810
|
-
const
|
|
6811
|
-
const
|
|
6812
|
-
|
|
6813
|
-
|
|
6814
|
-
|
|
6815
|
-
|
|
6816
|
-
|
|
6817
|
-
|
|
6818
|
-
|
|
6819
|
-
|
|
6820
|
-
|
|
7348
|
+
const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
|
|
7349
|
+
const judgeResultEntries = await Promise.all(
|
|
7350
|
+
judgeConfigs.map(async (judgeConfig) => {
|
|
7351
|
+
const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
|
|
7352
|
+
const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
|
|
7353
|
+
const validation = await validateJudge(response, {
|
|
7354
|
+
...judgeConfig,
|
|
7355
|
+
reference: effectiveReference,
|
|
7356
|
+
reps: effectiveReps
|
|
7357
|
+
});
|
|
7358
|
+
const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
|
|
7359
|
+
return {
|
|
7360
|
+
pass: validation.pass,
|
|
7361
|
+
details: validation.message,
|
|
7362
|
+
score: validation.details?.score,
|
|
7363
|
+
reasoning: validation.details?.reasoning,
|
|
7364
|
+
judgeName,
|
|
7365
|
+
judgeProvider: validation.details?.judgeProvider,
|
|
7366
|
+
judgeModel: validation.details?.judgeModel
|
|
7367
|
+
};
|
|
7368
|
+
})
|
|
7369
|
+
);
|
|
7370
|
+
if (judgeResultEntries.length === 1) {
|
|
7371
|
+
results.judge = judgeResultEntries[0];
|
|
7372
|
+
} else {
|
|
7373
|
+
const allPassed = judgeResultEntries.every((r) => r.pass);
|
|
7374
|
+
const passCount = judgeResultEntries.filter((r) => r.pass).length;
|
|
7375
|
+
results.judge = {
|
|
7376
|
+
pass: allPassed,
|
|
7377
|
+
details: `${passCount}/${judgeResultEntries.length} judges passed`,
|
|
7378
|
+
judgeResults: judgeResultEntries
|
|
7379
|
+
};
|
|
7380
|
+
}
|
|
6821
7381
|
}
|
|
6822
7382
|
if (expectBlock.snapshot !== void 0) {
|
|
6823
7383
|
if (!config.playwrightExpect) {
|
|
@@ -6846,6 +7406,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6846
7406
|
}
|
|
6847
7407
|
return { expectations: results, toolPrecision, toolRecall };
|
|
6848
7408
|
}
|
|
7409
|
+
/**
 * Produces the `request` summary recorded alongside an eval case result.
 *
 * For "mcp_host" cases the summary carries the scenario and the host's
 * provider (plus model when one is set); for every other mode it carries the
 * tool arguments. Falsy fields are left out.
 *
 * @param {object} evalCase - The eval case being executed.
 * @returns {object} A new plain object describing what was requested.
 */
function buildRequest(evalCase) {
  const summary = {};
  if (evalCase.description) {
    summary.description = evalCase.description;
  }

  if (evalCase.mode !== "mcp_host") {
    if (evalCase.args) {
      summary.args = evalCase.args;
    }
    return summary;
  }

  if (evalCase.scenario) {
    summary.scenario = evalCase.scenario;
  }
  const hostConfig = evalCase.mcpHostConfig;
  if (hostConfig) {
    // Only provider and model are copied; the rest of the host config is not
    // part of the recorded request.
    const hostSummary = { provider: hostConfig.provider };
    if (hostConfig.model !== void 0) {
      hostSummary.model = hostConfig.model;
    }
    summary.mcpHostConfig = hostSummary;
  }
  return summary;
}
|
|
6849
7427
|
/**
 * Tells whether a value looks like an MCP host simulation result: a non-null
 * object exposing a `success` property and an array-valued `toolCalls`.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` when the value has the expected structure.
 */
function isMCPHostSimulationResult(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!("success" in value) || !("toolCalls" in value)) {
    return false;
  }
  return Array.isArray(value.toolCalls);
}
|
|
@@ -6894,6 +7472,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6894
7472
|
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6895
7473
|
source: "eval",
|
|
6896
7474
|
pass: didCasePass(error, expectationResults),
|
|
7475
|
+
request: buildRequest(evalCase),
|
|
6897
7476
|
response,
|
|
6898
7477
|
error,
|
|
6899
7478
|
expectations: expectationResults,
|
|
@@ -7036,8 +7615,13 @@ async function runEvalDataset(options, context) {
|
|
|
7036
7615
|
const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
|
|
7037
7616
|
const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
|
|
7038
7617
|
const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
|
|
7039
|
-
|
|
7040
|
-
|
|
7618
|
+
if (c.expect?.passesJudge == null) return sum;
|
|
7619
|
+
const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
|
|
7620
|
+
const totalReps = judges.reduce(
|
|
7621
|
+
(r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
|
|
7622
|
+
0
|
|
7623
|
+
);
|
|
7624
|
+
return sum + effectiveIterations * totalReps;
|
|
7041
7625
|
}, 0);
|
|
7042
7626
|
if (estimatedJudgeCalls > 50) {
|
|
7043
7627
|
debugEval(
|
|
@@ -7385,6 +7969,6 @@ function formatCapabilities(capabilities) {
|
|
|
7385
7969
|
return parts.length > 0 ? parts.join(", ") : "none declared";
|
|
7386
7970
|
}
|
|
7387
7971
|
|
|
7388
|
-
export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
7972
|
+
export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
7389
7973
|
//# sourceMappingURL=index.js.map
|
|
7390
7974
|
//# sourceMappingURL=index.js.map
|