@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +354 -37
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +721 -76
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +533 -116
- package/dist/index.d.ts +533 -116
- package/dist/index.js +719 -78
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +11 -6
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/index.cjs
CHANGED
|
@@ -3306,7 +3306,11 @@ async function performOAuthSetup(config) {
|
|
|
3306
3306
|
const page = await context.newPage();
|
|
3307
3307
|
page.setDefaultTimeout(timeoutMs);
|
|
3308
3308
|
await page.goto(authorizationUrl.toString());
|
|
3309
|
-
|
|
3309
|
+
if ("customLoginFlow" in config && config.customLoginFlow) {
|
|
3310
|
+
await config.customLoginFlow(page);
|
|
3311
|
+
} else {
|
|
3312
|
+
await completeLoginForm(page, config);
|
|
3313
|
+
}
|
|
3310
3314
|
await page.waitForURL(
|
|
3311
3315
|
(url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
|
|
3312
3316
|
{ timeout: timeoutMs }
|
|
@@ -4407,7 +4411,7 @@ function escapeHtml(text) {
|
|
|
4407
4411
|
|
|
4408
4412
|
// package.json
|
|
4409
4413
|
var package_default = {
|
|
4410
|
-
version: "1.0.0
|
|
4414
|
+
version: "1.0.0"};
|
|
4411
4415
|
|
|
4412
4416
|
// src/mcp/clientFactory.ts
|
|
4413
4417
|
function getRetryAfterDelayMs(err) {
|
|
@@ -4626,6 +4630,17 @@ async function createMCPClientForConfig(config, options) {
|
|
|
4626
4630
|
}
|
|
4627
4631
|
async function closeMCPClient(client) {
|
|
4628
4632
|
try {
|
|
4633
|
+
const transport = client.transport;
|
|
4634
|
+
if (transport instanceof streamableHttp_js.StreamableHTTPClientTransport) {
|
|
4635
|
+
try {
|
|
4636
|
+
await transport.terminateSession();
|
|
4637
|
+
} catch (sessionError) {
|
|
4638
|
+
debugClient(
|
|
4639
|
+
"Error terminating session: %s",
|
|
4640
|
+
sessionError instanceof Error ? sessionError.message : String(sessionError)
|
|
4641
|
+
);
|
|
4642
|
+
}
|
|
4643
|
+
}
|
|
4629
4644
|
await client.close();
|
|
4630
4645
|
} catch (error) {
|
|
4631
4646
|
debugClient(
|
|
@@ -4854,11 +4869,13 @@ function validateSchema(response, schema, options = {}) {
|
|
|
4854
4869
|
} catch (error) {
|
|
4855
4870
|
const zodError = error;
|
|
4856
4871
|
const issues = formatZodIssues(zodError);
|
|
4872
|
+
const text = stringifyResponse(response);
|
|
4857
4873
|
return {
|
|
4858
4874
|
pass: false,
|
|
4859
4875
|
message: `Response does not match schema: ${issues}`,
|
|
4860
4876
|
details: {
|
|
4861
|
-
issues: zodError.issues
|
|
4877
|
+
issues: zodError.issues,
|
|
4878
|
+
textPreview: truncateForDisplay2(text)
|
|
4862
4879
|
}
|
|
4863
4880
|
};
|
|
4864
4881
|
}
|
|
@@ -4911,6 +4928,12 @@ function formatZodIssues(error) {
|
|
|
4911
4928
|
});
|
|
4912
4929
|
return issues.join("; ");
|
|
4913
4930
|
}
|
|
4931
|
+
function truncateForDisplay2(str, maxLength = 200) {
|
|
4932
|
+
if (str.length <= maxLength) {
|
|
4933
|
+
return str;
|
|
4934
|
+
}
|
|
4935
|
+
return str.slice(0, maxLength) + "... (truncated)";
|
|
4936
|
+
}
|
|
4914
4937
|
|
|
4915
4938
|
// src/assertions/validators/text.ts
|
|
4916
4939
|
function validateText(response, expected, options = {}) {
|
|
@@ -4937,11 +4960,11 @@ function validateText(response, expected, options = {}) {
|
|
|
4937
4960
|
details: {
|
|
4938
4961
|
missing,
|
|
4939
4962
|
textLength: text.length,
|
|
4940
|
-
textPreview:
|
|
4963
|
+
textPreview: truncateForDisplay3(text)
|
|
4941
4964
|
}
|
|
4942
4965
|
};
|
|
4943
4966
|
}
|
|
4944
|
-
function
|
|
4967
|
+
function truncateForDisplay3(str, maxLength = 200) {
|
|
4945
4968
|
if (str.length <= maxLength) {
|
|
4946
4969
|
return str;
|
|
4947
4970
|
}
|
|
@@ -4973,7 +4996,7 @@ function validatePattern(response, patterns, options = {}) {
|
|
|
4973
4996
|
details: {
|
|
4974
4997
|
unmatched,
|
|
4975
4998
|
textLength: text.length,
|
|
4976
|
-
textPreview:
|
|
4999
|
+
textPreview: truncateForDisplay4(text)
|
|
4977
5000
|
}
|
|
4978
5001
|
};
|
|
4979
5002
|
}
|
|
@@ -4993,7 +5016,7 @@ function patternToString(pattern) {
|
|
|
4993
5016
|
}
|
|
4994
5017
|
return `/${pattern}/`;
|
|
4995
5018
|
}
|
|
4996
|
-
function
|
|
5019
|
+
function truncateForDisplay4(str, maxLength = 200) {
|
|
4997
5020
|
if (str.length <= maxLength) {
|
|
4998
5021
|
return str;
|
|
4999
5022
|
}
|
|
@@ -5016,7 +5039,7 @@ function validateError(response, expected = true) {
|
|
|
5016
5039
|
pass: false,
|
|
5017
5040
|
message: "Expected an error response but got success",
|
|
5018
5041
|
details: {
|
|
5019
|
-
textPreview:
|
|
5042
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5020
5043
|
}
|
|
5021
5044
|
};
|
|
5022
5045
|
} else {
|
|
@@ -5028,7 +5051,7 @@ function validateError(response, expected = true) {
|
|
|
5028
5051
|
}
|
|
5029
5052
|
return {
|
|
5030
5053
|
pass: false,
|
|
5031
|
-
message: `Expected a success response but got error: "${
|
|
5054
|
+
message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
|
|
5032
5055
|
details: {
|
|
5033
5056
|
errorMessage
|
|
5034
5057
|
}
|
|
@@ -5041,7 +5064,7 @@ function validateError(response, expected = true) {
|
|
|
5041
5064
|
pass: false,
|
|
5042
5065
|
message: `Expected an error containing "${expectedMessages[0]}" but got success`,
|
|
5043
5066
|
details: {
|
|
5044
|
-
textPreview:
|
|
5067
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5045
5068
|
}
|
|
5046
5069
|
};
|
|
5047
5070
|
}
|
|
@@ -5063,7 +5086,7 @@ function validateError(response, expected = true) {
|
|
|
5063
5086
|
}
|
|
5064
5087
|
};
|
|
5065
5088
|
}
|
|
5066
|
-
function
|
|
5089
|
+
function truncateForDisplay5(str, maxLength = 200) {
|
|
5067
5090
|
if (str.length <= maxLength) {
|
|
5068
5091
|
return str;
|
|
5069
5092
|
}
|
|
@@ -5124,9 +5147,17 @@ function formatBytes(bytes) {
|
|
|
5124
5147
|
function isSimulationResult(value) {
|
|
5125
5148
|
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
5126
5149
|
}
|
|
5150
|
+
function isPatternMatcher(v) {
|
|
5151
|
+
return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
|
|
5152
|
+
}
|
|
5127
5153
|
function partialMatch(actual, expected) {
|
|
5128
5154
|
return Object.entries(expected).every(([k, v]) => {
|
|
5129
5155
|
const actualVal = actual[k];
|
|
5156
|
+
if (isPatternMatcher(v)) {
|
|
5157
|
+
if (typeof actualVal !== "string") return false;
|
|
5158
|
+
const re = new RegExp(v.$pattern, v.$flags);
|
|
5159
|
+
return re.test(actualVal);
|
|
5160
|
+
}
|
|
5130
5161
|
if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
|
|
5131
5162
|
return partialMatch(
|
|
5132
5163
|
actualVal,
|
|
@@ -5173,6 +5204,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5173
5204
|
return {
|
|
5174
5205
|
pass: false,
|
|
5175
5206
|
message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
|
|
5207
|
+
details: {
|
|
5208
|
+
actual: actual.map((c) => c.name),
|
|
5209
|
+
expected: expected.name
|
|
5210
|
+
},
|
|
5176
5211
|
metrics
|
|
5177
5212
|
};
|
|
5178
5213
|
}
|
|
@@ -5189,6 +5224,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5189
5224
|
return {
|
|
5190
5225
|
pass: false,
|
|
5191
5226
|
message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
|
|
5227
|
+
details: {
|
|
5228
|
+
actual: actual.map((c) => c.name),
|
|
5229
|
+
expected: expected.name
|
|
5230
|
+
},
|
|
5192
5231
|
metrics
|
|
5193
5232
|
};
|
|
5194
5233
|
}
|
|
@@ -5201,6 +5240,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5201
5240
|
return {
|
|
5202
5241
|
pass: false,
|
|
5203
5242
|
message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
|
|
5243
|
+
details: {
|
|
5244
|
+
actual: actual.map((c) => c.name),
|
|
5245
|
+
unexpected: unexpected.map((c) => c.name)
|
|
5246
|
+
},
|
|
5204
5247
|
metrics
|
|
5205
5248
|
};
|
|
5206
5249
|
}
|
|
@@ -5219,19 +5262,22 @@ function validateToolCallCount(response, options) {
|
|
|
5219
5262
|
if (exact !== void 0 && count !== exact) {
|
|
5220
5263
|
return {
|
|
5221
5264
|
pass: false,
|
|
5222
|
-
message: `Expected exactly ${exact} tool call(s), but got ${count}
|
|
5265
|
+
message: `Expected exactly ${exact} tool call(s), but got ${count}`,
|
|
5266
|
+
details: { actual: count, expected: exact }
|
|
5223
5267
|
};
|
|
5224
5268
|
}
|
|
5225
5269
|
if (min !== void 0 && count < min) {
|
|
5226
5270
|
return {
|
|
5227
5271
|
pass: false,
|
|
5228
|
-
message: `Expected at least ${min} tool call(s), but got ${count}
|
|
5272
|
+
message: `Expected at least ${min} tool call(s), but got ${count}`,
|
|
5273
|
+
details: { actual: count, min }
|
|
5229
5274
|
};
|
|
5230
5275
|
}
|
|
5231
5276
|
if (max !== void 0 && count > max) {
|
|
5232
5277
|
return {
|
|
5233
5278
|
pass: false,
|
|
5234
|
-
message: `Expected at most ${max} tool call(s), but got ${count}
|
|
5279
|
+
message: `Expected at most ${max} tool call(s), but got ${count}`,
|
|
5280
|
+
details: { actual: count, max }
|
|
5235
5281
|
};
|
|
5236
5282
|
}
|
|
5237
5283
|
return {
|
|
@@ -5265,7 +5311,175 @@ var JudgeResponseSchema = zod.z.object({
|
|
|
5265
5311
|
reasoning: zod.z.string()
|
|
5266
5312
|
});
|
|
5267
5313
|
|
|
5268
|
-
// src/judge/
|
|
5314
|
+
// src/judge/anthropicJudge.ts
|
|
5315
|
+
function createAnthropicJudge(config = {}) {
|
|
5316
|
+
const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
|
|
5317
|
+
const apiKey = process.env[apiKeyEnvVar];
|
|
5318
|
+
if (!apiKey) {
|
|
5319
|
+
throw new Error(
|
|
5320
|
+
`Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
|
|
5321
|
+
);
|
|
5322
|
+
}
|
|
5323
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5324
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
5325
|
+
const temperature = config.temperature ?? 0;
|
|
5326
|
+
return {
|
|
5327
|
+
async evaluate(candidate, reference, rubric) {
|
|
5328
|
+
let anthropicModule;
|
|
5329
|
+
try {
|
|
5330
|
+
anthropicModule = await import('@anthropic-ai/sdk');
|
|
5331
|
+
} catch (err) {
|
|
5332
|
+
throw new Error(
|
|
5333
|
+
`Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
|
|
5334
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
5335
|
+
);
|
|
5336
|
+
}
|
|
5337
|
+
const client = new anthropicModule.default({ apiKey });
|
|
5338
|
+
const prompt = buildJudgePrompt(candidate, reference, rubric);
|
|
5339
|
+
const startTime = Date.now();
|
|
5340
|
+
const response = await client.messages.create({
|
|
5341
|
+
model,
|
|
5342
|
+
max_tokens: maxTokens,
|
|
5343
|
+
temperature,
|
|
5344
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
5345
|
+
messages: [{ role: "user", content: prompt }]
|
|
5346
|
+
});
|
|
5347
|
+
const durationMs = Date.now() - startTime;
|
|
5348
|
+
const textBlock = response.content.find(
|
|
5349
|
+
(b) => b.type === "text"
|
|
5350
|
+
);
|
|
5351
|
+
const text = textBlock?.text ?? "";
|
|
5352
|
+
const parsed = parseJudgeResponse(text);
|
|
5353
|
+
return {
|
|
5354
|
+
pass: parsed.pass,
|
|
5355
|
+
score: parsed.score,
|
|
5356
|
+
reasoning: parsed.reasoning,
|
|
5357
|
+
usage: {
|
|
5358
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
5359
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
5360
|
+
totalCostUsd: 0,
|
|
5361
|
+
durationMs
|
|
5362
|
+
}
|
|
5363
|
+
};
|
|
5364
|
+
}
|
|
5365
|
+
};
|
|
5366
|
+
}
|
|
5367
|
+
function buildJudgePrompt(candidate, reference, rubric) {
|
|
5368
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5369
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5370
|
+
return `Rubric:
|
|
5371
|
+
${rubric}
|
|
5372
|
+
|
|
5373
|
+
<candidate_response>
|
|
5374
|
+
${candidateStr}
|
|
5375
|
+
</candidate_response>
|
|
5376
|
+
|
|
5377
|
+
<reference_answer>
|
|
5378
|
+
${referenceStr ?? "No reference provided."}
|
|
5379
|
+
</reference_answer>
|
|
5380
|
+
|
|
5381
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5382
|
+
}
|
|
5383
|
+
function parseJudgeResponse(text) {
|
|
5384
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5385
|
+
let parsed;
|
|
5386
|
+
try {
|
|
5387
|
+
parsed = JSON.parse(cleaned);
|
|
5388
|
+
} catch {
|
|
5389
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
5390
|
+
}
|
|
5391
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
5392
|
+
if (!result.success) {
|
|
5393
|
+
throw new Error(
|
|
5394
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
5395
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
5396
|
+
);
|
|
5397
|
+
}
|
|
5398
|
+
return result.data;
|
|
5399
|
+
}
|
|
5400
|
+
|
|
5401
|
+
// src/judge/vertexAnthropicJudge.ts
|
|
5402
|
+
function createVertexAnthropicJudge(config = {}) {
|
|
5403
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5404
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
5405
|
+
const temperature = config.temperature ?? 0;
|
|
5406
|
+
return {
|
|
5407
|
+
async evaluate(candidate, reference, rubric) {
|
|
5408
|
+
let vertexModule;
|
|
5409
|
+
try {
|
|
5410
|
+
vertexModule = await import('@anthropic-ai/vertex-sdk');
|
|
5411
|
+
} catch (err) {
|
|
5412
|
+
throw new Error(
|
|
5413
|
+
`Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
|
|
5414
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
5415
|
+
);
|
|
5416
|
+
}
|
|
5417
|
+
const client = new vertexModule.AnthropicVertex({
|
|
5418
|
+
projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
|
|
5419
|
+
region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
|
|
5420
|
+
});
|
|
5421
|
+
const prompt = buildJudgePrompt2(candidate, reference, rubric);
|
|
5422
|
+
const startTime = Date.now();
|
|
5423
|
+
const response = await client.messages.create({
|
|
5424
|
+
model,
|
|
5425
|
+
max_tokens: maxTokens,
|
|
5426
|
+
temperature,
|
|
5427
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
5428
|
+
messages: [{ role: "user", content: prompt }]
|
|
5429
|
+
});
|
|
5430
|
+
const durationMs = Date.now() - startTime;
|
|
5431
|
+
const textBlock = response.content.find(
|
|
5432
|
+
(b) => b.type === "text"
|
|
5433
|
+
);
|
|
5434
|
+
const text = textBlock?.text ?? "";
|
|
5435
|
+
const parsed = parseJudgeResponse2(text);
|
|
5436
|
+
return {
|
|
5437
|
+
pass: parsed.pass,
|
|
5438
|
+
score: parsed.score,
|
|
5439
|
+
reasoning: parsed.reasoning,
|
|
5440
|
+
usage: {
|
|
5441
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
5442
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
5443
|
+
totalCostUsd: 0,
|
|
5444
|
+
durationMs
|
|
5445
|
+
}
|
|
5446
|
+
};
|
|
5447
|
+
}
|
|
5448
|
+
};
|
|
5449
|
+
}
|
|
5450
|
+
function buildJudgePrompt2(candidate, reference, rubric) {
|
|
5451
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5452
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5453
|
+
return `Rubric:
|
|
5454
|
+
${rubric}
|
|
5455
|
+
|
|
5456
|
+
<candidate_response>
|
|
5457
|
+
${candidateStr}
|
|
5458
|
+
</candidate_response>
|
|
5459
|
+
|
|
5460
|
+
<reference_answer>
|
|
5461
|
+
${referenceStr ?? "No reference provided."}
|
|
5462
|
+
</reference_answer>
|
|
5463
|
+
|
|
5464
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5465
|
+
}
|
|
5466
|
+
function parseJudgeResponse2(text) {
|
|
5467
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5468
|
+
let parsed;
|
|
5469
|
+
try {
|
|
5470
|
+
parsed = JSON.parse(cleaned);
|
|
5471
|
+
} catch {
|
|
5472
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
5473
|
+
}
|
|
5474
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
5475
|
+
if (!result.success) {
|
|
5476
|
+
throw new Error(
|
|
5477
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
5478
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
5479
|
+
);
|
|
5480
|
+
}
|
|
5481
|
+
return result.data;
|
|
5482
|
+
}
|
|
5269
5483
|
function createClaudeAgentJudge(config) {
|
|
5270
5484
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
5271
5485
|
const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
|
|
@@ -5283,7 +5497,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5283
5497
|
exceedsMaxToolOutputSize: true
|
|
5284
5498
|
};
|
|
5285
5499
|
}
|
|
5286
|
-
const prompt =
|
|
5500
|
+
const prompt = buildJudgePrompt3(candidate, reference, rubric);
|
|
5287
5501
|
try {
|
|
5288
5502
|
let resultMessage;
|
|
5289
5503
|
for await (const message of claudeAgentSdk.query({
|
|
@@ -5315,7 +5529,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5315
5529
|
);
|
|
5316
5530
|
}
|
|
5317
5531
|
const responseText = resultMessage.result ?? "";
|
|
5318
|
-
const parsed =
|
|
5532
|
+
const parsed = parseJudgeResponse3(responseText);
|
|
5319
5533
|
const usage = {
|
|
5320
5534
|
inputTokens: resultMessage.usage?.input_tokens ?? 0,
|
|
5321
5535
|
outputTokens: resultMessage.usage?.output_tokens ?? 0,
|
|
@@ -5344,7 +5558,7 @@ function createClaudeAgentJudge(config) {
|
|
|
5344
5558
|
function buildSystemPrompt() {
|
|
5345
5559
|
return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
|
|
5346
5560
|
}
|
|
5347
|
-
function
|
|
5561
|
+
function buildJudgePrompt3(candidate, reference, rubric) {
|
|
5348
5562
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5349
5563
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5350
5564
|
const parts = [];
|
|
@@ -5361,7 +5575,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
|
|
|
5361
5575
|
);
|
|
5362
5576
|
return parts.join("");
|
|
5363
5577
|
}
|
|
5364
|
-
function
|
|
5578
|
+
function parseJudgeResponse3(text) {
|
|
5365
5579
|
let jsonText = text.trim();
|
|
5366
5580
|
if (jsonText.startsWith("```json")) {
|
|
5367
5581
|
jsonText = jsonText.slice(7);
|
|
@@ -5418,7 +5632,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5418
5632
|
);
|
|
5419
5633
|
}
|
|
5420
5634
|
const client = new openaiModule.default({ apiKey });
|
|
5421
|
-
const prompt =
|
|
5635
|
+
const prompt = buildJudgePrompt4(candidate, reference, rubric);
|
|
5422
5636
|
const startTime = Date.now();
|
|
5423
5637
|
const completion = await client.chat.completions.create({
|
|
5424
5638
|
model,
|
|
@@ -5434,7 +5648,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5434
5648
|
});
|
|
5435
5649
|
const durationMs = Date.now() - startTime;
|
|
5436
5650
|
const text = completion.choices[0]?.message.content ?? "";
|
|
5437
|
-
const parsed =
|
|
5651
|
+
const parsed = parseJudgeResponse4(text);
|
|
5438
5652
|
return {
|
|
5439
5653
|
pass: parsed.pass,
|
|
5440
5654
|
score: parsed.score,
|
|
@@ -5449,7 +5663,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
5449
5663
|
}
|
|
5450
5664
|
};
|
|
5451
5665
|
}
|
|
5452
|
-
function
|
|
5666
|
+
function buildJudgePrompt4(candidate, reference, rubric) {
|
|
5453
5667
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
5454
5668
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
5455
5669
|
return `Rubric:
|
|
@@ -5465,7 +5679,7 @@ ${referenceStr ?? "No reference provided."}
|
|
|
5465
5679
|
|
|
5466
5680
|
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
5467
5681
|
}
|
|
5468
|
-
function
|
|
5682
|
+
function parseJudgeResponse4(text) {
|
|
5469
5683
|
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
5470
5684
|
let parsed;
|
|
5471
5685
|
try {
|
|
@@ -5567,14 +5781,48 @@ function createJudge(config = {}) {
|
|
|
5567
5781
|
const provider = config.provider ?? "anthropic";
|
|
5568
5782
|
switch (provider) {
|
|
5569
5783
|
case "anthropic":
|
|
5784
|
+
return createAnthropicJudge(config);
|
|
5785
|
+
case "vertex-anthropic":
|
|
5786
|
+
return createVertexAnthropicJudge(config);
|
|
5787
|
+
case "anthropic-agent-sdk":
|
|
5570
5788
|
return createClaudeAgentJudge(config);
|
|
5571
5789
|
case "openai":
|
|
5572
5790
|
return createOpenAIJudge(config);
|
|
5573
5791
|
case "google":
|
|
5574
5792
|
return createGoogleJudge(config);
|
|
5575
5793
|
default:
|
|
5576
|
-
throw new Error(
|
|
5794
|
+
throw new Error(
|
|
5795
|
+
`Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
|
|
5796
|
+
);
|
|
5797
|
+
}
|
|
5798
|
+
}
|
|
5799
|
+
|
|
5800
|
+
// src/judge/judgeRegistry.ts
|
|
5801
|
+
var registry = /* @__PURE__ */ new Map();
|
|
5802
|
+
function registerJudge(name15, executor) {
|
|
5803
|
+
const existing = registry.get(name15);
|
|
5804
|
+
if (existing !== void 0) {
|
|
5805
|
+
if (existing === executor) {
|
|
5806
|
+
return;
|
|
5807
|
+
}
|
|
5808
|
+
throw new Error(
|
|
5809
|
+
`Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
|
|
5810
|
+
);
|
|
5811
|
+
}
|
|
5812
|
+
registry.set(name15, executor);
|
|
5813
|
+
}
|
|
5814
|
+
function getRegisteredJudge(name15) {
|
|
5815
|
+
const executor = registry.get(name15);
|
|
5816
|
+
if (!executor) {
|
|
5817
|
+
const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
|
|
5818
|
+
throw new Error(
|
|
5819
|
+
`Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
|
|
5820
|
+
);
|
|
5577
5821
|
}
|
|
5822
|
+
return executor;
|
|
5823
|
+
}
|
|
5824
|
+
function clearJudgeRegistry() {
|
|
5825
|
+
registry.clear();
|
|
5578
5826
|
}
|
|
5579
5827
|
|
|
5580
5828
|
// src/assertions/validators/judge.ts
|
|
@@ -5585,6 +5833,7 @@ function computeStdDev(scores, mean) {
|
|
|
5585
5833
|
}
|
|
5586
5834
|
async function validateJudge(response, config) {
|
|
5587
5835
|
const {
|
|
5836
|
+
judge: judgeName,
|
|
5588
5837
|
rubric,
|
|
5589
5838
|
reference,
|
|
5590
5839
|
threshold = 0.7,
|
|
@@ -5597,6 +5846,29 @@ async function validateJudge(response, config) {
|
|
|
5597
5846
|
maxBudgetUsd,
|
|
5598
5847
|
maxToolOutputSize
|
|
5599
5848
|
} = config;
|
|
5849
|
+
if (judgeName !== void 0) {
|
|
5850
|
+
try {
|
|
5851
|
+
const executor = getRegisteredJudge(judgeName);
|
|
5852
|
+
const judgeResult = await executor(response, reference ?? void 0);
|
|
5853
|
+
const score = judgeResult.score;
|
|
5854
|
+
const passed = score >= threshold;
|
|
5855
|
+
return {
|
|
5856
|
+
pass: passed,
|
|
5857
|
+
message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
|
|
5858
|
+
};
|
|
5859
|
+
} catch (err) {
|
|
5860
|
+
return {
|
|
5861
|
+
pass: false,
|
|
5862
|
+
message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
|
|
5863
|
+
};
|
|
5864
|
+
}
|
|
5865
|
+
}
|
|
5866
|
+
if (rubric === void 0) {
|
|
5867
|
+
return {
|
|
5868
|
+
pass: false,
|
|
5869
|
+
message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
|
|
5870
|
+
};
|
|
5871
|
+
}
|
|
5600
5872
|
const resolvedRubric = resolveRubric(rubric);
|
|
5601
5873
|
const judgeConfig = {
|
|
5602
5874
|
...provider !== void 0 && { provider },
|
|
@@ -5643,11 +5915,17 @@ async function validateJudge(response, config) {
|
|
|
5643
5915
|
return {
|
|
5644
5916
|
pass: passed,
|
|
5645
5917
|
message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
|
|
5646
|
-
details:
|
|
5647
|
-
|
|
5648
|
-
|
|
5649
|
-
|
|
5650
|
-
|
|
5918
|
+
details: {
|
|
5919
|
+
score: meanScore,
|
|
5920
|
+
reasoning: lastReasoning,
|
|
5921
|
+
judgeProvider: provider ?? "anthropic",
|
|
5922
|
+
judgeModel: model,
|
|
5923
|
+
...reps > 1 && {
|
|
5924
|
+
scores,
|
|
5925
|
+
scoreStdDev: stdDev,
|
|
5926
|
+
highVariance
|
|
5927
|
+
}
|
|
5928
|
+
}
|
|
5651
5929
|
};
|
|
5652
5930
|
} catch (err) {
|
|
5653
5931
|
return {
|
|
@@ -5840,12 +6118,19 @@ function toMatchToolResponse(received, expected) {
|
|
|
5840
6118
|
// src/assertions/matchers/toMatchToolSchema.ts
|
|
5841
6119
|
function toMatchToolSchema(received, schema, options = {}) {
|
|
5842
6120
|
const result = validateSchema(received, schema, options);
|
|
6121
|
+
const preview = result.details?.textPreview;
|
|
5843
6122
|
return {
|
|
5844
6123
|
pass: result.pass,
|
|
5845
6124
|
message: () => {
|
|
5846
6125
|
if (this.isNot) {
|
|
5847
6126
|
return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
|
|
5848
6127
|
}
|
|
6128
|
+
if (!result.pass && preview) {
|
|
6129
|
+
return `${result.message}
|
|
6130
|
+
|
|
6131
|
+
Actual response (truncated):
|
|
6132
|
+
${preview}`;
|
|
6133
|
+
}
|
|
5849
6134
|
return result.message;
|
|
5850
6135
|
}
|
|
5851
6136
|
};
|
|
@@ -5854,6 +6139,7 @@ function toMatchToolSchema(received, schema, options = {}) {
|
|
|
5854
6139
|
// src/assertions/matchers/toContainToolText.ts
|
|
5855
6140
|
function toContainToolText(received, expected, options = {}) {
|
|
5856
6141
|
const result = validateText(received, expected, options);
|
|
6142
|
+
const preview = result.details?.textPreview;
|
|
5857
6143
|
return {
|
|
5858
6144
|
pass: result.pass,
|
|
5859
6145
|
message: () => {
|
|
@@ -5861,6 +6147,12 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
5861
6147
|
const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
|
|
5862
6148
|
return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
|
|
5863
6149
|
}
|
|
6150
|
+
if (!result.pass && preview) {
|
|
6151
|
+
return `${result.message}
|
|
6152
|
+
|
|
6153
|
+
Actual response (truncated):
|
|
6154
|
+
${preview}`;
|
|
6155
|
+
}
|
|
5864
6156
|
return result.message;
|
|
5865
6157
|
}
|
|
5866
6158
|
};
|
|
@@ -5869,12 +6161,19 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
5869
6161
|
// src/assertions/matchers/toMatchToolPattern.ts
|
|
5870
6162
|
function toMatchToolPattern(received, patterns, options = {}) {
|
|
5871
6163
|
const result = validatePattern(received, patterns, options);
|
|
6164
|
+
const preview = result.details?.textPreview;
|
|
5872
6165
|
return {
|
|
5873
6166
|
pass: result.pass,
|
|
5874
6167
|
message: () => {
|
|
5875
6168
|
if (this.isNot) {
|
|
5876
6169
|
return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
|
|
5877
6170
|
}
|
|
6171
|
+
if (!result.pass && preview) {
|
|
6172
|
+
return `${result.message}
|
|
6173
|
+
|
|
6174
|
+
Actual response (truncated):
|
|
6175
|
+
${preview}`;
|
|
6176
|
+
}
|
|
5878
6177
|
return result.message;
|
|
5879
6178
|
}
|
|
5880
6179
|
};
|
|
@@ -6026,31 +6325,68 @@ function toBeToolError(received, expected = true) {
|
|
|
6026
6325
|
|
|
6027
6326
|
// src/assertions/matchers/toPassToolJudge.ts
|
|
6028
6327
|
var DEFAULT_PASSING_THRESHOLD = 0.7;
|
|
6029
|
-
async function
|
|
6328
|
+
async function runSingleJudge(received, rubric, options) {
|
|
6030
6329
|
const {
|
|
6031
6330
|
reference = null,
|
|
6032
6331
|
passingThreshold = DEFAULT_PASSING_THRESHOLD,
|
|
6033
6332
|
reps,
|
|
6034
6333
|
provider,
|
|
6035
|
-
model
|
|
6334
|
+
model,
|
|
6335
|
+
judge
|
|
6036
6336
|
} = options;
|
|
6037
6337
|
const validation = await validateJudge(received, {
|
|
6038
|
-
rubric,
|
|
6338
|
+
...rubric !== void 0 && { rubric },
|
|
6039
6339
|
reference: reference ?? void 0,
|
|
6040
6340
|
threshold: passingThreshold,
|
|
6041
6341
|
...reps !== void 0 && { reps },
|
|
6042
6342
|
...provider !== void 0 && { provider },
|
|
6043
|
-
...model !== void 0 && { model }
|
|
6343
|
+
...model !== void 0 && { model },
|
|
6344
|
+
...judge !== void 0 && { judge }
|
|
6044
6345
|
});
|
|
6346
|
+
return { pass: validation.pass, message: validation.message };
|
|
6347
|
+
}
|
|
6348
|
+
/**
 * Custom matcher: evaluates `received` with one or more LLM judges.
 *
 * Accepted call shapes:
 *   - `(received, judgeConfigs[])`    — every config is judged in parallel; all must pass.
 *   - `(received, rubric, options?)`  — `rubric` is a string or an object with a `text` key.
 *   - `(received, options)`           — options only; the rubric is left undefined.
 *
 * Reads `this.isNot` from the matcher context and inverts `pass` itself when it is set.
 * NOTE(review): expect frameworks such as Playwright/Jest also negate `pass` for `.not`;
 * confirm the host `expect` does not apply a second inversion.
 *
 * @param {unknown} received - Value under evaluation; forwarded untouched to the judge.
 * @param {string|object|object[]} rubricOrOptions - Rubric, options object, or array of judge configs.
 * @param {object} [maybeOptions] - Options, used only when the second argument is a rubric.
 * @returns {Promise<{pass: boolean, message: () => string}>}
 * @throws {Error} When an empty array of judge configs is supplied.
 */
async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
  if (Array.isArray(rubricOrOptions)) {
    // `[].every(...)` is vacuously true, so an empty list would silently "pass".
    // The dataset schema already requires at least one judge; enforce the same here.
    if (rubricOrOptions.length === 0) {
      throw new Error("toPassToolJudge requires at least one judge config, but received an empty array");
    }
    const results = await Promise.all(
      rubricOrOptions.map(async (judgeConfig) => {
        const { rubric: r, ...opts } = judgeConfig;
        return runSingleJudge(received, r, opts);
      })
    );
    const allPassed = results.every((r) => r.pass);
    const passCount = results.filter((r) => r.pass).length;
    const summary = `${passCount}/${results.length} judges passed`;
    const details = results.map((r) => r.message).join("\n");
    if (this.isNot) {
      return {
        pass: !allPassed,
        message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
      };
    }
    return {
      pass: allPassed,
      message: () => `${summary}\n${details}`
    };
  }
  let rubric;
  let options;
  const isRubric = typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions;
  if (isRubric) {
    rubric = rubricOrOptions;
    options = maybeOptions ?? {};
  } else {
    // Called with options only, or with nothing at all: never hand `undefined`/`null`
    // options to the judge runner.
    options = rubricOrOptions ?? {};
  }
  const result = await runSingleJudge(received, rubric, options);
  if (this.isNot) {
    return {
      pass: !result.pass,
      message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
    };
  }
  return {
    pass: result.pass,
    message: () => result.message
  };
}
|
|
6056
6392
|
|
|
@@ -6334,6 +6670,7 @@ function getAuthConfigFromEnv() {
|
|
|
6334
6670
|
return void 0;
|
|
6335
6671
|
}
|
|
6336
6672
|
var MCPHostConfigSchema = zod.z.object({
|
|
6673
|
+
hostType: zod.z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
|
|
6337
6674
|
provider: zod.z.enum([
|
|
6338
6675
|
"openai",
|
|
6339
6676
|
"anthropic",
|
|
@@ -6344,12 +6681,18 @@ var MCPHostConfigSchema = zod.z.object({
|
|
|
6344
6681
|
"openrouter",
|
|
6345
6682
|
"xai",
|
|
6346
6683
|
"vertex-anthropic"
|
|
6347
|
-
]),
|
|
6684
|
+
]).optional(),
|
|
6348
6685
|
apiKeyEnvVar: zod.z.string().optional(),
|
|
6349
6686
|
model: zod.z.string().optional(),
|
|
6350
6687
|
maxTokens: zod.z.number().optional(),
|
|
6351
6688
|
temperature: zod.z.number().optional(),
|
|
6352
|
-
maxToolCalls: zod.z.number().optional()
|
|
6689
|
+
maxToolCalls: zod.z.number().optional(),
|
|
6690
|
+
cli: zod.z.object({
|
|
6691
|
+
command: zod.z.string(),
|
|
6692
|
+
args: zod.z.array(zod.z.string()),
|
|
6693
|
+
outputFormat: zod.z.enum(["stream-json", "json"]).optional(),
|
|
6694
|
+
timeout: zod.z.number().optional()
|
|
6695
|
+
}).optional()
|
|
6353
6696
|
});
|
|
6354
6697
|
var SnapshotSanitizerSchema = zod.z.union([
|
|
6355
6698
|
// Built-in sanitizers
|
|
@@ -6364,6 +6707,37 @@ var SnapshotSanitizerSchema = zod.z.union([
|
|
|
6364
6707
|
remove: zod.z.array(zod.z.string())
|
|
6365
6708
|
})
|
|
6366
6709
|
]);
|
|
6710
|
+
// Schema for a single `passesJudge` entry of an eval expect block.
// Every field is optional on its own, but the trailing refinement rejects
// entries that specify neither `judge` nor `rubric`.
var JudgeExpectConfigSchema = zod.z.object({
  // Non-empty judge identifier (presumably a registered judge name — confirm against registerJudge).
  judge: zod.z.string().min(1).optional(),
  // Either one of the built-in rubric names or a custom rubric given as `{ text }`.
  rubric: zod.z.union([
    zod.z.enum(["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]),
    zod.z.object({ text: zod.z.string().min(1) })
  ]).optional(),
  reference: zod.z.unknown().optional(),
  // Pass threshold, constrained to the closed range [0, 1].
  threshold: zod.z.number().min(0).max(1).optional(),
  // Whole number of repetitions, at least 1.
  reps: zod.z.number().int().min(1).optional(),
  provider: zod.z.enum(["anthropic", "vertex-anthropic", "anthropic-agent-sdk", "openai", "google"]).optional(),
  model: zod.z.string().optional(),
  apiKeyEnvVar: zod.z.string().optional(),
  maxTokens: zod.z.number().int().positive().optional(),
  temperature: zod.z.number().min(0).max(1).optional(),
  maxBudgetUsd: zod.z.number().positive().optional(),
  maxToolOutputSize: zod.z.number().int().positive().optional()
}).refine(
  (entry) => !(entry.judge === undefined && entry.rubric === undefined),
  { message: 'Either "judge" or "rubric" must be provided in passesJudge' }
);
|
|
6367
6741
|
var EvalExpectBlockSchema = zod.z.object({
|
|
6368
6742
|
response: zod.z.unknown().optional(),
|
|
6369
6743
|
schema: zod.z.string().optional(),
|
|
@@ -6372,28 +6746,7 @@ var EvalExpectBlockSchema = zod.z.object({
|
|
|
6372
6746
|
snapshot: zod.z.string().optional(),
|
|
6373
6747
|
snapshotSanitizers: zod.z.array(SnapshotSanitizerSchema).optional(),
|
|
6374
6748
|
isError: zod.z.union([zod.z.boolean(), zod.z.string(), zod.z.array(zod.z.string())]).optional(),
|
|
6375
|
-
passesJudge: zod.z.
|
|
6376
|
-
rubric: zod.z.union([
|
|
6377
|
-
zod.z.enum([
|
|
6378
|
-
"correctness",
|
|
6379
|
-
"completeness",
|
|
6380
|
-
"groundedness",
|
|
6381
|
-
"instruction-following",
|
|
6382
|
-
"conciseness"
|
|
6383
|
-
]),
|
|
6384
|
-
zod.z.object({ text: zod.z.string().min(1) })
|
|
6385
|
-
]),
|
|
6386
|
-
reference: zod.z.unknown().optional(),
|
|
6387
|
-
threshold: zod.z.number().min(0).max(1).optional(),
|
|
6388
|
-
reps: zod.z.number().int().min(1).optional(),
|
|
6389
|
-
provider: zod.z.enum(["anthropic", "openai", "google"]).optional(),
|
|
6390
|
-
model: zod.z.string().optional(),
|
|
6391
|
-
apiKeyEnvVar: zod.z.string().optional(),
|
|
6392
|
-
maxTokens: zod.z.number().int().positive().optional(),
|
|
6393
|
-
temperature: zod.z.number().min(0).max(1).optional(),
|
|
6394
|
-
maxBudgetUsd: zod.z.number().positive().optional(),
|
|
6395
|
-
maxToolOutputSize: zod.z.number().int().positive().optional()
|
|
6396
|
-
}).optional(),
|
|
6749
|
+
passesJudge: zod.z.union([JudgeExpectConfigSchema, zod.z.array(JudgeExpectConfigSchema).min(1)]).optional(),
|
|
6397
6750
|
responseSize: zod.z.object({
|
|
6398
6751
|
maxBytes: zod.z.number().optional(),
|
|
6399
6752
|
minBytes: zod.z.number().optional()
|
|
@@ -6566,6 +6919,9 @@ function createVercelOrchestrator() {
|
|
|
6566
6919
|
try {
|
|
6567
6920
|
const { generateText, stepCountIs } = await import('ai');
|
|
6568
6921
|
const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
|
|
6922
|
+
if (!config.provider) {
|
|
6923
|
+
throw new Error("provider is required for SDK host type");
|
|
6924
|
+
}
|
|
6569
6925
|
const modelId = config.model ?? defaultModel(config.provider);
|
|
6570
6926
|
const model = await loadModel(config.provider, modelId);
|
|
6571
6927
|
const mcpTools = await mcp.listTools();
|
|
@@ -6619,13 +6975,233 @@ function createVercelOrchestrator() {
|
|
|
6619
6975
|
return {
|
|
6620
6976
|
success: false,
|
|
6621
6977
|
toolCalls: [],
|
|
6622
|
-
error: enrichErrorMessage(err, config.provider)
|
|
6978
|
+
error: enrichErrorMessage(err, config.provider ?? "unknown")
|
|
6623
6979
|
};
|
|
6624
6980
|
}
|
|
6625
6981
|
}
|
|
6626
6982
|
};
|
|
6627
6983
|
}
|
|
6628
6984
|
|
|
6985
|
+
// src/evals/mcpHost/adapters/cli/parsers.ts
|
|
6986
|
+
/**
 * Parses newline-delimited JSON ("stream-json") CLI output into a simulation result.
 *
 * Each non-blank line is parsed independently; lines that are not valid JSON, or
 * that parse to something other than an object, are ignored. Recognised events:
 *   - `assistant`: `tool_use` blocks become tool calls, `text` blocks become response text.
 *   - `user`: `tool_result` blocks are recorded in the conversation history with role "tool".
 *   - `result`: a string `result` is used as the response only if no assistant text was
 *     collected so far; `is_error === true` ends parsing with a failure result.
 *
 * Tool names of the form `mcp__<server>__<tool>` are reduced to `<tool>`.
 *
 * @param {string} stdout - Raw standard output captured from the CLI host.
 * @returns {{success: boolean, toolCalls: Array<object>, response?: string,
 *            conversationHistory?: Array<object>, error?: string}}
 */
function parseStreamJson(stdout) {
  // Lazy server segment so that server names containing single underscores
  // (e.g. `mcp__my_server__search`) are stripped as well.
  const mcpToolPattern = /^mcp__.+?__(.+)$/;
  const isRecord = (value) => value !== null && typeof value === "object";
  const toolCalls = [];
  const textParts = [];
  const conversationHistory = [];
  for (const line of stdout.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Hosts may interleave plain-text log lines with JSON events; skip them.
      continue;
    }
    // `JSON.parse` also accepts `null`, numbers and strings; none of them are events.
    if (!isRecord(event)) {
      continue;
    }
    // Only array content carries blocks; anything else has nothing to extract.
    const content = event.message?.content;
    const blocks = Array.isArray(content) ? content.filter(isRecord) : [];
    if (event.type === "assistant") {
      for (const block of blocks) {
        if (block.type === "tool_use" && block.name) {
          const rawName = block.name;
          const mcpMatch = mcpToolPattern.exec(rawName);
          toolCalls.push({
            name: mcpMatch ? mcpMatch[1] : rawName,
            arguments: block.input ?? {},
            id: block.id
          });
        }
        if (block.type === "text" && block.text) {
          textParts.push(block.text);
        }
      }
    }
    if (event.type === "user") {
      for (const block of blocks) {
        if (block.type === "tool_result") {
          const resultText = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
          conversationHistory.push({ role: "tool", content: resultText });
        }
      }
    }
    if (event.type === "result") {
      // The final result text is only a fallback for runs without assistant text.
      if (typeof event.result === "string" && textParts.length === 0) {
        textParts.push(event.result);
      }
      if (event.is_error === true) {
        return {
          success: false,
          toolCalls,
          error: typeof event.result === "string" ? event.result : "CLI host reported an error"
        };
      }
    }
  }
  const response = textParts.join("");
  if (response) {
    conversationHistory.push({ role: "assistant", content: response });
  }
  return {
    success: true,
    toolCalls,
    response: response || void 0,
    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
  };
}
|
|
7046
|
+
/**
 * Builds a parser for CLI hosts that print one JSON document on stdout.
 *
 * @param {{toolCalls: string, response: string, success?: string}} paths
 *   Dot-separated lookup paths into the parsed document. When `paths.success`
 *   is not given, the parsed result is always reported as successful.
 * @returns {(stdout: string) => {success: boolean, toolCalls: Array<object>, response?: string}}
 *   Parser function; it throws whatever `JSON.parse` throws on malformed input.
 */
function createJsonParser(paths) {
  // Normalises one raw entry: a non-string name becomes "", and arguments fall
  // back from `arguments` to `args` to an empty object.
  const normalizeToolCall = (entry) => {
    const name = typeof entry.name === "string" ? entry.name : "";
    return { name, arguments: entry.arguments ?? entry.args ?? {} };
  };
  return (stdout) => {
    const document = JSON.parse(stdout);
    const rawToolCalls = getNestedValue(document, paths.toolCalls);
    let toolCalls = [];
    if (Array.isArray(rawToolCalls)) {
      toolCalls = rawToolCalls.map((entry) => normalizeToolCall(entry));
    }
    const rawResponse = getNestedValue(document, paths.response);
    let success = true;
    if (paths.success) {
      success = Boolean(getNestedValue(document, paths.success));
    }
    // Only string responses are surfaced; any other type is reported as absent.
    const response = typeof rawResponse === "string" ? rawResponse : void 0;
    return { success, toolCalls, response };
  };
}
|
|
7063
|
+
/**
 * Resolves a dot-separated path (e.g. "a.b.c") against a value.
 *
 * @param {unknown} obj - Root value to walk.
 * @param {string} path3 - Dot-separated property path.
 * @returns {unknown} The value found at the path, or `undefined` as soon as a
 *   segment has to be read from something that is not a non-null object.
 */
function getNestedValue(obj, path3) {
  let cursor = obj;
  for (const segment of path3.split(".")) {
    const canDescend = typeof cursor === "object" && cursor !== null;
    cursor = canDescend ? cursor[segment] : void 0;
  }
  return cursor;
}
|
|
7071
|
+
|
|
7072
|
+
// src/evals/mcpHost/adapters/cli/runner.ts
|
|
7073
|
+
// Fallback limit, in milliseconds, for a CLI host run when `cli.timeout` is not set (two minutes).
var DEFAULT_TIMEOUT = 120 * 1e3;
// Byte budget (10 MiB) for output captured from the spawned CLI process.
var MAX_BUFFER = 10 * 2 ** 20;
|
|
7075
|
+
/**
 * Selects the stdout parser for a CLI output format.
 *
 * @param {string} [format] - "stream-json" or "json"; a nullish value means "stream-json".
 * @returns {Function|undefined} The matching parser, or `undefined` for any other format.
 */
function getParser(format) {
  const effectiveFormat = format ?? "stream-json";
  if (effectiveFormat === "stream-json") {
    return parseStreamJson;
  }
  if (effectiveFormat === "json") {
    // Plain JSON output is read from fixed top-level keys.
    const paths = { toolCalls: "toolCalls", response: "response", success: "success" };
    return createJsonParser(paths);
  }
  return void 0;
}
|
|
7087
|
+
/**
 * Substitutes every `{{scenario}}` placeholder in the CLI arguments.
 *
 * @param {string[]} args - Argument templates from the CLI host configuration.
 * @param {string} scenario - Scenario text to insert verbatim.
 * @returns {string[]} A new array with the placeholders replaced; `args` is not modified.
 */
function interpolateArgs(args, scenario) {
  // A replacer function is used on purpose: with a string replacement,
  // `String.prototype.replace` would expand `$&`, `$1`, `$$`, ... that happen to
  // appear in the scenario text and corrupt the prompt passed to the CLI.
  return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, () => scenario));
}
|
|
7090
|
+
/**
 * Runs a CLI-based MCP host for one scenario and converts its output into a
 * simulation result.
 *
 * Steps: interpolate the scenario into the configured arguments, spawn the
 * command with a timeout, parse stdout with the parser for `outputFormat`,
 * then validate the parsed shape. Spawn failures, parse failures and invalid
 * shapes are all returned as `{ success: false, toolCalls: [], error }`.
 *
 * @param {{command: string, args: string[], outputFormat?: string, timeout?: number}} cliConfig
 * @param {string} scenario - Scenario text substituted into the arguments.
 * @returns {Promise<object>} The parsed simulation result or a failure result.
 */
async function runCLIHost(cliConfig, scenario) {
  const failure = (error) => ({ success: false, toolCalls: [], error });
  const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
  const args = interpolateArgs(cliConfig.args, scenario);
  const startTime = Date.now();
  let stdout;
  try {
    ({ stdout } = await spawnProcess(cliConfig.command, args, { timeout }));
  } catch (err) {
    const elapsed = Date.now() - startTime;
    const message = err instanceof Error ? err.message : String(err);
    // Timeouts are recognised purely by the wording of the error message.
    const looksLikeTimeout = ["TIMEOUT", "timed out"].some((marker) => message.includes(marker));
    return failure(
      looksLikeTimeout
        ? `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
        : `CLI host process failed: ${message}`
    );
  }
  const parse = getParser(cliConfig.outputFormat);
  let result;
  try {
    result = parse(stdout);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    // Include the head of stdout so the failure can be diagnosed from the report.
    return failure(`Failed to parse CLI host output: ${reason}\nstdout (first 500 chars): ${stdout.slice(0, 500)}`);
  }
  const validationError = validateSimulationResult(result);
  return validationError ? failure(`CLI host returned invalid result: ${validationError}`) : result;
}
|
|
7136
|
+
/**
 * Checks that a parsed value has the minimal shape of a simulation result:
 * a boolean `success` and a `toolCalls` array whose entries are objects with a
 * string `name` and a non-null object `arguments`.
 *
 * Never throws: malformed entries (including `null`/`undefined` tool calls)
 * are reported through the returned message.
 *
 * @param {unknown} result - Value produced by an output parser.
 * @returns {string|null} Description of the first problem found, or `null` when valid.
 */
function validateSimulationResult(result) {
  // `typeof null` is "object", which would make the messages misleading.
  const kindOf = (value) => value === null ? "null" : typeof value;
  if (result === null || typeof result !== "object") {
    return `Expected object, got ${kindOf(result)}`;
  }
  const { success, toolCalls } = result;
  if (typeof success !== "boolean") {
    return `"success" must be a boolean, got ${kindOf(success)}`;
  }
  if (!Array.isArray(toolCalls)) {
    return `"toolCalls" must be an array, got ${kindOf(toolCalls)}`;
  }
  for (let i = 0; i < toolCalls.length; i++) {
    const tc = toolCalls[i];
    // Guard first: reading `.name` from null/undefined would throw instead of
    // producing a validation message.
    if (tc === null || typeof tc !== "object") {
      return `toolCalls[${i}] must be an object, got ${kindOf(tc)}`;
    }
    if (typeof tc.name !== "string") {
      return `toolCalls[${i}].name must be a string, got ${kindOf(tc.name)}`;
    }
    if (typeof tc.arguments !== "object" || tc.arguments === null) {
      return `toolCalls[${i}].arguments must be an object, got ${kindOf(tc.arguments)}`;
    }
  }
  return null;
}
|
|
7158
|
+
/**
 * Spawns `command` with `args` (no shell), closes its stdin immediately and
 * collects stdout/stderr as UTF-8 text.
 *
 * Output capture shares one byte counter across both streams: once the
 * combined total exceeds MAX_BUFFER, further chunks are dropped.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{timeout: number}} options - `timeout` in milliseconds before SIGTERM is sent.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves on exit code 0.
 *   Rejects on spawn error, on timeout, or on a non-zero/null exit code
 *   (the rejection message then includes stderr when it is non-empty).
 */
function spawnProcess(command, args, options) {
  return new Promise((resolve2, reject) => {
    const child = child_process.spawn(command, args, { stdio: ["pipe", "pipe", "pipe"] });
    // The host gets no interactive input; closing stdin signals EOF right away.
    child.stdin.end();
    const captured = { stdout: [], stderr: [] };
    let totalBytes = 0;
    // Returns a "data" listener that appends to `sink` while the shared budget lasts.
    const collectInto = (sink) => (chunk) => {
      totalBytes += chunk.length;
      if (totalBytes > MAX_BUFFER) {
        return;
      }
      sink.push(chunk);
    };
    child.stdout.on("data", collectInto(captured.stdout));
    child.stderr.on("data", collectInto(captured.stderr));
    const onTimeout = () => {
      child.kill("SIGTERM");
      reject(new Error(`Process timed out after ${options.timeout}ms`));
    };
    const timer = setTimeout(onTimeout, options.timeout);
    child.on("error", (err) => {
      clearTimeout(timer);
      reject(err);
    });
    child.on("close", (code) => {
      clearTimeout(timer);
      const stdout = Buffer.concat(captured.stdout).toString("utf-8");
      const stderr = Buffer.concat(captured.stderr).toString("utf-8");
      if (code === 0) {
        resolve2({ stdout, stderr });
        return;
      }
      const stderrSuffix = stderr ? `\nstderr: ${stderr}` : "";
      reject(new Error(`Command failed with exit code ${code ?? "null"}` + stderrSuffix));
    });
  });
}
|
|
7204
|
+
|
|
6629
7205
|
// src/evals/mcpHost/mcpHostSimulation.ts
|
|
6630
7206
|
var vercelOrchestrator = createVercelOrchestrator();
|
|
6631
7207
|
var allProviders = [
|
|
@@ -6643,6 +7219,25 @@ var simulatorRegistry = new Map(
|
|
|
6643
7219
|
allProviders.map((p) => [p, vercelOrchestrator])
|
|
6644
7220
|
);
|
|
6645
7221
|
async function simulateMCPHost(mcp, scenario, config) {
|
|
7222
|
+
const hostType = config.hostType ?? "sdk";
|
|
7223
|
+
if (hostType === "cli") {
|
|
7224
|
+
if (!config.cli) {
|
|
7225
|
+
throw new Error(
|
|
7226
|
+
`mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
|
|
7227
|
+
);
|
|
7228
|
+
}
|
|
7229
|
+
return runCLIHost(config.cli, scenario);
|
|
7230
|
+
}
|
|
7231
|
+
if (hostType === "browser" || hostType === "desktop") {
|
|
7232
|
+
throw new Error(
|
|
7233
|
+
`Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
|
|
7234
|
+
);
|
|
7235
|
+
}
|
|
7236
|
+
if (!config.provider) {
|
|
7237
|
+
throw new Error(
|
|
7238
|
+
`mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
|
|
7239
|
+
);
|
|
7240
|
+
}
|
|
6646
7241
|
const simulator = simulatorRegistry.get(config.provider);
|
|
6647
7242
|
if (!simulator) {
|
|
6648
7243
|
throw new Error(
|
|
@@ -6834,17 +7429,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6834
7429
|
};
|
|
6835
7430
|
}
|
|
6836
7431
|
if (expectBlock.passesJudge !== void 0) {
|
|
6837
|
-
const
|
|
6838
|
-
const
|
|
6839
|
-
|
|
6840
|
-
|
|
6841
|
-
|
|
6842
|
-
|
|
6843
|
-
|
|
6844
|
-
|
|
6845
|
-
|
|
6846
|
-
|
|
6847
|
-
|
|
7432
|
+
const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
|
|
7433
|
+
const judgeResultEntries = await Promise.all(
|
|
7434
|
+
judgeConfigs.map(async (judgeConfig) => {
|
|
7435
|
+
const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
|
|
7436
|
+
const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
|
|
7437
|
+
const validation = await validateJudge(response, {
|
|
7438
|
+
...judgeConfig,
|
|
7439
|
+
reference: effectiveReference,
|
|
7440
|
+
reps: effectiveReps
|
|
7441
|
+
});
|
|
7442
|
+
const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
|
|
7443
|
+
return {
|
|
7444
|
+
pass: validation.pass,
|
|
7445
|
+
details: validation.message,
|
|
7446
|
+
score: validation.details?.score,
|
|
7447
|
+
reasoning: validation.details?.reasoning,
|
|
7448
|
+
judgeName,
|
|
7449
|
+
judgeProvider: validation.details?.judgeProvider,
|
|
7450
|
+
judgeModel: validation.details?.judgeModel
|
|
7451
|
+
};
|
|
7452
|
+
})
|
|
7453
|
+
);
|
|
7454
|
+
if (judgeResultEntries.length === 1) {
|
|
7455
|
+
results.judge = judgeResultEntries[0];
|
|
7456
|
+
} else {
|
|
7457
|
+
const allPassed = judgeResultEntries.every((r) => r.pass);
|
|
7458
|
+
const passCount = judgeResultEntries.filter((r) => r.pass).length;
|
|
7459
|
+
results.judge = {
|
|
7460
|
+
pass: allPassed,
|
|
7461
|
+
details: `${passCount}/${judgeResultEntries.length} judges passed`,
|
|
7462
|
+
judgeResults: judgeResultEntries
|
|
7463
|
+
};
|
|
7464
|
+
}
|
|
6848
7465
|
}
|
|
6849
7466
|
if (expectBlock.snapshot !== void 0) {
|
|
6850
7467
|
if (!config.playwrightExpect) {
|
|
@@ -6873,6 +7490,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6873
7490
|
}
|
|
6874
7491
|
return { expectations: results, toolPrecision, toolRecall };
|
|
6875
7492
|
}
|
|
7493
|
+
/**
 * Builds the `request` summary stored alongside an eval case result.
 *
 * For `mcp_host` cases it carries the scenario and a reduced host config
 * (`provider`, plus `model` when defined); for all other modes it carries the
 * tool `args`. Falsy description/scenario/args are omitted.
 *
 * @param {object} evalCase - Eval case definition.
 * @returns {object} Request summary for the result record.
 */
function buildRequest(evalCase) {
  const { description, mode, scenario, mcpHostConfig, args } = evalCase;
  const request = {};
  if (description) {
    request.description = description;
  }
  if (mode !== "mcp_host") {
    if (args) {
      request.args = args;
    }
    return request;
  }
  if (scenario) {
    request.scenario = scenario;
  }
  if (mcpHostConfig) {
    // Only provider/model are recorded; the rest of the host config is left out.
    const hostSummary = { provider: mcpHostConfig.provider };
    if (mcpHostConfig.model !== void 0) {
      hostSummary.model = mcpHostConfig.model;
    }
    request.mcpHostConfig = hostSummary;
  }
  return request;
}
|
|
6876
7511
|
/**
 * Type guard for host simulation results.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-null object that has both a
 *   `success` and a `toolCalls` property and `toolCalls` is an array.
 */
function isMCPHostSimulationResult(value) {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  if (!("success" in value) || !("toolCalls" in value)) {
    return false;
  }
  return Array.isArray(value.toolCalls);
}
|
|
@@ -6921,6 +7556,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6921
7556
|
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6922
7557
|
source: "eval",
|
|
6923
7558
|
pass: didCasePass(error, expectationResults),
|
|
7559
|
+
request: buildRequest(evalCase),
|
|
6924
7560
|
response,
|
|
6925
7561
|
error,
|
|
6926
7562
|
expectations: expectationResults,
|
|
@@ -6946,7 +7582,7 @@ function isInfrastructureError(err) {
|
|
|
6946
7582
|
} else {
|
|
6947
7583
|
return false;
|
|
6948
7584
|
}
|
|
6949
|
-
return name15 === "
|
|
7585
|
+
return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
|
|
6950
7586
|
msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
6951
7587
|
}
|
|
6952
7588
|
async function runEvalCase(evalCase, context, options = {}) {
|
|
@@ -7063,8 +7699,13 @@ async function runEvalDataset(options, context) {
|
|
|
7063
7699
|
const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
|
|
7064
7700
|
const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
|
|
7065
7701
|
const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
|
|
7066
|
-
|
|
7067
|
-
|
|
7702
|
+
if (c.expect?.passesJudge == null) return sum;
|
|
7703
|
+
const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
|
|
7704
|
+
const totalReps = judges.reduce(
|
|
7705
|
+
(r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
|
|
7706
|
+
0
|
|
7707
|
+
);
|
|
7708
|
+
return sum + effectiveIterations * totalReps;
|
|
7068
7709
|
}, 0);
|
|
7069
7710
|
if (estimatedJudgeCalls > 50) {
|
|
7070
7711
|
debugEval(
|
|
@@ -7421,6 +8062,7 @@ exports.EvalDatasetSchema = EvalDatasetSchema;
|
|
|
7421
8062
|
exports.MCPConfigSchema = MCPConfigSchema;
|
|
7422
8063
|
exports.MCP_PROTOCOL_VERSION = MCP_PROTOCOL_VERSION;
|
|
7423
8064
|
exports.SnapshotSanitizers = SnapshotSanitizers;
|
|
8065
|
+
exports.clearJudgeRegistry = clearJudgeRegistry;
|
|
7424
8066
|
exports.closeMCPClient = closeMCPClient;
|
|
7425
8067
|
exports.createJudge = createJudge;
|
|
7426
8068
|
exports.createMCPClientForConfig = createMCPClientForConfig;
|
|
@@ -7431,6 +8073,7 @@ exports.discoverProtectedResource = discoverProtectedResource;
|
|
|
7431
8073
|
exports.expect = expect;
|
|
7432
8074
|
exports.extractText = extractText;
|
|
7433
8075
|
exports.getMissingDependencyMessage = getMissingDependencyMessage;
|
|
8076
|
+
exports.getRegisteredJudge = getRegisteredJudge;
|
|
7434
8077
|
exports.getResponseSizeBytes = getResponseSizeBytes;
|
|
7435
8078
|
exports.hasValidTokens = hasValidTokens;
|
|
7436
8079
|
exports.injectTokens = injectTokens;
|
|
@@ -7451,6 +8094,8 @@ exports.normalizeWhitespace = normalizeWhitespace;
|
|
|
7451
8094
|
exports.performClientCredentialsFlow = performClientCredentialsFlow;
|
|
7452
8095
|
exports.performOAuthSetup = performOAuthSetup;
|
|
7453
8096
|
exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
|
|
8097
|
+
exports.refreshAccessToken = refreshAccessToken;
|
|
8098
|
+
exports.registerJudge = registerJudge;
|
|
7454
8099
|
exports.resolveRubric = resolveRubric;
|
|
7455
8100
|
exports.runConformanceChecks = runConformanceChecks;
|
|
7456
8101
|
exports.runEvalCase = runEvalCase;
|