@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -18,7 +18,7 @@ import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
18
18
  import { ProxyAgent, Agent } from 'undici';
19
19
  import { readFileSync } from 'fs';
20
20
  import { query } from '@anthropic-ai/claude-agent-sdk';
21
- import { execFile } from 'child_process';
21
+ import { execFile, spawn } from 'child_process';
22
22
  import { promisify } from 'util';
23
23
 
24
24
  var __defProp = Object.defineProperty;
@@ -3279,7 +3279,11 @@ async function performOAuthSetup(config) {
3279
3279
  const page = await context.newPage();
3280
3280
  page.setDefaultTimeout(timeoutMs);
3281
3281
  await page.goto(authorizationUrl.toString());
3282
- await completeLoginForm(page, config);
3282
+ if ("customLoginFlow" in config && config.customLoginFlow) {
3283
+ await config.customLoginFlow(page);
3284
+ } else {
3285
+ await completeLoginForm(page, config);
3286
+ }
3283
3287
  await page.waitForURL(
3284
3288
  (url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
3285
3289
  { timeout: timeoutMs }
@@ -4380,7 +4384,7 @@ function escapeHtml(text) {
4380
4384
 
4381
4385
  // package.json
4382
4386
  var package_default = {
4383
- version: "1.0.0-beta.7"};
4387
+ version: "1.0.0"};
4384
4388
 
4385
4389
  // src/mcp/clientFactory.ts
4386
4390
  function getRetryAfterDelayMs(err) {
@@ -4599,6 +4603,17 @@ async function createMCPClientForConfig(config, options) {
4599
4603
  }
4600
4604
  async function closeMCPClient(client) {
4601
4605
  try {
4606
+ const transport = client.transport;
4607
+ if (transport instanceof StreamableHTTPClientTransport) {
4608
+ try {
4609
+ await transport.terminateSession();
4610
+ } catch (sessionError) {
4611
+ debugClient(
4612
+ "Error terminating session: %s",
4613
+ sessionError instanceof Error ? sessionError.message : String(sessionError)
4614
+ );
4615
+ }
4616
+ }
4602
4617
  await client.close();
4603
4618
  } catch (error) {
4604
4619
  debugClient(
@@ -4827,11 +4842,13 @@ function validateSchema(response, schema, options = {}) {
4827
4842
  } catch (error) {
4828
4843
  const zodError = error;
4829
4844
  const issues = formatZodIssues(zodError);
4845
+ const text = stringifyResponse(response);
4830
4846
  return {
4831
4847
  pass: false,
4832
4848
  message: `Response does not match schema: ${issues}`,
4833
4849
  details: {
4834
- issues: zodError.issues
4850
+ issues: zodError.issues,
4851
+ textPreview: truncateForDisplay2(text)
4835
4852
  }
4836
4853
  };
4837
4854
  }
@@ -4884,6 +4901,12 @@ function formatZodIssues(error) {
4884
4901
  });
4885
4902
  return issues.join("; ");
4886
4903
  }
4904
+ function truncateForDisplay2(str, maxLength = 200) {
4905
+ if (str.length <= maxLength) {
4906
+ return str;
4907
+ }
4908
+ return str.slice(0, maxLength) + "... (truncated)";
4909
+ }
4887
4910
 
4888
4911
  // src/assertions/validators/text.ts
4889
4912
  function validateText(response, expected, options = {}) {
@@ -4910,11 +4933,11 @@ function validateText(response, expected, options = {}) {
4910
4933
  details: {
4911
4934
  missing,
4912
4935
  textLength: text.length,
4913
- textPreview: truncateForDisplay2(text)
4936
+ textPreview: truncateForDisplay3(text)
4914
4937
  }
4915
4938
  };
4916
4939
  }
4917
- function truncateForDisplay2(str, maxLength = 200) {
4940
+ function truncateForDisplay3(str, maxLength = 200) {
4918
4941
  if (str.length <= maxLength) {
4919
4942
  return str;
4920
4943
  }
@@ -4946,7 +4969,7 @@ function validatePattern(response, patterns, options = {}) {
4946
4969
  details: {
4947
4970
  unmatched,
4948
4971
  textLength: text.length,
4949
- textPreview: truncateForDisplay3(text)
4972
+ textPreview: truncateForDisplay4(text)
4950
4973
  }
4951
4974
  };
4952
4975
  }
@@ -4966,7 +4989,7 @@ function patternToString(pattern) {
4966
4989
  }
4967
4990
  return `/${pattern}/`;
4968
4991
  }
4969
- function truncateForDisplay3(str, maxLength = 200) {
4992
+ function truncateForDisplay4(str, maxLength = 200) {
4970
4993
  if (str.length <= maxLength) {
4971
4994
  return str;
4972
4995
  }
@@ -4989,7 +5012,7 @@ function validateError(response, expected = true) {
4989
5012
  pass: false,
4990
5013
  message: "Expected an error response but got success",
4991
5014
  details: {
4992
- textPreview: truncateForDisplay4(extractText2(response))
5015
+ textPreview: truncateForDisplay5(extractText2(response))
4993
5016
  }
4994
5017
  };
4995
5018
  } else {
@@ -5001,7 +5024,7 @@ function validateError(response, expected = true) {
5001
5024
  }
5002
5025
  return {
5003
5026
  pass: false,
5004
- message: `Expected a success response but got error: "${truncateForDisplay4(errorMessage)}"`,
5027
+ message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
5005
5028
  details: {
5006
5029
  errorMessage
5007
5030
  }
@@ -5014,7 +5037,7 @@ function validateError(response, expected = true) {
5014
5037
  pass: false,
5015
5038
  message: `Expected an error containing "${expectedMessages[0]}" but got success`,
5016
5039
  details: {
5017
- textPreview: truncateForDisplay4(extractText2(response))
5040
+ textPreview: truncateForDisplay5(extractText2(response))
5018
5041
  }
5019
5042
  };
5020
5043
  }
@@ -5036,7 +5059,7 @@ function validateError(response, expected = true) {
5036
5059
  }
5037
5060
  };
5038
5061
  }
5039
- function truncateForDisplay4(str, maxLength = 200) {
5062
+ function truncateForDisplay5(str, maxLength = 200) {
5040
5063
  if (str.length <= maxLength) {
5041
5064
  return str;
5042
5065
  }
@@ -5097,9 +5120,17 @@ function formatBytes(bytes) {
5097
5120
  function isSimulationResult(value) {
5098
5121
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
5099
5122
  }
5123
+ function isPatternMatcher(v) {
5124
+ return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
5125
+ }
5100
5126
  function partialMatch(actual, expected) {
5101
5127
  return Object.entries(expected).every(([k, v]) => {
5102
5128
  const actualVal = actual[k];
5129
+ if (isPatternMatcher(v)) {
5130
+ if (typeof actualVal !== "string") return false;
5131
+ const re = new RegExp(v.$pattern, v.$flags);
5132
+ return re.test(actualVal);
5133
+ }
5103
5134
  if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
5104
5135
  return partialMatch(
5105
5136
  actualVal,
@@ -5146,6 +5177,10 @@ function validateToolCalls(response, expectation) {
5146
5177
  return {
5147
5178
  pass: false,
5148
5179
  message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
5180
+ details: {
5181
+ actual: actual.map((c) => c.name),
5182
+ expected: expected.name
5183
+ },
5149
5184
  metrics
5150
5185
  };
5151
5186
  }
@@ -5162,6 +5197,10 @@ function validateToolCalls(response, expectation) {
5162
5197
  return {
5163
5198
  pass: false,
5164
5199
  message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
5200
+ details: {
5201
+ actual: actual.map((c) => c.name),
5202
+ expected: expected.name
5203
+ },
5165
5204
  metrics
5166
5205
  };
5167
5206
  }
@@ -5174,6 +5213,10 @@ function validateToolCalls(response, expectation) {
5174
5213
  return {
5175
5214
  pass: false,
5176
5215
  message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
5216
+ details: {
5217
+ actual: actual.map((c) => c.name),
5218
+ unexpected: unexpected.map((c) => c.name)
5219
+ },
5177
5220
  metrics
5178
5221
  };
5179
5222
  }
@@ -5192,19 +5235,22 @@ function validateToolCallCount(response, options) {
5192
5235
  if (exact !== void 0 && count !== exact) {
5193
5236
  return {
5194
5237
  pass: false,
5195
- message: `Expected exactly ${exact} tool call(s), but got ${count}`
5238
+ message: `Expected exactly ${exact} tool call(s), but got ${count}`,
5239
+ details: { actual: count, expected: exact }
5196
5240
  };
5197
5241
  }
5198
5242
  if (min !== void 0 && count < min) {
5199
5243
  return {
5200
5244
  pass: false,
5201
- message: `Expected at least ${min} tool call(s), but got ${count}`
5245
+ message: `Expected at least ${min} tool call(s), but got ${count}`,
5246
+ details: { actual: count, min }
5202
5247
  };
5203
5248
  }
5204
5249
  if (max !== void 0 && count > max) {
5205
5250
  return {
5206
5251
  pass: false,
5207
- message: `Expected at most ${max} tool call(s), but got ${count}`
5252
+ message: `Expected at most ${max} tool call(s), but got ${count}`,
5253
+ details: { actual: count, max }
5208
5254
  };
5209
5255
  }
5210
5256
  return {
@@ -5238,7 +5284,175 @@ var JudgeResponseSchema = z.object({
5238
5284
  reasoning: z.string()
5239
5285
  });
5240
5286
 
5241
- // src/judge/claudeAgentJudge.ts
5287
+ // src/judge/anthropicJudge.ts
5288
+ function createAnthropicJudge(config = {}) {
5289
+ const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
5290
+ const apiKey = process.env[apiKeyEnvVar];
5291
+ if (!apiKey) {
5292
+ throw new Error(
5293
+ `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
5294
+ );
5295
+ }
5296
+ const model = config.model ?? "claude-sonnet-4-20250514";
5297
+ const maxTokens = config.maxTokens ?? 1e3;
5298
+ const temperature = config.temperature ?? 0;
5299
+ return {
5300
+ async evaluate(candidate, reference, rubric) {
5301
+ let anthropicModule;
5302
+ try {
5303
+ anthropicModule = await import('@anthropic-ai/sdk');
5304
+ } catch (err) {
5305
+ throw new Error(
5306
+ `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
5307
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5308
+ );
5309
+ }
5310
+ const client = new anthropicModule.default({ apiKey });
5311
+ const prompt = buildJudgePrompt(candidate, reference, rubric);
5312
+ const startTime = Date.now();
5313
+ const response = await client.messages.create({
5314
+ model,
5315
+ max_tokens: maxTokens,
5316
+ temperature,
5317
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5318
+ messages: [{ role: "user", content: prompt }]
5319
+ });
5320
+ const durationMs = Date.now() - startTime;
5321
+ const textBlock = response.content.find(
5322
+ (b) => b.type === "text"
5323
+ );
5324
+ const text = textBlock?.text ?? "";
5325
+ const parsed = parseJudgeResponse(text);
5326
+ return {
5327
+ pass: parsed.pass,
5328
+ score: parsed.score,
5329
+ reasoning: parsed.reasoning,
5330
+ usage: {
5331
+ inputTokens: response.usage?.input_tokens ?? 0,
5332
+ outputTokens: response.usage?.output_tokens ?? 0,
5333
+ totalCostUsd: 0,
5334
+ durationMs
5335
+ }
5336
+ };
5337
+ }
5338
+ };
5339
+ }
5340
+ function buildJudgePrompt(candidate, reference, rubric) {
5341
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5342
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5343
+ return `Rubric:
5344
+ ${rubric}
5345
+
5346
+ <candidate_response>
5347
+ ${candidateStr}
5348
+ </candidate_response>
5349
+
5350
+ <reference_answer>
5351
+ ${referenceStr ?? "No reference provided."}
5352
+ </reference_answer>
5353
+
5354
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5355
+ }
5356
+ function parseJudgeResponse(text) {
5357
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5358
+ let parsed;
5359
+ try {
5360
+ parsed = JSON.parse(cleaned);
5361
+ } catch {
5362
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5363
+ }
5364
+ const result = JudgeResponseSchema.safeParse(parsed);
5365
+ if (!result.success) {
5366
+ throw new Error(
5367
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5368
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5369
+ );
5370
+ }
5371
+ return result.data;
5372
+ }
5373
+
5374
+ // src/judge/vertexAnthropicJudge.ts
5375
+ function createVertexAnthropicJudge(config = {}) {
5376
+ const model = config.model ?? "claude-sonnet-4-20250514";
5377
+ const maxTokens = config.maxTokens ?? 1e3;
5378
+ const temperature = config.temperature ?? 0;
5379
+ return {
5380
+ async evaluate(candidate, reference, rubric) {
5381
+ let vertexModule;
5382
+ try {
5383
+ vertexModule = await import('@anthropic-ai/vertex-sdk');
5384
+ } catch (err) {
5385
+ throw new Error(
5386
+ `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
5387
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5388
+ );
5389
+ }
5390
+ const client = new vertexModule.AnthropicVertex({
5391
+ projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
5392
+ region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
5393
+ });
5394
+ const prompt = buildJudgePrompt2(candidate, reference, rubric);
5395
+ const startTime = Date.now();
5396
+ const response = await client.messages.create({
5397
+ model,
5398
+ max_tokens: maxTokens,
5399
+ temperature,
5400
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5401
+ messages: [{ role: "user", content: prompt }]
5402
+ });
5403
+ const durationMs = Date.now() - startTime;
5404
+ const textBlock = response.content.find(
5405
+ (b) => b.type === "text"
5406
+ );
5407
+ const text = textBlock?.text ?? "";
5408
+ const parsed = parseJudgeResponse2(text);
5409
+ return {
5410
+ pass: parsed.pass,
5411
+ score: parsed.score,
5412
+ reasoning: parsed.reasoning,
5413
+ usage: {
5414
+ inputTokens: response.usage?.input_tokens ?? 0,
5415
+ outputTokens: response.usage?.output_tokens ?? 0,
5416
+ totalCostUsd: 0,
5417
+ durationMs
5418
+ }
5419
+ };
5420
+ }
5421
+ };
5422
+ }
5423
+ function buildJudgePrompt2(candidate, reference, rubric) {
5424
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5425
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5426
+ return `Rubric:
5427
+ ${rubric}
5428
+
5429
+ <candidate_response>
5430
+ ${candidateStr}
5431
+ </candidate_response>
5432
+
5433
+ <reference_answer>
5434
+ ${referenceStr ?? "No reference provided."}
5435
+ </reference_answer>
5436
+
5437
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5438
+ }
5439
+ function parseJudgeResponse2(text) {
5440
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5441
+ let parsed;
5442
+ try {
5443
+ parsed = JSON.parse(cleaned);
5444
+ } catch {
5445
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5446
+ }
5447
+ const result = JudgeResponseSchema.safeParse(parsed);
5448
+ if (!result.success) {
5449
+ throw new Error(
5450
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5451
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5452
+ );
5453
+ }
5454
+ return result.data;
5455
+ }
5242
5456
  function createClaudeAgentJudge(config) {
5243
5457
  const model = config.model ?? "claude-sonnet-4-20250514";
5244
5458
  const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
@@ -5256,7 +5470,7 @@ function createClaudeAgentJudge(config) {
5256
5470
  exceedsMaxToolOutputSize: true
5257
5471
  };
5258
5472
  }
5259
- const prompt = buildJudgePrompt(candidate, reference, rubric);
5473
+ const prompt = buildJudgePrompt3(candidate, reference, rubric);
5260
5474
  try {
5261
5475
  let resultMessage;
5262
5476
  for await (const message of query({
@@ -5288,7 +5502,7 @@ function createClaudeAgentJudge(config) {
5288
5502
  );
5289
5503
  }
5290
5504
  const responseText = resultMessage.result ?? "";
5291
- const parsed = parseJudgeResponse(responseText);
5505
+ const parsed = parseJudgeResponse3(responseText);
5292
5506
  const usage = {
5293
5507
  inputTokens: resultMessage.usage?.input_tokens ?? 0,
5294
5508
  outputTokens: resultMessage.usage?.output_tokens ?? 0,
@@ -5317,7 +5531,7 @@ function createClaudeAgentJudge(config) {
5317
5531
  function buildSystemPrompt() {
5318
5532
  return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
5319
5533
  }
5320
- function buildJudgePrompt(candidate, reference, rubric) {
5534
+ function buildJudgePrompt3(candidate, reference, rubric) {
5321
5535
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5322
5536
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5323
5537
  const parts = [];
@@ -5334,7 +5548,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
5334
5548
  );
5335
5549
  return parts.join("");
5336
5550
  }
5337
- function parseJudgeResponse(text) {
5551
+ function parseJudgeResponse3(text) {
5338
5552
  let jsonText = text.trim();
5339
5553
  if (jsonText.startsWith("```json")) {
5340
5554
  jsonText = jsonText.slice(7);
@@ -5391,7 +5605,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5391
5605
  );
5392
5606
  }
5393
5607
  const client = new openaiModule.default({ apiKey });
5394
- const prompt = buildJudgePrompt2(candidate, reference, rubric);
5608
+ const prompt = buildJudgePrompt4(candidate, reference, rubric);
5395
5609
  const startTime = Date.now();
5396
5610
  const completion = await client.chat.completions.create({
5397
5611
  model,
@@ -5407,7 +5621,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5407
5621
  });
5408
5622
  const durationMs = Date.now() - startTime;
5409
5623
  const text = completion.choices[0]?.message.content ?? "";
5410
- const parsed = parseJudgeResponse2(text);
5624
+ const parsed = parseJudgeResponse4(text);
5411
5625
  return {
5412
5626
  pass: parsed.pass,
5413
5627
  score: parsed.score,
@@ -5422,7 +5636,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5422
5636
  }
5423
5637
  };
5424
5638
  }
5425
- function buildJudgePrompt2(candidate, reference, rubric) {
5639
+ function buildJudgePrompt4(candidate, reference, rubric) {
5426
5640
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5427
5641
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5428
5642
  return `Rubric:
@@ -5438,7 +5652,7 @@ ${referenceStr ?? "No reference provided."}
5438
5652
 
5439
5653
  Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5440
5654
  }
5441
- function parseJudgeResponse2(text) {
5655
+ function parseJudgeResponse4(text) {
5442
5656
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5443
5657
  let parsed;
5444
5658
  try {
@@ -5540,14 +5754,48 @@ function createJudge(config = {}) {
5540
5754
  const provider = config.provider ?? "anthropic";
5541
5755
  switch (provider) {
5542
5756
  case "anthropic":
5757
+ return createAnthropicJudge(config);
5758
+ case "vertex-anthropic":
5759
+ return createVertexAnthropicJudge(config);
5760
+ case "anthropic-agent-sdk":
5543
5761
  return createClaudeAgentJudge(config);
5544
5762
  case "openai":
5545
5763
  return createOpenAIJudge(config);
5546
5764
  case "google":
5547
5765
  return createGoogleJudge(config);
5548
5766
  default:
5549
- throw new Error(`Unsupported LLM provider: ${String(provider)}`);
5767
+ throw new Error(
5768
+ `Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
5769
+ );
5770
+ }
5771
+ }
5772
+
5773
+ // src/judge/judgeRegistry.ts
5774
+ var registry = /* @__PURE__ */ new Map();
5775
+ function registerJudge(name15, executor) {
5776
+ const existing = registry.get(name15);
5777
+ if (existing !== void 0) {
5778
+ if (existing === executor) {
5779
+ return;
5780
+ }
5781
+ throw new Error(
5782
+ `Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
5783
+ );
5784
+ }
5785
+ registry.set(name15, executor);
5786
+ }
5787
+ function getRegisteredJudge(name15) {
5788
+ const executor = registry.get(name15);
5789
+ if (!executor) {
5790
+ const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
5791
+ throw new Error(
5792
+ `Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
5793
+ );
5550
5794
  }
5795
+ return executor;
5796
+ }
5797
+ function clearJudgeRegistry() {
5798
+ registry.clear();
5551
5799
  }
5552
5800
 
5553
5801
  // src/assertions/validators/judge.ts
@@ -5558,6 +5806,7 @@ function computeStdDev(scores, mean) {
5558
5806
  }
5559
5807
  async function validateJudge(response, config) {
5560
5808
  const {
5809
+ judge: judgeName,
5561
5810
  rubric,
5562
5811
  reference,
5563
5812
  threshold = 0.7,
@@ -5570,6 +5819,29 @@ async function validateJudge(response, config) {
5570
5819
  maxBudgetUsd,
5571
5820
  maxToolOutputSize
5572
5821
  } = config;
5822
+ if (judgeName !== void 0) {
5823
+ try {
5824
+ const executor = getRegisteredJudge(judgeName);
5825
+ const judgeResult = await executor(response, reference ?? void 0);
5826
+ const score = judgeResult.score;
5827
+ const passed = score >= threshold;
5828
+ return {
5829
+ pass: passed,
5830
+ message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
5831
+ };
5832
+ } catch (err) {
5833
+ return {
5834
+ pass: false,
5835
+ message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
5836
+ };
5837
+ }
5838
+ }
5839
+ if (rubric === void 0) {
5840
+ return {
5841
+ pass: false,
5842
+ message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
5843
+ };
5844
+ }
5573
5845
  const resolvedRubric = resolveRubric(rubric);
5574
5846
  const judgeConfig = {
5575
5847
  ...provider !== void 0 && { provider },
@@ -5616,11 +5888,17 @@ async function validateJudge(response, config) {
5616
5888
  return {
5617
5889
  pass: passed,
5618
5890
  message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
5619
- details: reps > 1 ? {
5620
- scores,
5621
- scoreStdDev: stdDev,
5622
- highVariance
5623
- } : void 0
5891
+ details: {
5892
+ score: meanScore,
5893
+ reasoning: lastReasoning,
5894
+ judgeProvider: provider ?? "anthropic",
5895
+ judgeModel: model,
5896
+ ...reps > 1 && {
5897
+ scores,
5898
+ scoreStdDev: stdDev,
5899
+ highVariance
5900
+ }
5901
+ }
5624
5902
  };
5625
5903
  } catch (err) {
5626
5904
  return {
@@ -5813,12 +6091,19 @@ function toMatchToolResponse(received, expected) {
5813
6091
  // src/assertions/matchers/toMatchToolSchema.ts
5814
6092
  function toMatchToolSchema(received, schema, options = {}) {
5815
6093
  const result = validateSchema(received, schema, options);
6094
+ const preview = result.details?.textPreview;
5816
6095
  return {
5817
6096
  pass: result.pass,
5818
6097
  message: () => {
5819
6098
  if (this.isNot) {
5820
6099
  return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
5821
6100
  }
6101
+ if (!result.pass && preview) {
6102
+ return `${result.message}
6103
+
6104
+ Actual response (truncated):
6105
+ ${preview}`;
6106
+ }
5822
6107
  return result.message;
5823
6108
  }
5824
6109
  };
@@ -5827,6 +6112,7 @@ function toMatchToolSchema(received, schema, options = {}) {
5827
6112
  // src/assertions/matchers/toContainToolText.ts
5828
6113
  function toContainToolText(received, expected, options = {}) {
5829
6114
  const result = validateText(received, expected, options);
6115
+ const preview = result.details?.textPreview;
5830
6116
  return {
5831
6117
  pass: result.pass,
5832
6118
  message: () => {
@@ -5834,6 +6120,12 @@ function toContainToolText(received, expected, options = {}) {
5834
6120
  const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
5835
6121
  return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
5836
6122
  }
6123
+ if (!result.pass && preview) {
6124
+ return `${result.message}
6125
+
6126
+ Actual response (truncated):
6127
+ ${preview}`;
6128
+ }
5837
6129
  return result.message;
5838
6130
  }
5839
6131
  };
@@ -5842,12 +6134,19 @@ function toContainToolText(received, expected, options = {}) {
5842
6134
  // src/assertions/matchers/toMatchToolPattern.ts
5843
6135
  function toMatchToolPattern(received, patterns, options = {}) {
5844
6136
  const result = validatePattern(received, patterns, options);
6137
+ const preview = result.details?.textPreview;
5845
6138
  return {
5846
6139
  pass: result.pass,
5847
6140
  message: () => {
5848
6141
  if (this.isNot) {
5849
6142
  return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
5850
6143
  }
6144
+ if (!result.pass && preview) {
6145
+ return `${result.message}
6146
+
6147
+ Actual response (truncated):
6148
+ ${preview}`;
6149
+ }
5851
6150
  return result.message;
5852
6151
  }
5853
6152
  };
@@ -5999,31 +6298,68 @@ function toBeToolError(received, expected = true) {
5999
6298
 
6000
6299
  // src/assertions/matchers/toPassToolJudge.ts
6001
6300
  var DEFAULT_PASSING_THRESHOLD = 0.7;
6002
- async function toPassToolJudge(received, rubric, options = {}) {
6301
+ async function runSingleJudge(received, rubric, options) {
6003
6302
  const {
6004
6303
  reference = null,
6005
6304
  passingThreshold = DEFAULT_PASSING_THRESHOLD,
6006
6305
  reps,
6007
6306
  provider,
6008
- model
6307
+ model,
6308
+ judge
6009
6309
  } = options;
6010
6310
  const validation = await validateJudge(received, {
6011
- rubric,
6311
+ ...rubric !== void 0 && { rubric },
6012
6312
  reference: reference ?? void 0,
6013
6313
  threshold: passingThreshold,
6014
6314
  ...reps !== void 0 && { reps },
6015
6315
  ...provider !== void 0 && { provider },
6016
- ...model !== void 0 && { model }
6316
+ ...model !== void 0 && { model },
6317
+ ...judge !== void 0 && { judge }
6017
6318
  });
6319
+ return { pass: validation.pass, message: validation.message };
6320
+ }
6321
+ async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
6322
+ if (Array.isArray(rubricOrOptions)) {
6323
+ const results = await Promise.all(
6324
+ rubricOrOptions.map(async (judgeConfig) => {
6325
+ const { rubric: r, ...opts } = judgeConfig;
6326
+ return runSingleJudge(received, r, opts);
6327
+ })
6328
+ );
6329
+ const allPassed = results.every((r) => r.pass);
6330
+ const passCount = results.filter((r) => r.pass).length;
6331
+ const summary = `${passCount}/${results.length} judges passed`;
6332
+ const details = results.map((r) => r.message).join("\n");
6333
+ if (this.isNot) {
6334
+ return {
6335
+ pass: !allPassed,
6336
+ message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
6337
+ };
6338
+ }
6339
+ return {
6340
+ pass: allPassed,
6341
+ message: () => `${summary}
6342
+ ${details}`
6343
+ };
6344
+ }
6345
+ let rubric;
6346
+ let options;
6347
+ if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
6348
+ rubric = rubricOrOptions;
6349
+ options = maybeOptions ?? {};
6350
+ } else {
6351
+ options = rubricOrOptions;
6352
+ }
6353
+ const result = await runSingleJudge(received, rubric, options);
6018
6354
  if (this.isNot) {
6019
6355
  return {
6020
- pass: !validation.pass,
6021
- message: () => validation.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6356
+ pass: !result.pass,
6357
+ message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6022
6358
  };
6023
6359
  }
6024
6360
  return {
6025
- pass: validation.pass,
6026
- message: () => validation.message
6361
+ pass: result.pass,
6362
+ message: () => result.message
6027
6363
  };
6028
6364
  }
6029
6365
 
@@ -6307,6 +6643,7 @@ function getAuthConfigFromEnv() {
6307
6643
  return void 0;
6308
6644
  }
6309
6645
  var MCPHostConfigSchema = z.object({
6646
+ hostType: z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
6310
6647
  provider: z.enum([
6311
6648
  "openai",
6312
6649
  "anthropic",
@@ -6317,12 +6654,18 @@ var MCPHostConfigSchema = z.object({
6317
6654
  "openrouter",
6318
6655
  "xai",
6319
6656
  "vertex-anthropic"
6320
- ]),
6657
+ ]).optional(),
6321
6658
  apiKeyEnvVar: z.string().optional(),
6322
6659
  model: z.string().optional(),
6323
6660
  maxTokens: z.number().optional(),
6324
6661
  temperature: z.number().optional(),
6325
- maxToolCalls: z.number().optional()
6662
+ maxToolCalls: z.number().optional(),
6663
+ cli: z.object({
6664
+ command: z.string(),
6665
+ args: z.array(z.string()),
6666
+ outputFormat: z.enum(["stream-json", "json"]).optional(),
6667
+ timeout: z.number().optional()
6668
+ }).optional()
6326
6669
  });
6327
6670
  var SnapshotSanitizerSchema = z.union([
6328
6671
  // Built-in sanitizers
@@ -6337,6 +6680,37 @@ var SnapshotSanitizerSchema = z.union([
6337
6680
  remove: z.array(z.string())
6338
6681
  })
6339
6682
  ]);
6683
+ var JudgeExpectConfigSchema = z.object({
6684
+ judge: z.string().min(1).optional(),
6685
+ rubric: z.union([
6686
+ z.enum([
6687
+ "correctness",
6688
+ "completeness",
6689
+ "groundedness",
6690
+ "instruction-following",
6691
+ "conciseness"
6692
+ ]),
6693
+ z.object({ text: z.string().min(1) })
6694
+ ]).optional(),
6695
+ reference: z.unknown().optional(),
6696
+ threshold: z.number().min(0).max(1).optional(),
6697
+ reps: z.number().int().min(1).optional(),
6698
+ provider: z.enum([
6699
+ "anthropic",
6700
+ "vertex-anthropic",
6701
+ "anthropic-agent-sdk",
6702
+ "openai",
6703
+ "google"
6704
+ ]).optional(),
6705
+ model: z.string().optional(),
6706
+ apiKeyEnvVar: z.string().optional(),
6707
+ maxTokens: z.number().int().positive().optional(),
6708
+ temperature: z.number().min(0).max(1).optional(),
6709
+ maxBudgetUsd: z.number().positive().optional(),
6710
+ maxToolOutputSize: z.number().int().positive().optional()
6711
+ }).refine((data) => data.judge !== void 0 || data.rubric !== void 0, {
6712
+ message: 'Either "judge" or "rubric" must be provided in passesJudge'
6713
+ });
6340
6714
  var EvalExpectBlockSchema = z.object({
6341
6715
  response: z.unknown().optional(),
6342
6716
  schema: z.string().optional(),
@@ -6345,28 +6719,7 @@ var EvalExpectBlockSchema = z.object({
6345
6719
  snapshot: z.string().optional(),
6346
6720
  snapshotSanitizers: z.array(SnapshotSanitizerSchema).optional(),
6347
6721
  isError: z.union([z.boolean(), z.string(), z.array(z.string())]).optional(),
6348
- passesJudge: z.object({
6349
- rubric: z.union([
6350
- z.enum([
6351
- "correctness",
6352
- "completeness",
6353
- "groundedness",
6354
- "instruction-following",
6355
- "conciseness"
6356
- ]),
6357
- z.object({ text: z.string().min(1) })
6358
- ]),
6359
- reference: z.unknown().optional(),
6360
- threshold: z.number().min(0).max(1).optional(),
6361
- reps: z.number().int().min(1).optional(),
6362
- provider: z.enum(["anthropic", "openai", "google"]).optional(),
6363
- model: z.string().optional(),
6364
- apiKeyEnvVar: z.string().optional(),
6365
- maxTokens: z.number().int().positive().optional(),
6366
- temperature: z.number().min(0).max(1).optional(),
6367
- maxBudgetUsd: z.number().positive().optional(),
6368
- maxToolOutputSize: z.number().int().positive().optional()
6369
- }).optional(),
6722
+ passesJudge: z.union([JudgeExpectConfigSchema, z.array(JudgeExpectConfigSchema).min(1)]).optional(),
6370
6723
  responseSize: z.object({
6371
6724
  maxBytes: z.number().optional(),
6372
6725
  minBytes: z.number().optional()
@@ -6539,6 +6892,9 @@ function createVercelOrchestrator() {
6539
6892
  try {
6540
6893
  const { generateText, stepCountIs } = await import('ai');
6541
6894
  const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
6895
+ if (!config.provider) {
6896
+ throw new Error("provider is required for SDK host type");
6897
+ }
6542
6898
  const modelId = config.model ?? defaultModel(config.provider);
6543
6899
  const model = await loadModel(config.provider, modelId);
6544
6900
  const mcpTools = await mcp.listTools();
@@ -6592,13 +6948,233 @@ function createVercelOrchestrator() {
6592
6948
  return {
6593
6949
  success: false,
6594
6950
  toolCalls: [],
6595
- error: enrichErrorMessage(err, config.provider)
6951
+ error: enrichErrorMessage(err, config.provider ?? "unknown")
6596
6952
  };
6597
6953
  }
6598
6954
  }
6599
6955
  };
6600
6956
  }
6601
6957
 
6958
+ // src/evals/mcpHost/adapters/cli/parsers.ts
6959
+ function parseStreamJson(stdout) {
6960
+ const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
6961
+ const toolCalls = [];
6962
+ const textParts = [];
6963
+ const conversationHistory = [];
6964
+ for (const line of lines) {
6965
+ let event;
6966
+ try {
6967
+ event = JSON.parse(line);
6968
+ } catch {
6969
+ continue;
6970
+ }
6971
+ if (event.type === "assistant" && event.message?.content) {
6972
+ for (const block of event.message.content) {
6973
+ if (block.type === "tool_use" && block.name) {
6974
+ const rawName = block.name;
6975
+ const mcpMatch = /^mcp__[^_]+__(.+)$/.exec(rawName);
6976
+ toolCalls.push({
6977
+ name: mcpMatch ? mcpMatch[1] : rawName,
6978
+ arguments: block.input ?? {},
6979
+ id: block.id
6980
+ });
6981
+ }
6982
+ if (block.type === "text" && block.text) {
6983
+ textParts.push(block.text);
6984
+ }
6985
+ }
6986
+ }
6987
+ if (event.type === "user" && event.message?.content) {
6988
+ for (const block of event.message.content) {
6989
+ if (block.type === "tool_result") {
6990
+ const content = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
6991
+ conversationHistory.push({ role: "tool", content });
6992
+ }
6993
+ }
6994
+ }
6995
+ if (event.type === "result" && typeof event.result === "string") {
6996
+ if (textParts.length === 0) {
6997
+ textParts.push(event.result);
6998
+ }
6999
+ }
7000
+ if (event.type === "result" && event.is_error === true) {
7001
+ return {
7002
+ success: false,
7003
+ toolCalls,
7004
+ error: typeof event.result === "string" ? event.result : "CLI host reported an error"
7005
+ };
7006
+ }
7007
+ }
7008
+ const response = textParts.join("");
7009
+ if (response) {
7010
+ conversationHistory.push({ role: "assistant", content: response });
7011
+ }
7012
+ return {
7013
+ success: true,
7014
+ toolCalls,
7015
+ response: response || void 0,
7016
+ conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
7017
+ };
7018
+ }
7019
+ function createJsonParser(paths) {
7020
+ return (stdout) => {
7021
+ const data = JSON.parse(stdout);
7022
+ const rawToolCalls = getNestedValue(data, paths.toolCalls);
7023
+ const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls.map((tc) => ({
7024
+ name: typeof tc.name === "string" ? tc.name : "",
7025
+ arguments: tc.arguments ?? tc.args ?? {}
7026
+ })) : [];
7027
+ const response = getNestedValue(data, paths.response);
7028
+ const success = paths.success ? Boolean(getNestedValue(data, paths.success)) : true;
7029
+ return {
7030
+ success,
7031
+ toolCalls,
7032
+ response: typeof response === "string" ? response : void 0
7033
+ };
7034
+ };
7035
+ }
7036
+ function getNestedValue(obj, path3) {
7037
+ return path3.split(".").reduce((current, key) => {
7038
+ if (current !== null && typeof current === "object") {
7039
+ return current[key];
7040
+ }
7041
+ return void 0;
7042
+ }, obj);
7043
+ }
7044
+
7045
+ // src/evals/mcpHost/adapters/cli/runner.ts
7046
+ var DEFAULT_TIMEOUT = 12e4;
7047
+ var MAX_BUFFER = 10 * 1024 * 1024;
7048
+ function getParser(format) {
7049
+ switch (format ?? "stream-json") {
7050
+ case "stream-json":
7051
+ return parseStreamJson;
7052
+ case "json":
7053
+ return createJsonParser({
7054
+ toolCalls: "toolCalls",
7055
+ response: "response",
7056
+ success: "success"
7057
+ });
7058
+ }
7059
+ }
7060
+ function interpolateArgs(args, scenario) {
7061
+ return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, scenario));
7062
+ }
7063
+ async function runCLIHost(cliConfig, scenario) {
7064
+ const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
7065
+ const args = interpolateArgs(cliConfig.args, scenario);
7066
+ const startTime = Date.now();
7067
+ let stdout;
7068
+ try {
7069
+ const result2 = await spawnProcess(cliConfig.command, args, { timeout });
7070
+ stdout = result2.stdout;
7071
+ } catch (err) {
7072
+ const elapsed = Date.now() - startTime;
7073
+ const message = err instanceof Error ? err.message : String(err);
7074
+ if (message.includes("TIMEOUT") || message.includes("timed out")) {
7075
+ return {
7076
+ success: false,
7077
+ toolCalls: [],
7078
+ error: `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
7079
+ };
7080
+ }
7081
+ return {
7082
+ success: false,
7083
+ toolCalls: [],
7084
+ error: `CLI host process failed: ${message}`
7085
+ };
7086
+ }
7087
+ const parse = getParser(cliConfig.outputFormat);
7088
+ let result;
7089
+ try {
7090
+ result = parse(stdout);
7091
+ } catch (err) {
7092
+ return {
7093
+ success: false,
7094
+ toolCalls: [],
7095
+ error: `Failed to parse CLI host output: ${err instanceof Error ? err.message : String(err)}
7096
+ stdout (first 500 chars): ${stdout.slice(0, 500)}`
7097
+ };
7098
+ }
7099
+ const validationError = validateSimulationResult(result);
7100
+ if (validationError) {
7101
+ return {
7102
+ success: false,
7103
+ toolCalls: [],
7104
+ error: `CLI host returned invalid result: ${validationError}`
7105
+ };
7106
+ }
7107
+ return result;
7108
+ }
7109
+ function validateSimulationResult(result) {
7110
+ if (result === null || typeof result !== "object") {
7111
+ return `Expected object, got ${typeof result}`;
7112
+ }
7113
+ const obj = result;
7114
+ if (typeof obj.success !== "boolean") {
7115
+ return `"success" must be a boolean, got ${typeof obj.success}`;
7116
+ }
7117
+ if (!Array.isArray(obj.toolCalls)) {
7118
+ return `"toolCalls" must be an array, got ${typeof obj.toolCalls}`;
7119
+ }
7120
+ for (let i = 0; i < obj.toolCalls.length; i++) {
7121
+ const tc = obj.toolCalls[i];
7122
+ if (typeof tc.name !== "string") {
7123
+ return `toolCalls[${i}].name must be a string, got ${typeof tc.name}`;
7124
+ }
7125
+ if (typeof tc.arguments !== "object" || tc.arguments === null) {
7126
+ return `toolCalls[${i}].arguments must be an object, got ${typeof tc.arguments}`;
7127
+ }
7128
+ }
7129
+ return null;
7130
+ }
7131
+ function spawnProcess(command, args, options) {
7132
+ return new Promise((resolve2, reject) => {
7133
+ const child = spawn(command, args, {
7134
+ stdio: ["pipe", "pipe", "pipe"]
7135
+ });
7136
+ child.stdin.end();
7137
+ const stdoutChunks = [];
7138
+ const stderrChunks = [];
7139
+ let totalBytes = 0;
7140
+ child.stdout.on("data", (chunk) => {
7141
+ totalBytes += chunk.length;
7142
+ if (totalBytes <= MAX_BUFFER) {
7143
+ stdoutChunks.push(chunk);
7144
+ }
7145
+ });
7146
+ child.stderr.on("data", (chunk) => {
7147
+ totalBytes += chunk.length;
7148
+ if (totalBytes <= MAX_BUFFER) {
7149
+ stderrChunks.push(chunk);
7150
+ }
7151
+ });
7152
+ const timer = setTimeout(() => {
7153
+ child.kill("SIGTERM");
7154
+ reject(new Error(`Process timed out after ${options.timeout}ms`));
7155
+ }, options.timeout);
7156
+ child.on("error", (err) => {
7157
+ clearTimeout(timer);
7158
+ reject(err);
7159
+ });
7160
+ child.on("close", (code) => {
7161
+ clearTimeout(timer);
7162
+ const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
7163
+ const stderr = Buffer.concat(stderrChunks).toString("utf-8");
7164
+ if (code !== 0) {
7165
+ reject(
7166
+ new Error(
7167
+ `Command failed with exit code ${code ?? "null"}` + (stderr ? `
7168
+ stderr: ${stderr}` : "")
7169
+ )
7170
+ );
7171
+ return;
7172
+ }
7173
+ resolve2({ stdout, stderr });
7174
+ });
7175
+ });
7176
+ }
7177
+
6602
7178
  // src/evals/mcpHost/mcpHostSimulation.ts
6603
7179
  var vercelOrchestrator = createVercelOrchestrator();
6604
7180
  var allProviders = [
@@ -6616,6 +7192,25 @@ var simulatorRegistry = new Map(
6616
7192
  allProviders.map((p) => [p, vercelOrchestrator])
6617
7193
  );
6618
7194
  async function simulateMCPHost(mcp, scenario, config) {
7195
+ const hostType = config.hostType ?? "sdk";
7196
+ if (hostType === "cli") {
7197
+ if (!config.cli) {
7198
+ throw new Error(
7199
+ `mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
7200
+ );
7201
+ }
7202
+ return runCLIHost(config.cli, scenario);
7203
+ }
7204
+ if (hostType === "browser" || hostType === "desktop") {
7205
+ throw new Error(
7206
+ `Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
7207
+ );
7208
+ }
7209
+ if (!config.provider) {
7210
+ throw new Error(
7211
+ `mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
7212
+ );
7213
+ }
6619
7214
  const simulator = simulatorRegistry.get(config.provider);
6620
7215
  if (!simulator) {
6621
7216
  throw new Error(
@@ -6807,17 +7402,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6807
7402
  };
6808
7403
  }
6809
7404
  if (expectBlock.passesJudge !== void 0) {
6810
- const effectiveReps = expectBlock.passesJudge.reps ?? config.judgeReps ?? 1;
6811
- const effectiveReference = expectBlock.passesJudge.reference !== void 0 ? expectBlock.passesJudge.reference : config.canonicalAnswer;
6812
- const validation = await validateJudge(response, {
6813
- ...expectBlock.passesJudge,
6814
- reference: effectiveReference,
6815
- reps: effectiveReps
6816
- });
6817
- results.judge = {
6818
- pass: validation.pass,
6819
- details: validation.message
6820
- };
7405
+ const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
7406
+ const judgeResultEntries = await Promise.all(
7407
+ judgeConfigs.map(async (judgeConfig) => {
7408
+ const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
7409
+ const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
7410
+ const validation = await validateJudge(response, {
7411
+ ...judgeConfig,
7412
+ reference: effectiveReference,
7413
+ reps: effectiveReps
7414
+ });
7415
+ const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
7416
+ return {
7417
+ pass: validation.pass,
7418
+ details: validation.message,
7419
+ score: validation.details?.score,
7420
+ reasoning: validation.details?.reasoning,
7421
+ judgeName,
7422
+ judgeProvider: validation.details?.judgeProvider,
7423
+ judgeModel: validation.details?.judgeModel
7424
+ };
7425
+ })
7426
+ );
7427
+ if (judgeResultEntries.length === 1) {
7428
+ results.judge = judgeResultEntries[0];
7429
+ } else {
7430
+ const allPassed = judgeResultEntries.every((r) => r.pass);
7431
+ const passCount = judgeResultEntries.filter((r) => r.pass).length;
7432
+ results.judge = {
7433
+ pass: allPassed,
7434
+ details: `${passCount}/${judgeResultEntries.length} judges passed`,
7435
+ judgeResults: judgeResultEntries
7436
+ };
7437
+ }
6821
7438
  }
6822
7439
  if (expectBlock.snapshot !== void 0) {
6823
7440
  if (!config.playwrightExpect) {
@@ -6846,6 +7463,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6846
7463
  }
6847
7464
  return { expectations: results, toolPrecision, toolRecall };
6848
7465
  }
7466
+ function buildRequest(evalCase) {
7467
+ const request = {};
7468
+ if (evalCase.description) request.description = evalCase.description;
7469
+ if (evalCase.mode === "mcp_host") {
7470
+ if (evalCase.scenario) request.scenario = evalCase.scenario;
7471
+ if (evalCase.mcpHostConfig) {
7472
+ request.mcpHostConfig = {
7473
+ provider: evalCase.mcpHostConfig.provider,
7474
+ ...evalCase.mcpHostConfig.model !== void 0 && {
7475
+ model: evalCase.mcpHostConfig.model
7476
+ }
7477
+ };
7478
+ }
7479
+ } else {
7480
+ if (evalCase.args) request.args = evalCase.args;
7481
+ }
7482
+ return request;
7483
+ }
6849
7484
  function isMCPHostSimulationResult(value) {
6850
7485
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6851
7486
  }
@@ -6894,6 +7529,7 @@ async function runSingleIteration(evalCase, context, options) {
6894
7529
  toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6895
7530
  source: "eval",
6896
7531
  pass: didCasePass(error, expectationResults),
7532
+ request: buildRequest(evalCase),
6897
7533
  response,
6898
7534
  error,
6899
7535
  expectations: expectationResults,
@@ -6919,7 +7555,7 @@ function isInfrastructureError(err) {
6919
7555
  } else {
6920
7556
  return false;
6921
7557
  }
6922
- return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7558
+ return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
6923
7559
  msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
6924
7560
  }
6925
7561
  async function runEvalCase(evalCase, context, options = {}) {
@@ -7036,8 +7672,13 @@ async function runEvalDataset(options, context) {
7036
7672
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
7037
7673
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
7038
7674
  const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7039
- const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
7040
- return sum + effectiveIterations * judgeReps;
7675
+ if (c.expect?.passesJudge == null) return sum;
7676
+ const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
7677
+ const totalReps = judges.reduce(
7678
+ (r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
7679
+ 0
7680
+ );
7681
+ return sum + effectiveIterations * totalReps;
7041
7682
  }, 0);
7042
7683
  if (estimatedJudgeCalls > 50) {
7043
7684
  debugEval(
@@ -7385,6 +8026,6 @@ function formatCapabilities(capabilities) {
7385
8026
  return parts.length > 0 ? parts.join(", ") : "none declared";
7386
8027
  }
7387
8028
 
7388
- export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
8029
+ export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7389
8030
  //# sourceMappingURL=index.js.map
7390
8031
  //# sourceMappingURL=index.js.map