@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3306,7 +3306,11 @@ async function performOAuthSetup(config) {
3306
3306
  const page = await context.newPage();
3307
3307
  page.setDefaultTimeout(timeoutMs);
3308
3308
  await page.goto(authorizationUrl.toString());
3309
- await completeLoginForm(page, config);
3309
+ if ("customLoginFlow" in config && config.customLoginFlow) {
3310
+ await config.customLoginFlow(page);
3311
+ } else {
3312
+ await completeLoginForm(page, config);
3313
+ }
3310
3314
  await page.waitForURL(
3311
3315
  (url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
3312
3316
  { timeout: timeoutMs }
@@ -4407,7 +4411,7 @@ function escapeHtml(text) {
4407
4411
 
4408
4412
  // package.json
4409
4413
  var package_default = {
4410
- version: "1.0.0-beta.7"};
4414
+ version: "1.0.0"};
4411
4415
 
4412
4416
  // src/mcp/clientFactory.ts
4413
4417
  function getRetryAfterDelayMs(err) {
@@ -4626,6 +4630,17 @@ async function createMCPClientForConfig(config, options) {
4626
4630
  }
4627
4631
  async function closeMCPClient(client) {
4628
4632
  try {
4633
+ const transport = client.transport;
4634
+ if (transport instanceof streamableHttp_js.StreamableHTTPClientTransport) {
4635
+ try {
4636
+ await transport.terminateSession();
4637
+ } catch (sessionError) {
4638
+ debugClient(
4639
+ "Error terminating session: %s",
4640
+ sessionError instanceof Error ? sessionError.message : String(sessionError)
4641
+ );
4642
+ }
4643
+ }
4629
4644
  await client.close();
4630
4645
  } catch (error) {
4631
4646
  debugClient(
@@ -4854,11 +4869,13 @@ function validateSchema(response, schema, options = {}) {
4854
4869
  } catch (error) {
4855
4870
  const zodError = error;
4856
4871
  const issues = formatZodIssues(zodError);
4872
+ const text = stringifyResponse(response);
4857
4873
  return {
4858
4874
  pass: false,
4859
4875
  message: `Response does not match schema: ${issues}`,
4860
4876
  details: {
4861
- issues: zodError.issues
4877
+ issues: zodError.issues,
4878
+ textPreview: truncateForDisplay2(text)
4862
4879
  }
4863
4880
  };
4864
4881
  }
@@ -4911,6 +4928,12 @@ function formatZodIssues(error) {
4911
4928
  });
4912
4929
  return issues.join("; ");
4913
4930
  }
4931
+ function truncateForDisplay2(str, maxLength = 200) {
4932
+ if (str.length <= maxLength) {
4933
+ return str;
4934
+ }
4935
+ return str.slice(0, maxLength) + "... (truncated)";
4936
+ }
4914
4937
 
4915
4938
  // src/assertions/validators/text.ts
4916
4939
  function validateText(response, expected, options = {}) {
@@ -4937,11 +4960,11 @@ function validateText(response, expected, options = {}) {
4937
4960
  details: {
4938
4961
  missing,
4939
4962
  textLength: text.length,
4940
- textPreview: truncateForDisplay2(text)
4963
+ textPreview: truncateForDisplay3(text)
4941
4964
  }
4942
4965
  };
4943
4966
  }
4944
- function truncateForDisplay2(str, maxLength = 200) {
4967
+ function truncateForDisplay3(str, maxLength = 200) {
4945
4968
  if (str.length <= maxLength) {
4946
4969
  return str;
4947
4970
  }
@@ -4973,7 +4996,7 @@ function validatePattern(response, patterns, options = {}) {
4973
4996
  details: {
4974
4997
  unmatched,
4975
4998
  textLength: text.length,
4976
- textPreview: truncateForDisplay3(text)
4999
+ textPreview: truncateForDisplay4(text)
4977
5000
  }
4978
5001
  };
4979
5002
  }
@@ -4993,7 +5016,7 @@ function patternToString(pattern) {
4993
5016
  }
4994
5017
  return `/${pattern}/`;
4995
5018
  }
4996
- function truncateForDisplay3(str, maxLength = 200) {
5019
+ function truncateForDisplay4(str, maxLength = 200) {
4997
5020
  if (str.length <= maxLength) {
4998
5021
  return str;
4999
5022
  }
@@ -5016,7 +5039,7 @@ function validateError(response, expected = true) {
5016
5039
  pass: false,
5017
5040
  message: "Expected an error response but got success",
5018
5041
  details: {
5019
- textPreview: truncateForDisplay4(extractText2(response))
5042
+ textPreview: truncateForDisplay5(extractText2(response))
5020
5043
  }
5021
5044
  };
5022
5045
  } else {
@@ -5028,7 +5051,7 @@ function validateError(response, expected = true) {
5028
5051
  }
5029
5052
  return {
5030
5053
  pass: false,
5031
- message: `Expected a success response but got error: "${truncateForDisplay4(errorMessage)}"`,
5054
+ message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
5032
5055
  details: {
5033
5056
  errorMessage
5034
5057
  }
@@ -5041,7 +5064,7 @@ function validateError(response, expected = true) {
5041
5064
  pass: false,
5042
5065
  message: `Expected an error containing "${expectedMessages[0]}" but got success`,
5043
5066
  details: {
5044
- textPreview: truncateForDisplay4(extractText2(response))
5067
+ textPreview: truncateForDisplay5(extractText2(response))
5045
5068
  }
5046
5069
  };
5047
5070
  }
@@ -5063,7 +5086,7 @@ function validateError(response, expected = true) {
5063
5086
  }
5064
5087
  };
5065
5088
  }
5066
- function truncateForDisplay4(str, maxLength = 200) {
5089
+ function truncateForDisplay5(str, maxLength = 200) {
5067
5090
  if (str.length <= maxLength) {
5068
5091
  return str;
5069
5092
  }
@@ -5124,9 +5147,17 @@ function formatBytes(bytes) {
5124
5147
  function isSimulationResult(value) {
5125
5148
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
5126
5149
  }
5150
+ function isPatternMatcher(v) {
5151
+ return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
5152
+ }
5127
5153
  function partialMatch(actual, expected) {
5128
5154
  return Object.entries(expected).every(([k, v]) => {
5129
5155
  const actualVal = actual[k];
5156
+ if (isPatternMatcher(v)) {
5157
+ if (typeof actualVal !== "string") return false;
5158
+ const re = new RegExp(v.$pattern, v.$flags);
5159
+ return re.test(actualVal);
5160
+ }
5130
5161
  if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
5131
5162
  return partialMatch(
5132
5163
  actualVal,
@@ -5173,6 +5204,10 @@ function validateToolCalls(response, expectation) {
5173
5204
  return {
5174
5205
  pass: false,
5175
5206
  message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
5207
+ details: {
5208
+ actual: actual.map((c) => c.name),
5209
+ expected: expected.name
5210
+ },
5176
5211
  metrics
5177
5212
  };
5178
5213
  }
@@ -5189,6 +5224,10 @@ function validateToolCalls(response, expectation) {
5189
5224
  return {
5190
5225
  pass: false,
5191
5226
  message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
5227
+ details: {
5228
+ actual: actual.map((c) => c.name),
5229
+ expected: expected.name
5230
+ },
5192
5231
  metrics
5193
5232
  };
5194
5233
  }
@@ -5201,6 +5240,10 @@ function validateToolCalls(response, expectation) {
5201
5240
  return {
5202
5241
  pass: false,
5203
5242
  message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
5243
+ details: {
5244
+ actual: actual.map((c) => c.name),
5245
+ unexpected: unexpected.map((c) => c.name)
5246
+ },
5204
5247
  metrics
5205
5248
  };
5206
5249
  }
@@ -5219,19 +5262,22 @@ function validateToolCallCount(response, options) {
5219
5262
  if (exact !== void 0 && count !== exact) {
5220
5263
  return {
5221
5264
  pass: false,
5222
- message: `Expected exactly ${exact} tool call(s), but got ${count}`
5265
+ message: `Expected exactly ${exact} tool call(s), but got ${count}`,
5266
+ details: { actual: count, expected: exact }
5223
5267
  };
5224
5268
  }
5225
5269
  if (min !== void 0 && count < min) {
5226
5270
  return {
5227
5271
  pass: false,
5228
- message: `Expected at least ${min} tool call(s), but got ${count}`
5272
+ message: `Expected at least ${min} tool call(s), but got ${count}`,
5273
+ details: { actual: count, min }
5229
5274
  };
5230
5275
  }
5231
5276
  if (max !== void 0 && count > max) {
5232
5277
  return {
5233
5278
  pass: false,
5234
- message: `Expected at most ${max} tool call(s), but got ${count}`
5279
+ message: `Expected at most ${max} tool call(s), but got ${count}`,
5280
+ details: { actual: count, max }
5235
5281
  };
5236
5282
  }
5237
5283
  return {
@@ -5265,7 +5311,175 @@ var JudgeResponseSchema = zod.z.object({
5265
5311
  reasoning: zod.z.string()
5266
5312
  });
5267
5313
 
5268
- // src/judge/claudeAgentJudge.ts
5314
+ // src/judge/anthropicJudge.ts
5315
+ function createAnthropicJudge(config = {}) {
5316
+ const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
5317
+ const apiKey = process.env[apiKeyEnvVar];
5318
+ if (!apiKey) {
5319
+ throw new Error(
5320
+ `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
5321
+ );
5322
+ }
5323
+ const model = config.model ?? "claude-sonnet-4-20250514";
5324
+ const maxTokens = config.maxTokens ?? 1e3;
5325
+ const temperature = config.temperature ?? 0;
5326
+ return {
5327
+ async evaluate(candidate, reference, rubric) {
5328
+ let anthropicModule;
5329
+ try {
5330
+ anthropicModule = await import('@anthropic-ai/sdk');
5331
+ } catch (err) {
5332
+ throw new Error(
5333
+ `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
5334
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5335
+ );
5336
+ }
5337
+ const client = new anthropicModule.default({ apiKey });
5338
+ const prompt = buildJudgePrompt(candidate, reference, rubric);
5339
+ const startTime = Date.now();
5340
+ const response = await client.messages.create({
5341
+ model,
5342
+ max_tokens: maxTokens,
5343
+ temperature,
5344
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5345
+ messages: [{ role: "user", content: prompt }]
5346
+ });
5347
+ const durationMs = Date.now() - startTime;
5348
+ const textBlock = response.content.find(
5349
+ (b) => b.type === "text"
5350
+ );
5351
+ const text = textBlock?.text ?? "";
5352
+ const parsed = parseJudgeResponse(text);
5353
+ return {
5354
+ pass: parsed.pass,
5355
+ score: parsed.score,
5356
+ reasoning: parsed.reasoning,
5357
+ usage: {
5358
+ inputTokens: response.usage?.input_tokens ?? 0,
5359
+ outputTokens: response.usage?.output_tokens ?? 0,
5360
+ totalCostUsd: 0,
5361
+ durationMs
5362
+ }
5363
+ };
5364
+ }
5365
+ };
5366
+ }
5367
+ function buildJudgePrompt(candidate, reference, rubric) {
5368
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5369
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5370
+ return `Rubric:
5371
+ ${rubric}
5372
+
5373
+ <candidate_response>
5374
+ ${candidateStr}
5375
+ </candidate_response>
5376
+
5377
+ <reference_answer>
5378
+ ${referenceStr ?? "No reference provided."}
5379
+ </reference_answer>
5380
+
5381
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5382
+ }
5383
+ function parseJudgeResponse(text) {
5384
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5385
+ let parsed;
5386
+ try {
5387
+ parsed = JSON.parse(cleaned);
5388
+ } catch {
5389
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5390
+ }
5391
+ const result = JudgeResponseSchema.safeParse(parsed);
5392
+ if (!result.success) {
5393
+ throw new Error(
5394
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5395
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5396
+ );
5397
+ }
5398
+ return result.data;
5399
+ }
5400
+
5401
+ // src/judge/vertexAnthropicJudge.ts
5402
+ function createVertexAnthropicJudge(config = {}) {
5403
+ const model = config.model ?? "claude-sonnet-4-20250514";
5404
+ const maxTokens = config.maxTokens ?? 1e3;
5405
+ const temperature = config.temperature ?? 0;
5406
+ return {
5407
+ async evaluate(candidate, reference, rubric) {
5408
+ let vertexModule;
5409
+ try {
5410
+ vertexModule = await import('@anthropic-ai/vertex-sdk');
5411
+ } catch (err) {
5412
+ throw new Error(
5413
+ `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
5414
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5415
+ );
5416
+ }
5417
+ const client = new vertexModule.AnthropicVertex({
5418
+ projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
5419
+ region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
5420
+ });
5421
+ const prompt = buildJudgePrompt2(candidate, reference, rubric);
5422
+ const startTime = Date.now();
5423
+ const response = await client.messages.create({
5424
+ model,
5425
+ max_tokens: maxTokens,
5426
+ temperature,
5427
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5428
+ messages: [{ role: "user", content: prompt }]
5429
+ });
5430
+ const durationMs = Date.now() - startTime;
5431
+ const textBlock = response.content.find(
5432
+ (b) => b.type === "text"
5433
+ );
5434
+ const text = textBlock?.text ?? "";
5435
+ const parsed = parseJudgeResponse2(text);
5436
+ return {
5437
+ pass: parsed.pass,
5438
+ score: parsed.score,
5439
+ reasoning: parsed.reasoning,
5440
+ usage: {
5441
+ inputTokens: response.usage?.input_tokens ?? 0,
5442
+ outputTokens: response.usage?.output_tokens ?? 0,
5443
+ totalCostUsd: 0,
5444
+ durationMs
5445
+ }
5446
+ };
5447
+ }
5448
+ };
5449
+ }
5450
+ function buildJudgePrompt2(candidate, reference, rubric) {
5451
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5452
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5453
+ return `Rubric:
5454
+ ${rubric}
5455
+
5456
+ <candidate_response>
5457
+ ${candidateStr}
5458
+ </candidate_response>
5459
+
5460
+ <reference_answer>
5461
+ ${referenceStr ?? "No reference provided."}
5462
+ </reference_answer>
5463
+
5464
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5465
+ }
5466
+ function parseJudgeResponse2(text) {
5467
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5468
+ let parsed;
5469
+ try {
5470
+ parsed = JSON.parse(cleaned);
5471
+ } catch {
5472
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5473
+ }
5474
+ const result = JudgeResponseSchema.safeParse(parsed);
5475
+ if (!result.success) {
5476
+ throw new Error(
5477
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5478
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5479
+ );
5480
+ }
5481
+ return result.data;
5482
+ }
5269
5483
  function createClaudeAgentJudge(config) {
5270
5484
  const model = config.model ?? "claude-sonnet-4-20250514";
5271
5485
  const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
@@ -5283,7 +5497,7 @@ function createClaudeAgentJudge(config) {
5283
5497
  exceedsMaxToolOutputSize: true
5284
5498
  };
5285
5499
  }
5286
- const prompt = buildJudgePrompt(candidate, reference, rubric);
5500
+ const prompt = buildJudgePrompt3(candidate, reference, rubric);
5287
5501
  try {
5288
5502
  let resultMessage;
5289
5503
  for await (const message of claudeAgentSdk.query({
@@ -5315,7 +5529,7 @@ function createClaudeAgentJudge(config) {
5315
5529
  );
5316
5530
  }
5317
5531
  const responseText = resultMessage.result ?? "";
5318
- const parsed = parseJudgeResponse(responseText);
5532
+ const parsed = parseJudgeResponse3(responseText);
5319
5533
  const usage = {
5320
5534
  inputTokens: resultMessage.usage?.input_tokens ?? 0,
5321
5535
  outputTokens: resultMessage.usage?.output_tokens ?? 0,
@@ -5344,7 +5558,7 @@ function createClaudeAgentJudge(config) {
5344
5558
  function buildSystemPrompt() {
5345
5559
  return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
5346
5560
  }
5347
- function buildJudgePrompt(candidate, reference, rubric) {
5561
+ function buildJudgePrompt3(candidate, reference, rubric) {
5348
5562
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5349
5563
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5350
5564
  const parts = [];
@@ -5361,7 +5575,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
5361
5575
  );
5362
5576
  return parts.join("");
5363
5577
  }
5364
- function parseJudgeResponse(text) {
5578
+ function parseJudgeResponse3(text) {
5365
5579
  let jsonText = text.trim();
5366
5580
  if (jsonText.startsWith("```json")) {
5367
5581
  jsonText = jsonText.slice(7);
@@ -5418,7 +5632,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5418
5632
  );
5419
5633
  }
5420
5634
  const client = new openaiModule.default({ apiKey });
5421
- const prompt = buildJudgePrompt2(candidate, reference, rubric);
5635
+ const prompt = buildJudgePrompt4(candidate, reference, rubric);
5422
5636
  const startTime = Date.now();
5423
5637
  const completion = await client.chat.completions.create({
5424
5638
  model,
@@ -5434,7 +5648,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5434
5648
  });
5435
5649
  const durationMs = Date.now() - startTime;
5436
5650
  const text = completion.choices[0]?.message.content ?? "";
5437
- const parsed = parseJudgeResponse2(text);
5651
+ const parsed = parseJudgeResponse4(text);
5438
5652
  return {
5439
5653
  pass: parsed.pass,
5440
5654
  score: parsed.score,
@@ -5449,7 +5663,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5449
5663
  }
5450
5664
  };
5451
5665
  }
5452
- function buildJudgePrompt2(candidate, reference, rubric) {
5666
+ function buildJudgePrompt4(candidate, reference, rubric) {
5453
5667
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5454
5668
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5455
5669
  return `Rubric:
@@ -5465,7 +5679,7 @@ ${referenceStr ?? "No reference provided."}
5465
5679
 
5466
5680
  Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5467
5681
  }
5468
- function parseJudgeResponse2(text) {
5682
+ function parseJudgeResponse4(text) {
5469
5683
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5470
5684
  let parsed;
5471
5685
  try {
@@ -5567,14 +5781,48 @@ function createJudge(config = {}) {
5567
5781
  const provider = config.provider ?? "anthropic";
5568
5782
  switch (provider) {
5569
5783
  case "anthropic":
5784
+ return createAnthropicJudge(config);
5785
+ case "vertex-anthropic":
5786
+ return createVertexAnthropicJudge(config);
5787
+ case "anthropic-agent-sdk":
5570
5788
  return createClaudeAgentJudge(config);
5571
5789
  case "openai":
5572
5790
  return createOpenAIJudge(config);
5573
5791
  case "google":
5574
5792
  return createGoogleJudge(config);
5575
5793
  default:
5576
- throw new Error(`Unsupported LLM provider: ${String(provider)}`);
5794
+ throw new Error(
5795
+ `Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
5796
+ );
5797
+ }
5798
+ }
5799
+
5800
+ // src/judge/judgeRegistry.ts
5801
+ var registry = /* @__PURE__ */ new Map();
5802
+ function registerJudge(name15, executor) {
5803
+ const existing = registry.get(name15);
5804
+ if (existing !== void 0) {
5805
+ if (existing === executor) {
5806
+ return;
5807
+ }
5808
+ throw new Error(
5809
+ `Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
5810
+ );
5811
+ }
5812
+ registry.set(name15, executor);
5813
+ }
5814
+ function getRegisteredJudge(name15) {
5815
+ const executor = registry.get(name15);
5816
+ if (!executor) {
5817
+ const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
5818
+ throw new Error(
5819
+ `Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
5820
+ );
5577
5821
  }
5822
+ return executor;
5823
+ }
5824
+ function clearJudgeRegistry() {
5825
+ registry.clear();
5578
5826
  }
5579
5827
 
5580
5828
  // src/assertions/validators/judge.ts
@@ -5585,6 +5833,7 @@ function computeStdDev(scores, mean) {
5585
5833
  }
5586
5834
  async function validateJudge(response, config) {
5587
5835
  const {
5836
+ judge: judgeName,
5588
5837
  rubric,
5589
5838
  reference,
5590
5839
  threshold = 0.7,
@@ -5597,6 +5846,29 @@ async function validateJudge(response, config) {
5597
5846
  maxBudgetUsd,
5598
5847
  maxToolOutputSize
5599
5848
  } = config;
5849
+ if (judgeName !== void 0) {
5850
+ try {
5851
+ const executor = getRegisteredJudge(judgeName);
5852
+ const judgeResult = await executor(response, reference ?? void 0);
5853
+ const score = judgeResult.score;
5854
+ const passed = score >= threshold;
5855
+ return {
5856
+ pass: passed,
5857
+ message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
5858
+ };
5859
+ } catch (err) {
5860
+ return {
5861
+ pass: false,
5862
+ message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
5863
+ };
5864
+ }
5865
+ }
5866
+ if (rubric === void 0) {
5867
+ return {
5868
+ pass: false,
5869
+ message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
5870
+ };
5871
+ }
5600
5872
  const resolvedRubric = resolveRubric(rubric);
5601
5873
  const judgeConfig = {
5602
5874
  ...provider !== void 0 && { provider },
@@ -5643,11 +5915,17 @@ async function validateJudge(response, config) {
5643
5915
  return {
5644
5916
  pass: passed,
5645
5917
  message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
5646
- details: reps > 1 ? {
5647
- scores,
5648
- scoreStdDev: stdDev,
5649
- highVariance
5650
- } : void 0
5918
+ details: {
5919
+ score: meanScore,
5920
+ reasoning: lastReasoning,
5921
+ judgeProvider: provider ?? "anthropic",
5922
+ judgeModel: model,
5923
+ ...reps > 1 && {
5924
+ scores,
5925
+ scoreStdDev: stdDev,
5926
+ highVariance
5927
+ }
5928
+ }
5651
5929
  };
5652
5930
  } catch (err) {
5653
5931
  return {
@@ -5840,12 +6118,19 @@ function toMatchToolResponse(received, expected) {
5840
6118
  // src/assertions/matchers/toMatchToolSchema.ts
5841
6119
  function toMatchToolSchema(received, schema, options = {}) {
5842
6120
  const result = validateSchema(received, schema, options);
6121
+ const preview = result.details?.textPreview;
5843
6122
  return {
5844
6123
  pass: result.pass,
5845
6124
  message: () => {
5846
6125
  if (this.isNot) {
5847
6126
  return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
5848
6127
  }
6128
+ if (!result.pass && preview) {
6129
+ return `${result.message}
6130
+
6131
+ Actual response (truncated):
6132
+ ${preview}`;
6133
+ }
5849
6134
  return result.message;
5850
6135
  }
5851
6136
  };
@@ -5854,6 +6139,7 @@ function toMatchToolSchema(received, schema, options = {}) {
5854
6139
  // src/assertions/matchers/toContainToolText.ts
5855
6140
  function toContainToolText(received, expected, options = {}) {
5856
6141
  const result = validateText(received, expected, options);
6142
+ const preview = result.details?.textPreview;
5857
6143
  return {
5858
6144
  pass: result.pass,
5859
6145
  message: () => {
@@ -5861,6 +6147,12 @@ function toContainToolText(received, expected, options = {}) {
5861
6147
  const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
5862
6148
  return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
5863
6149
  }
6150
+ if (!result.pass && preview) {
6151
+ return `${result.message}
6152
+
6153
+ Actual response (truncated):
6154
+ ${preview}`;
6155
+ }
5864
6156
  return result.message;
5865
6157
  }
5866
6158
  };
@@ -5869,12 +6161,19 @@ function toContainToolText(received, expected, options = {}) {
5869
6161
  // src/assertions/matchers/toMatchToolPattern.ts
5870
6162
  function toMatchToolPattern(received, patterns, options = {}) {
5871
6163
  const result = validatePattern(received, patterns, options);
6164
+ const preview = result.details?.textPreview;
5872
6165
  return {
5873
6166
  pass: result.pass,
5874
6167
  message: () => {
5875
6168
  if (this.isNot) {
5876
6169
  return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
5877
6170
  }
6171
+ if (!result.pass && preview) {
6172
+ return `${result.message}
6173
+
6174
+ Actual response (truncated):
6175
+ ${preview}`;
6176
+ }
5878
6177
  return result.message;
5879
6178
  }
5880
6179
  };
@@ -6026,31 +6325,68 @@ function toBeToolError(received, expected = true) {
6026
6325
 
6027
6326
  // src/assertions/matchers/toPassToolJudge.ts
6028
6327
  var DEFAULT_PASSING_THRESHOLD = 0.7;
6029
- async function toPassToolJudge(received, rubric, options = {}) {
6328
+ async function runSingleJudge(received, rubric, options) {
6030
6329
  const {
6031
6330
  reference = null,
6032
6331
  passingThreshold = DEFAULT_PASSING_THRESHOLD,
6033
6332
  reps,
6034
6333
  provider,
6035
- model
6334
+ model,
6335
+ judge
6036
6336
  } = options;
6037
6337
  const validation = await validateJudge(received, {
6038
- rubric,
6338
+ ...rubric !== void 0 && { rubric },
6039
6339
  reference: reference ?? void 0,
6040
6340
  threshold: passingThreshold,
6041
6341
  ...reps !== void 0 && { reps },
6042
6342
  ...provider !== void 0 && { provider },
6043
- ...model !== void 0 && { model }
6343
+ ...model !== void 0 && { model },
6344
+ ...judge !== void 0 && { judge }
6044
6345
  });
6346
+ return { pass: validation.pass, message: validation.message };
6347
+ }
6348
+ async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
6349
+ if (Array.isArray(rubricOrOptions)) {
6350
+ const results = await Promise.all(
6351
+ rubricOrOptions.map(async (judgeConfig) => {
6352
+ const { rubric: r, ...opts } = judgeConfig;
6353
+ return runSingleJudge(received, r, opts);
6354
+ })
6355
+ );
6356
+ const allPassed = results.every((r) => r.pass);
6357
+ const passCount = results.filter((r) => r.pass).length;
6358
+ const summary = `${passCount}/${results.length} judges passed`;
6359
+ const details = results.map((r) => r.message).join("\n");
6360
+ if (this.isNot) {
6361
+ return {
6362
+ pass: !allPassed,
6363
+ message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
6364
+ };
6365
+ }
6366
+ return {
6367
+ pass: allPassed,
6368
+ message: () => `${summary}
6369
+ ${details}`
6370
+ };
6371
+ }
6372
+ let rubric;
6373
+ let options;
6374
+ if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
6375
+ rubric = rubricOrOptions;
6376
+ options = maybeOptions ?? {};
6377
+ } else {
6378
+ options = rubricOrOptions;
6379
+ }
6380
+ const result = await runSingleJudge(received, rubric, options);
6045
6381
  if (this.isNot) {
6046
6382
  return {
6047
- pass: !validation.pass,
6048
- message: () => validation.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6383
+ pass: !result.pass,
6384
+ message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6049
6385
  };
6050
6386
  }
6051
6387
  return {
6052
- pass: validation.pass,
6053
- message: () => validation.message
6388
+ pass: result.pass,
6389
+ message: () => result.message
6054
6390
  };
6055
6391
  }
6056
6392
 
@@ -6334,6 +6670,7 @@ function getAuthConfigFromEnv() {
6334
6670
  return void 0;
6335
6671
  }
6336
6672
  var MCPHostConfigSchema = zod.z.object({
6673
+ hostType: zod.z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
6337
6674
  provider: zod.z.enum([
6338
6675
  "openai",
6339
6676
  "anthropic",
@@ -6344,12 +6681,18 @@ var MCPHostConfigSchema = zod.z.object({
6344
6681
  "openrouter",
6345
6682
  "xai",
6346
6683
  "vertex-anthropic"
6347
- ]),
6684
+ ]).optional(),
6348
6685
  apiKeyEnvVar: zod.z.string().optional(),
6349
6686
  model: zod.z.string().optional(),
6350
6687
  maxTokens: zod.z.number().optional(),
6351
6688
  temperature: zod.z.number().optional(),
6352
- maxToolCalls: zod.z.number().optional()
6689
+ maxToolCalls: zod.z.number().optional(),
6690
+ cli: zod.z.object({
6691
+ command: zod.z.string(),
6692
+ args: zod.z.array(zod.z.string()),
6693
+ outputFormat: zod.z.enum(["stream-json", "json"]).optional(),
6694
+ timeout: zod.z.number().optional()
6695
+ }).optional()
6353
6696
  });
6354
6697
  var SnapshotSanitizerSchema = zod.z.union([
6355
6698
  // Built-in sanitizers
@@ -6364,6 +6707,37 @@ var SnapshotSanitizerSchema = zod.z.union([
6364
6707
  remove: zod.z.array(zod.z.string())
6365
6708
  })
6366
6709
  ]);
6710
+ var JudgeExpectConfigSchema = zod.z.object({
6711
+ judge: zod.z.string().min(1).optional(),
6712
+ rubric: zod.z.union([
6713
+ zod.z.enum([
6714
+ "correctness",
6715
+ "completeness",
6716
+ "groundedness",
6717
+ "instruction-following",
6718
+ "conciseness"
6719
+ ]),
6720
+ zod.z.object({ text: zod.z.string().min(1) })
6721
+ ]).optional(),
6722
+ reference: zod.z.unknown().optional(),
6723
+ threshold: zod.z.number().min(0).max(1).optional(),
6724
+ reps: zod.z.number().int().min(1).optional(),
6725
+ provider: zod.z.enum([
6726
+ "anthropic",
6727
+ "vertex-anthropic",
6728
+ "anthropic-agent-sdk",
6729
+ "openai",
6730
+ "google"
6731
+ ]).optional(),
6732
+ model: zod.z.string().optional(),
6733
+ apiKeyEnvVar: zod.z.string().optional(),
6734
+ maxTokens: zod.z.number().int().positive().optional(),
6735
+ temperature: zod.z.number().min(0).max(1).optional(),
6736
+ maxBudgetUsd: zod.z.number().positive().optional(),
6737
+ maxToolOutputSize: zod.z.number().int().positive().optional()
6738
+ }).refine((data) => data.judge !== void 0 || data.rubric !== void 0, {
6739
+ message: 'Either "judge" or "rubric" must be provided in passesJudge'
6740
+ });
6367
6741
  var EvalExpectBlockSchema = zod.z.object({
6368
6742
  response: zod.z.unknown().optional(),
6369
6743
  schema: zod.z.string().optional(),
@@ -6372,28 +6746,7 @@ var EvalExpectBlockSchema = zod.z.object({
6372
6746
  snapshot: zod.z.string().optional(),
6373
6747
  snapshotSanitizers: zod.z.array(SnapshotSanitizerSchema).optional(),
6374
6748
  isError: zod.z.union([zod.z.boolean(), zod.z.string(), zod.z.array(zod.z.string())]).optional(),
6375
- passesJudge: zod.z.object({
6376
- rubric: zod.z.union([
6377
- zod.z.enum([
6378
- "correctness",
6379
- "completeness",
6380
- "groundedness",
6381
- "instruction-following",
6382
- "conciseness"
6383
- ]),
6384
- zod.z.object({ text: zod.z.string().min(1) })
6385
- ]),
6386
- reference: zod.z.unknown().optional(),
6387
- threshold: zod.z.number().min(0).max(1).optional(),
6388
- reps: zod.z.number().int().min(1).optional(),
6389
- provider: zod.z.enum(["anthropic", "openai", "google"]).optional(),
6390
- model: zod.z.string().optional(),
6391
- apiKeyEnvVar: zod.z.string().optional(),
6392
- maxTokens: zod.z.number().int().positive().optional(),
6393
- temperature: zod.z.number().min(0).max(1).optional(),
6394
- maxBudgetUsd: zod.z.number().positive().optional(),
6395
- maxToolOutputSize: zod.z.number().int().positive().optional()
6396
- }).optional(),
6749
+ passesJudge: zod.z.union([JudgeExpectConfigSchema, zod.z.array(JudgeExpectConfigSchema).min(1)]).optional(),
6397
6750
  responseSize: zod.z.object({
6398
6751
  maxBytes: zod.z.number().optional(),
6399
6752
  minBytes: zod.z.number().optional()
@@ -6566,6 +6919,9 @@ function createVercelOrchestrator() {
6566
6919
  try {
6567
6920
  const { generateText, stepCountIs } = await import('ai');
6568
6921
  const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
6922
+ if (!config.provider) {
6923
+ throw new Error("provider is required for SDK host type");
6924
+ }
6569
6925
  const modelId = config.model ?? defaultModel(config.provider);
6570
6926
  const model = await loadModel(config.provider, modelId);
6571
6927
  const mcpTools = await mcp.listTools();
@@ -6619,13 +6975,233 @@ function createVercelOrchestrator() {
6619
6975
  return {
6620
6976
  success: false,
6621
6977
  toolCalls: [],
6622
- error: enrichErrorMessage(err, config.provider)
6978
+ error: enrichErrorMessage(err, config.provider ?? "unknown")
6623
6979
  };
6624
6980
  }
6625
6981
  }
6626
6982
  };
6627
6983
  }
6628
6984
 
6985
+ // src/evals/mcpHost/adapters/cli/parsers.ts
6986
/**
 * Parse newline-delimited JSON ("stream-json") CLI output into a simulation result.
 *
 * Each non-blank line is parsed on its own. Lines that are not valid JSON, or
 * whose JSON value is not an object (e.g. `null`, a number), are skipped.
 *
 * Recognised events:
 * - `type: "assistant"`: `tool_use` blocks become tool calls, `text` blocks are
 *   collected as response text.
 * - `type: "user"`: `tool_result` blocks are appended to the conversation
 *   history with role `"tool"`.
 * - `type: "result"`: with `is_error === true` parsing stops and a failure is
 *   returned; otherwise a string `result` is used as the response only when no
 *   assistant text has been collected so far.
 *
 * @param {string} stdout - Raw standard output of the CLI host.
 * @returns {{success: boolean, toolCalls: Array<{name: *, arguments: *, id: *}>,
 *   response?: string, conversationHistory?: Array<{role: string, content: string}>,
 *   error?: string}} Parsed simulation result.
 */
function parseStreamJson(stdout) {
  // Tool names of the form `mcp__<server>__<tool>` are reduced to `<tool>`.
  // The server part is matched lazily so that it may itself contain single
  // underscores (e.g. `mcp__my_server__search`).
  // NOTE(review): presumably this mirrors the MCP tool naming of the CLI host — confirm.
  const mcpToolName = /^mcp__.+?__(.+)$/;
  const isObject = (value) => value !== null && typeof value === "object";
  const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
  const toolCalls = [];
  const textParts = [];
  const conversationHistory = [];
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      continue;
    }
    // `JSON.parse("null")` succeeds; property access on it would throw.
    if (!isObject(event)) {
      continue;
    }
    // Only array content carries blocks; ignore strings and malformed entries.
    const content = event.message?.content;
    const blocks = Array.isArray(content) ? content.filter(isObject) : [];
    if (event.type === "assistant") {
      for (const block of blocks) {
        if (block.type === "tool_use" && block.name) {
          const rawName = block.name;
          const mcpMatch = mcpToolName.exec(rawName);
          toolCalls.push({
            name: mcpMatch ? mcpMatch[1] : rawName,
            arguments: block.input ?? {},
            id: block.id
          });
        }
        if (block.type === "text" && block.text) {
          textParts.push(block.text);
        }
      }
    }
    if (event.type === "user") {
      for (const block of blocks) {
        if (block.type === "tool_result") {
          const resultText = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
          conversationHistory.push({ role: "tool", content: resultText });
        }
      }
    }
    if (event.type === "result") {
      const hasTextResult = typeof event.result === "string";
      if (event.is_error === true) {
        // Tool calls seen before the error are still reported.
        return {
          success: false,
          toolCalls,
          error: hasTextResult ? event.result : "CLI host reported an error"
        };
      }
      // The final result text is only a fallback for missing assistant text.
      if (hasTextResult && textParts.length === 0) {
        textParts.push(event.result);
      }
    }
  }
  const response = textParts.join("");
  if (response) {
    conversationHistory.push({ role: "assistant", content: response });
  }
  return {
    success: true,
    toolCalls,
    response: response || void 0,
    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
  };
}
7046
/**
 * Build a parser for CLI hosts that print a single JSON document.
 *
 * @param {{toolCalls: string, response: string, success?: string}} paths -
 *   Dot-separated paths locating the tool-call array, the response text and
 *   (optionally) the success flag inside the parsed document.
 * @returns {(stdout: string) => {success: boolean,
 *   toolCalls: Array<{name: string, arguments: *}>, response?: string}}
 *   Parser that throws if `stdout` is not valid JSON.
 */
function createJsonParser(paths) {
  return (stdout) => {
    const data = JSON.parse(stdout);
    const rawToolCalls = getNestedValue(data, paths.toolCalls);
    // Optional chaining keeps a `null`/primitive entry from throwing; such an
    // entry is normalised like any other entry without a usable name.
    const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls.map((tc) => ({
      name: typeof tc?.name === "string" ? tc.name : "",
      arguments: tc?.arguments ?? tc?.args ?? {}
    })) : [];
    const response = getNestedValue(data, paths.response);
    // Without a configured success path the run is considered successful.
    const success = paths.success ? Boolean(getNestedValue(data, paths.success)) : true;
    return {
      success,
      toolCalls,
      response: typeof response === "string" ? response : void 0
    };
  };
}
7063
/**
 * Resolve a dot-separated path (e.g. `"result.items.0.name"`) inside a value.
 *
 * Only own properties are followed, so a path segment such as `toString` or
 * `constructor` cannot resolve to something inherited from a prototype.
 *
 * @param {*} obj - Root value to read from.
 * @param {string} path3 - Dot-separated property path.
 * @returns {*} The value at the path, or `undefined` when any segment is missing
 *   or an intermediate value is not an object.
 */
function getNestedValue(obj, path3) {
  return path3.split(".").reduce((current, key) => {
    if (current !== null && typeof current === "object" && Object.prototype.hasOwnProperty.call(current, key)) {
      return current[key];
    }
    return void 0;
  }, obj);
}
7071
+
7072
+ // src/evals/mcpHost/adapters/cli/runner.ts
7073
+ var DEFAULT_TIMEOUT = 12e4;
7074
+ var MAX_BUFFER = 10 * 1024 * 1024;
7075
/**
 * Select the stdout parser for a CLI output format.
 *
 * @param {string} [format] - `"stream-json"` or `"json"`; a nullish value is
 *   treated as `"stream-json"`.
 * @returns {Function|undefined} The matching parser, or `undefined` when the
 *   format is not one of the two known values.
 */
function getParser(format) {
  const effectiveFormat = format ?? "stream-json";
  if (effectiveFormat === "stream-json") {
    return parseStreamJson;
  }
  if (effectiveFormat === "json") {
    // Single-document JSON output is read from fixed top-level keys.
    const paths = {
      toolCalls: "toolCalls",
      response: "response",
      success: "success"
    };
    return createJsonParser(paths);
  }
}
7087
/**
 * Substitute every `{{scenario}}` placeholder in the CLI arguments.
 *
 * @param {string[]} args - Argument templates.
 * @param {string} scenario - Scenario text to insert.
 * @returns {string[]} New array with the placeholders replaced.
 */
function interpolateArgs(args, scenario) {
  // A replacer function inserts the scenario verbatim. A plain string
  // replacement would interpret sequences such as `$&`, `$1` or `$$` that
  // happen to occur in the scenario text.
  return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, () => scenario));
}
7090
/**
 * Run a CLI-based MCP host for one scenario and turn its stdout into a
 * simulation result.
 *
 * Failures never reject: spawn errors, timeouts, unsupported output formats,
 * parse errors and structurally invalid results are all reported as
 * `{ success: false, toolCalls: [], error }`.
 *
 * @param {{command: string, args: string[], outputFormat?: string, timeout?: number}} cliConfig -
 *   CLI host configuration; `timeout` is in milliseconds and falls back to `DEFAULT_TIMEOUT`.
 * @param {string} scenario - Text substituted for `{{scenario}}` in `cliConfig.args`.
 * @returns {Promise<object>} The parsed result, or a failure result as described above.
 */
async function runCLIHost(cliConfig, scenario) {
  const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
  const args = interpolateArgs(cliConfig.args, scenario);
  const startTime = Date.now();
  let stdout;
  try {
    const result2 = await spawnProcess(cliConfig.command, args, { timeout });
    stdout = result2.stdout;
  } catch (err) {
    const elapsed = Date.now() - startTime;
    const message = err instanceof Error ? err.message : String(err);
    // Classify on the first line only: later lines can carry the child's
    // stderr, which may mention "timed out" for unrelated reasons.
    const headline = message.split("\n", 1)[0];
    if (headline.includes("TIMEOUT") || headline.includes("timed out")) {
      return {
        success: false,
        toolCalls: [],
        error: `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
      };
    }
    return {
      success: false,
      toolCalls: [],
      error: `CLI host process failed: ${message}`
    };
  }
  const parse = getParser(cliConfig.outputFormat);
  if (typeof parse !== "function") {
    // Without this guard the failure would surface as "parse is not a function".
    return {
      success: false,
      toolCalls: [],
      error: `Unsupported CLI host output format: ${String(cliConfig.outputFormat)}. Supported formats: 'stream-json', 'json'.`
    };
  }
  let result;
  try {
    result = parse(stdout);
  } catch (err) {
    return {
      success: false,
      toolCalls: [],
      error: `Failed to parse CLI host output: ${err instanceof Error ? err.message : String(err)}\nstdout (first 500 chars): ${stdout.slice(0, 500)}`
    };
  }
  const validationError = validateSimulationResult(result);
  if (validationError) {
    return {
      success: false,
      toolCalls: [],
      error: `CLI host returned invalid result: ${validationError}`
    };
  }
  return result;
}
7136
/**
 * Check that a parsed CLI result has the minimal simulation-result shape:
 * a boolean `success` and a `toolCalls` array whose entries are objects with a
 * string `name` and an object `arguments`.
 *
 * @param {*} result - Value produced by an output parser.
 * @returns {string|null} A description of the first problem found, or `null`
 *   when the value is structurally valid.
 */
function validateSimulationResult(result) {
  // `typeof null` is "object", which makes for a misleading message.
  const describe = (value) => (value === null ? "null" : typeof value);
  const isObject = (value) => value !== null && typeof value === "object";
  if (!isObject(result)) {
    return `Expected object, got ${describe(result)}`;
  }
  const obj = result;
  if (typeof obj.success !== "boolean") {
    return `"success" must be a boolean, got ${describe(obj.success)}`;
  }
  if (!Array.isArray(obj.toolCalls)) {
    return `"toolCalls" must be an array, got ${describe(obj.toolCalls)}`;
  }
  for (let i = 0; i < obj.toolCalls.length; i++) {
    const tc = obj.toolCalls[i];
    // Guard first: reading `.name` on null/undefined would throw instead of
    // yielding a validation message.
    if (!isObject(tc)) {
      return `toolCalls[${i}] must be an object, got ${describe(tc)}`;
    }
    if (typeof tc.name !== "string") {
      return `toolCalls[${i}].name must be a string, got ${describe(tc.name)}`;
    }
    if (!isObject(tc.arguments)) {
      return `toolCalls[${i}].arguments must be an object, got ${describe(tc.arguments)}`;
    }
  }
  return null;
}
7158
/**
 * Spawn a command (no shell), close its stdin immediately and collect its
 * stdout and stderr.
 *
 * Each stream is buffered independently up to `MAX_BUFFER` bytes; once a stream
 * has produced more than that, its further chunks are dropped. Keeping separate
 * budgets means output on stderr cannot cause stdout to be discarded.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{timeout: number}} options - `timeout` in milliseconds; on expiry the
 *   child is sent SIGTERM and the promise rejects.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves on exit code 0.
 *   Rejects on spawn error, timeout, or non-zero/null exit code (the message
 *   then includes the collected stderr, if any).
 */
function spawnProcess(command, args, options) {
  return new Promise((resolve2, reject) => {
    const child = child_process.spawn(command, args, {
      stdio: ["pipe", "pipe", "pipe"]
    });
    child.stdin.end();
    // Collect one stream's chunks under its own byte budget.
    const collect = (stream) => {
      const chunks = [];
      let streamBytes = 0;
      stream.on("data", (chunk) => {
        streamBytes += chunk.length;
        if (streamBytes <= MAX_BUFFER) {
          chunks.push(chunk);
        }
      });
      return chunks;
    };
    const stdoutChunks = collect(child.stdout);
    const stderrChunks = collect(child.stderr);
    const timer = setTimeout(() => {
      child.kill("SIGTERM");
      reject(new Error(`Process timed out after ${options.timeout}ms`));
    }, options.timeout);
    child.on("error", (err) => {
      clearTimeout(timer);
      reject(err);
    });
    child.on("close", (code) => {
      clearTimeout(timer);
      const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
      const stderr = Buffer.concat(stderrChunks).toString("utf-8");
      if (code !== 0) {
        // After a timeout this is a no-op: the promise is already rejected.
        reject(
          new Error(
            `Command failed with exit code ${code ?? "null"}` + (stderr ? `\nstderr: ${stderr}` : "")
          )
        );
        return;
      }
      resolve2({ stdout, stderr });
    });
  });
}
7204
+
6629
7205
  // src/evals/mcpHost/mcpHostSimulation.ts
6630
7206
  var vercelOrchestrator = createVercelOrchestrator();
6631
7207
  var allProviders = [
@@ -6643,6 +7219,25 @@ var simulatorRegistry = new Map(
6643
7219
  allProviders.map((p) => [p, vercelOrchestrator])
6644
7220
  );
6645
7221
  async function simulateMCPHost(mcp, scenario, config) {
7222
+ const hostType = config.hostType ?? "sdk";
7223
+ if (hostType === "cli") {
7224
+ if (!config.cli) {
7225
+ throw new Error(
7226
+ `mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
7227
+ );
7228
+ }
7229
+ return runCLIHost(config.cli, scenario);
7230
+ }
7231
+ if (hostType === "browser" || hostType === "desktop") {
7232
+ throw new Error(
7233
+ `Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
7234
+ );
7235
+ }
7236
+ if (!config.provider) {
7237
+ throw new Error(
7238
+ `mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
7239
+ );
7240
+ }
6646
7241
  const simulator = simulatorRegistry.get(config.provider);
6647
7242
  if (!simulator) {
6648
7243
  throw new Error(
@@ -6834,17 +7429,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6834
7429
  };
6835
7430
  }
6836
7431
  if (expectBlock.passesJudge !== void 0) {
6837
- const effectiveReps = expectBlock.passesJudge.reps ?? config.judgeReps ?? 1;
6838
- const effectiveReference = expectBlock.passesJudge.reference !== void 0 ? expectBlock.passesJudge.reference : config.canonicalAnswer;
6839
- const validation = await validateJudge(response, {
6840
- ...expectBlock.passesJudge,
6841
- reference: effectiveReference,
6842
- reps: effectiveReps
6843
- });
6844
- results.judge = {
6845
- pass: validation.pass,
6846
- details: validation.message
6847
- };
7432
+ const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
7433
+ const judgeResultEntries = await Promise.all(
7434
+ judgeConfigs.map(async (judgeConfig) => {
7435
+ const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
7436
+ const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
7437
+ const validation = await validateJudge(response, {
7438
+ ...judgeConfig,
7439
+ reference: effectiveReference,
7440
+ reps: effectiveReps
7441
+ });
7442
+ const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
7443
+ return {
7444
+ pass: validation.pass,
7445
+ details: validation.message,
7446
+ score: validation.details?.score,
7447
+ reasoning: validation.details?.reasoning,
7448
+ judgeName,
7449
+ judgeProvider: validation.details?.judgeProvider,
7450
+ judgeModel: validation.details?.judgeModel
7451
+ };
7452
+ })
7453
+ );
7454
+ if (judgeResultEntries.length === 1) {
7455
+ results.judge = judgeResultEntries[0];
7456
+ } else {
7457
+ const allPassed = judgeResultEntries.every((r) => r.pass);
7458
+ const passCount = judgeResultEntries.filter((r) => r.pass).length;
7459
+ results.judge = {
7460
+ pass: allPassed,
7461
+ details: `${passCount}/${judgeResultEntries.length} judges passed`,
7462
+ judgeResults: judgeResultEntries
7463
+ };
7464
+ }
6848
7465
  }
6849
7466
  if (expectBlock.snapshot !== void 0) {
6850
7467
  if (!config.playwrightExpect) {
@@ -6873,6 +7490,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6873
7490
  }
6874
7491
  return { expectations: results, toolPrecision, toolRecall };
6875
7492
  }
7493
/**
 * Build the `request` summary recorded alongside an eval case result.
 *
 * Only populated fields are copied: `description` always; for `mcp_host` cases
 * the `scenario` and the host's `provider`/`model`; for all other cases `args`.
 *
 * @param {object} evalCase - Eval case being run.
 * @returns {object} Request summary containing only the fields that were set.
 */
function buildRequest(evalCase) {
  const request = {};
  if (evalCase.description) request.description = evalCase.description;
  if (evalCase.mode === "mcp_host") {
    if (evalCase.scenario) request.scenario = evalCase.scenario;
    const hostConfig = evalCase.mcpHostConfig;
    if (hostConfig) {
      // `provider` can be absent (e.g. for CLI hosts); leave the key out
      // entirely in that case, the same way `model` is handled.
      request.mcpHostConfig = {
        ...(hostConfig.provider !== void 0 && { provider: hostConfig.provider }),
        ...(hostConfig.model !== void 0 && { model: hostConfig.model })
      };
    }
  } else {
    if (evalCase.args) request.args = evalCase.args;
  }
  return request;
}
6876
7511
  function isMCPHostSimulationResult(value) {
6877
7512
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6878
7513
  }
@@ -6921,6 +7556,7 @@ async function runSingleIteration(evalCase, context, options) {
6921
7556
  toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6922
7557
  source: "eval",
6923
7558
  pass: didCasePass(error, expectationResults),
7559
+ request: buildRequest(evalCase),
6924
7560
  response,
6925
7561
  error,
6926
7562
  expectations: expectationResults,
@@ -6946,7 +7582,7 @@ function isInfrastructureError(err) {
6946
7582
  } else {
6947
7583
  return false;
6948
7584
  }
6949
- return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7585
+ return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
6950
7586
  msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
6951
7587
  }
6952
7588
  async function runEvalCase(evalCase, context, options = {}) {
@@ -7063,8 +7699,13 @@ async function runEvalDataset(options, context) {
7063
7699
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
7064
7700
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
7065
7701
  const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7066
- const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
7067
- return sum + effectiveIterations * judgeReps;
7702
+ if (c.expect?.passesJudge == null) return sum;
7703
+ const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
7704
+ const totalReps = judges.reduce(
7705
+ (r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
7706
+ 0
7707
+ );
7708
+ return sum + effectiveIterations * totalReps;
7068
7709
  }, 0);
7069
7710
  if (estimatedJudgeCalls > 50) {
7070
7711
  debugEval(
@@ -7421,6 +8062,7 @@ exports.EvalDatasetSchema = EvalDatasetSchema;
7421
8062
  exports.MCPConfigSchema = MCPConfigSchema;
7422
8063
  exports.MCP_PROTOCOL_VERSION = MCP_PROTOCOL_VERSION;
7423
8064
  exports.SnapshotSanitizers = SnapshotSanitizers;
8065
+ exports.clearJudgeRegistry = clearJudgeRegistry;
7424
8066
  exports.closeMCPClient = closeMCPClient;
7425
8067
  exports.createJudge = createJudge;
7426
8068
  exports.createMCPClientForConfig = createMCPClientForConfig;
@@ -7431,6 +8073,7 @@ exports.discoverProtectedResource = discoverProtectedResource;
7431
8073
  exports.expect = expect;
7432
8074
  exports.extractText = extractText;
7433
8075
  exports.getMissingDependencyMessage = getMissingDependencyMessage;
8076
+ exports.getRegisteredJudge = getRegisteredJudge;
7434
8077
  exports.getResponseSizeBytes = getResponseSizeBytes;
7435
8078
  exports.hasValidTokens = hasValidTokens;
7436
8079
  exports.injectTokens = injectTokens;
@@ -7451,6 +8094,8 @@ exports.normalizeWhitespace = normalizeWhitespace;
7451
8094
  exports.performClientCredentialsFlow = performClientCredentialsFlow;
7452
8095
  exports.performOAuthSetup = performOAuthSetup;
7453
8096
  exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
8097
+ exports.refreshAccessToken = refreshAccessToken;
8098
+ exports.registerJudge = registerJudge;
7454
8099
  exports.resolveRubric = resolveRubric;
7455
8100
  exports.runConformanceChecks = runConformanceChecks;
7456
8101
  exports.runEvalCase = runEvalCase;