@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3306,7 +3306,11 @@ async function performOAuthSetup(config) {
3306
3306
  const page = await context.newPage();
3307
3307
  page.setDefaultTimeout(timeoutMs);
3308
3308
  await page.goto(authorizationUrl.toString());
3309
- await completeLoginForm(page, config);
3309
+ if ("customLoginFlow" in config && config.customLoginFlow) {
3310
+ await config.customLoginFlow(page);
3311
+ } else {
3312
+ await completeLoginForm(page, config);
3313
+ }
3310
3314
  await page.waitForURL(
3311
3315
  (url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
3312
3316
  { timeout: timeoutMs }
@@ -4407,7 +4411,7 @@ function escapeHtml(text) {
4407
4411
 
4408
4412
  // package.json
4409
4413
  var package_default = {
4410
- version: "1.0.0-beta.6"};
4414
+ version: "1.0.0-beta.8"};
4411
4415
 
4412
4416
  // src/mcp/clientFactory.ts
4413
4417
  function getRetryAfterDelayMs(err) {
@@ -5124,9 +5128,17 @@ function formatBytes(bytes) {
5124
5128
  function isSimulationResult(value) {
5125
5129
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
5126
5130
  }
5131
+ function isPatternMatcher(v) {
5132
+ return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
5133
+ }
5127
5134
  function partialMatch(actual, expected) {
5128
5135
  return Object.entries(expected).every(([k, v]) => {
5129
5136
  const actualVal = actual[k];
5137
+ if (isPatternMatcher(v)) {
5138
+ if (typeof actualVal !== "string") return false;
5139
+ const re = new RegExp(v.$pattern, v.$flags);
5140
+ return re.test(actualVal);
5141
+ }
5130
5142
  if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
5131
5143
  return partialMatch(
5132
5144
  actualVal,
@@ -5265,7 +5277,175 @@ var JudgeResponseSchema = zod.z.object({
5265
5277
  reasoning: zod.z.string()
5266
5278
  });
5267
5279
 
5268
- // src/judge/claudeAgentJudge.ts
5280
+ // src/judge/anthropicJudge.ts
5281
+ function createAnthropicJudge(config = {}) {
5282
+ const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
5283
+ const apiKey = process.env[apiKeyEnvVar];
5284
+ if (!apiKey) {
5285
+ throw new Error(
5286
+ `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
5287
+ );
5288
+ }
5289
+ const model = config.model ?? "claude-sonnet-4-20250514";
5290
+ const maxTokens = config.maxTokens ?? 1e3;
5291
+ const temperature = config.temperature ?? 0;
5292
+ return {
5293
+ async evaluate(candidate, reference, rubric) {
5294
+ let anthropicModule;
5295
+ try {
5296
+ anthropicModule = await import('@anthropic-ai/sdk');
5297
+ } catch (err) {
5298
+ throw new Error(
5299
+ `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
5300
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5301
+ );
5302
+ }
5303
+ const client = new anthropicModule.default({ apiKey });
5304
+ const prompt = buildJudgePrompt(candidate, reference, rubric);
5305
+ const startTime = Date.now();
5306
+ const response = await client.messages.create({
5307
+ model,
5308
+ max_tokens: maxTokens,
5309
+ temperature,
5310
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5311
+ messages: [{ role: "user", content: prompt }]
5312
+ });
5313
+ const durationMs = Date.now() - startTime;
5314
+ const textBlock = response.content.find(
5315
+ (b) => b.type === "text"
5316
+ );
5317
+ const text = textBlock?.text ?? "";
5318
+ const parsed = parseJudgeResponse(text);
5319
+ return {
5320
+ pass: parsed.pass,
5321
+ score: parsed.score,
5322
+ reasoning: parsed.reasoning,
5323
+ usage: {
5324
+ inputTokens: response.usage?.input_tokens ?? 0,
5325
+ outputTokens: response.usage?.output_tokens ?? 0,
5326
+ totalCostUsd: 0,
5327
+ durationMs
5328
+ }
5329
+ };
5330
+ }
5331
+ };
5332
+ }
5333
+ function buildJudgePrompt(candidate, reference, rubric) {
5334
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5335
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5336
+ return `Rubric:
5337
+ ${rubric}
5338
+
5339
+ <candidate_response>
5340
+ ${candidateStr}
5341
+ </candidate_response>
5342
+
5343
+ <reference_answer>
5344
+ ${referenceStr ?? "No reference provided."}
5345
+ </reference_answer>
5346
+
5347
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5348
+ }
5349
+ function parseJudgeResponse(text) {
5350
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5351
+ let parsed;
5352
+ try {
5353
+ parsed = JSON.parse(cleaned);
5354
+ } catch {
5355
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5356
+ }
5357
+ const result = JudgeResponseSchema.safeParse(parsed);
5358
+ if (!result.success) {
5359
+ throw new Error(
5360
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5361
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5362
+ );
5363
+ }
5364
+ return result.data;
5365
+ }
5366
+
5367
+ // src/judge/vertexAnthropicJudge.ts
5368
+ function createVertexAnthropicJudge(config = {}) {
5369
+ const model = config.model ?? "claude-sonnet-4-20250514";
5370
+ const maxTokens = config.maxTokens ?? 1e3;
5371
+ const temperature = config.temperature ?? 0;
5372
+ return {
5373
+ async evaluate(candidate, reference, rubric) {
5374
+ let vertexModule;
5375
+ try {
5376
+ vertexModule = await import('@anthropic-ai/vertex-sdk');
5377
+ } catch (err) {
5378
+ throw new Error(
5379
+ `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
5380
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5381
+ );
5382
+ }
5383
+ const client = new vertexModule.AnthropicVertex({
5384
+ projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
5385
+ region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
5386
+ });
5387
+ const prompt = buildJudgePrompt2(candidate, reference, rubric);
5388
+ const startTime = Date.now();
5389
+ const response = await client.messages.create({
5390
+ model,
5391
+ max_tokens: maxTokens,
5392
+ temperature,
5393
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5394
+ messages: [{ role: "user", content: prompt }]
5395
+ });
5396
+ const durationMs = Date.now() - startTime;
5397
+ const textBlock = response.content.find(
5398
+ (b) => b.type === "text"
5399
+ );
5400
+ const text = textBlock?.text ?? "";
5401
+ const parsed = parseJudgeResponse2(text);
5402
+ return {
5403
+ pass: parsed.pass,
5404
+ score: parsed.score,
5405
+ reasoning: parsed.reasoning,
5406
+ usage: {
5407
+ inputTokens: response.usage?.input_tokens ?? 0,
5408
+ outputTokens: response.usage?.output_tokens ?? 0,
5409
+ totalCostUsd: 0,
5410
+ durationMs
5411
+ }
5412
+ };
5413
+ }
5414
+ };
5415
+ }
5416
+ function buildJudgePrompt2(candidate, reference, rubric) {
5417
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5418
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5419
+ return `Rubric:
5420
+ ${rubric}
5421
+
5422
+ <candidate_response>
5423
+ ${candidateStr}
5424
+ </candidate_response>
5425
+
5426
+ <reference_answer>
5427
+ ${referenceStr ?? "No reference provided."}
5428
+ </reference_answer>
5429
+
5430
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5431
+ }
5432
+ function parseJudgeResponse2(text) {
5433
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5434
+ let parsed;
5435
+ try {
5436
+ parsed = JSON.parse(cleaned);
5437
+ } catch {
5438
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5439
+ }
5440
+ const result = JudgeResponseSchema.safeParse(parsed);
5441
+ if (!result.success) {
5442
+ throw new Error(
5443
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5444
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5445
+ );
5446
+ }
5447
+ return result.data;
5448
+ }
5269
5449
  function createClaudeAgentJudge(config) {
5270
5450
  const model = config.model ?? "claude-sonnet-4-20250514";
5271
5451
  const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
@@ -5283,7 +5463,7 @@ function createClaudeAgentJudge(config) {
5283
5463
  exceedsMaxToolOutputSize: true
5284
5464
  };
5285
5465
  }
5286
- const prompt = buildJudgePrompt(candidate, reference, rubric);
5466
+ const prompt = buildJudgePrompt3(candidate, reference, rubric);
5287
5467
  try {
5288
5468
  let resultMessage;
5289
5469
  for await (const message of claudeAgentSdk.query({
@@ -5315,7 +5495,7 @@ function createClaudeAgentJudge(config) {
5315
5495
  );
5316
5496
  }
5317
5497
  const responseText = resultMessage.result ?? "";
5318
- const parsed = parseJudgeResponse(responseText);
5498
+ const parsed = parseJudgeResponse3(responseText);
5319
5499
  const usage = {
5320
5500
  inputTokens: resultMessage.usage?.input_tokens ?? 0,
5321
5501
  outputTokens: resultMessage.usage?.output_tokens ?? 0,
@@ -5344,7 +5524,7 @@ function createClaudeAgentJudge(config) {
5344
5524
  function buildSystemPrompt() {
5345
5525
  return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
5346
5526
  }
5347
- function buildJudgePrompt(candidate, reference, rubric) {
5527
+ function buildJudgePrompt3(candidate, reference, rubric) {
5348
5528
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5349
5529
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5350
5530
  const parts = [];
@@ -5361,7 +5541,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
5361
5541
  );
5362
5542
  return parts.join("");
5363
5543
  }
5364
- function parseJudgeResponse(text) {
5544
+ function parseJudgeResponse3(text) {
5365
5545
  let jsonText = text.trim();
5366
5546
  if (jsonText.startsWith("```json")) {
5367
5547
  jsonText = jsonText.slice(7);
@@ -5418,7 +5598,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5418
5598
  );
5419
5599
  }
5420
5600
  const client = new openaiModule.default({ apiKey });
5421
- const prompt = buildJudgePrompt2(candidate, reference, rubric);
5601
+ const prompt = buildJudgePrompt4(candidate, reference, rubric);
5422
5602
  const startTime = Date.now();
5423
5603
  const completion = await client.chat.completions.create({
5424
5604
  model,
@@ -5434,7 +5614,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5434
5614
  });
5435
5615
  const durationMs = Date.now() - startTime;
5436
5616
  const text = completion.choices[0]?.message.content ?? "";
5437
- const parsed = parseJudgeResponse2(text);
5617
+ const parsed = parseJudgeResponse4(text);
5438
5618
  return {
5439
5619
  pass: parsed.pass,
5440
5620
  score: parsed.score,
@@ -5449,7 +5629,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5449
5629
  }
5450
5630
  };
5451
5631
  }
5452
- function buildJudgePrompt2(candidate, reference, rubric) {
5632
+ function buildJudgePrompt4(candidate, reference, rubric) {
5453
5633
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5454
5634
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5455
5635
  return `Rubric:
@@ -5465,7 +5645,7 @@ ${referenceStr ?? "No reference provided."}
5465
5645
 
5466
5646
  Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5467
5647
  }
5468
- function parseJudgeResponse2(text) {
5648
+ function parseJudgeResponse4(text) {
5469
5649
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5470
5650
  let parsed;
5471
5651
  try {
@@ -5567,6 +5747,10 @@ function createJudge(config = {}) {
5567
5747
  const provider = config.provider ?? "anthropic";
5568
5748
  switch (provider) {
5569
5749
  case "anthropic":
5750
+ return createAnthropicJudge(config);
5751
+ case "vertex-anthropic":
5752
+ return createVertexAnthropicJudge(config);
5753
+ case "anthropic-agent-sdk":
5570
5754
  return createClaudeAgentJudge(config);
5571
5755
  case "openai":
5572
5756
  return createOpenAIJudge(config);
@@ -5577,6 +5761,34 @@ function createJudge(config = {}) {
5577
5761
  }
5578
5762
  }
5579
5763
 
5764
+ // src/judge/judgeRegistry.ts
5765
+ var registry = /* @__PURE__ */ new Map();
5766
+ function registerJudge(name15, executor) {
5767
+ const existing = registry.get(name15);
5768
+ if (existing !== void 0) {
5769
+ if (existing === executor) {
5770
+ return;
5771
+ }
5772
+ throw new Error(
5773
+ `Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
5774
+ );
5775
+ }
5776
+ registry.set(name15, executor);
5777
+ }
5778
+ function getRegisteredJudge(name15) {
5779
+ const executor = registry.get(name15);
5780
+ if (!executor) {
5781
+ const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
5782
+ throw new Error(
5783
+ `Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
5784
+ );
5785
+ }
5786
+ return executor;
5787
+ }
5788
+ function clearJudgeRegistry() {
5789
+ registry.clear();
5790
+ }
5791
+
5580
5792
  // src/assertions/validators/judge.ts
5581
5793
  function computeStdDev(scores, mean) {
5582
5794
  if (scores.length <= 1) return 0;
@@ -5585,6 +5797,7 @@ function computeStdDev(scores, mean) {
5585
5797
  }
5586
5798
  async function validateJudge(response, config) {
5587
5799
  const {
5800
+ judge: judgeName,
5588
5801
  rubric,
5589
5802
  reference,
5590
5803
  threshold = 0.7,
@@ -5597,6 +5810,29 @@ async function validateJudge(response, config) {
5597
5810
  maxBudgetUsd,
5598
5811
  maxToolOutputSize
5599
5812
  } = config;
5813
+ if (judgeName !== void 0) {
5814
+ try {
5815
+ const executor = getRegisteredJudge(judgeName);
5816
+ const judgeResult = await executor(response, reference ?? void 0);
5817
+ const score = judgeResult.score;
5818
+ const passed = score >= threshold;
5819
+ return {
5820
+ pass: passed,
5821
+ message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
5822
+ };
5823
+ } catch (err) {
5824
+ return {
5825
+ pass: false,
5826
+ message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
5827
+ };
5828
+ }
5829
+ }
5830
+ if (rubric === void 0) {
5831
+ return {
5832
+ pass: false,
5833
+ message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
5834
+ };
5835
+ }
5600
5836
  const resolvedRubric = resolveRubric(rubric);
5601
5837
  const judgeConfig = {
5602
5838
  ...provider !== void 0 && { provider },
@@ -5643,11 +5879,17 @@ async function validateJudge(response, config) {
5643
5879
  return {
5644
5880
  pass: passed,
5645
5881
  message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
5646
- details: reps > 1 ? {
5647
- scores,
5648
- scoreStdDev: stdDev,
5649
- highVariance
5650
- } : void 0
5882
+ details: {
5883
+ score: meanScore,
5884
+ reasoning: lastReasoning,
5885
+ judgeProvider: provider ?? "anthropic",
5886
+ judgeModel: model,
5887
+ ...reps > 1 && {
5888
+ scores,
5889
+ scoreStdDev: stdDev,
5890
+ highVariance
5891
+ }
5892
+ }
5651
5893
  };
5652
5894
  } catch (err) {
5653
5895
  return {
@@ -6026,31 +6268,68 @@ function toBeToolError(received, expected = true) {
6026
6268
 
6027
6269
  // src/assertions/matchers/toPassToolJudge.ts
6028
6270
  var DEFAULT_PASSING_THRESHOLD = 0.7;
6029
- async function toPassToolJudge(received, rubric, options = {}) {
6271
+ async function runSingleJudge(received, rubric, options) {
6030
6272
  const {
6031
6273
  reference = null,
6032
6274
  passingThreshold = DEFAULT_PASSING_THRESHOLD,
6033
6275
  reps,
6034
6276
  provider,
6035
- model
6277
+ model,
6278
+ judge
6036
6279
  } = options;
6037
6280
  const validation = await validateJudge(received, {
6038
- rubric,
6281
+ ...rubric !== void 0 && { rubric },
6039
6282
  reference: reference ?? void 0,
6040
6283
  threshold: passingThreshold,
6041
6284
  ...reps !== void 0 && { reps },
6042
6285
  ...provider !== void 0 && { provider },
6043
- ...model !== void 0 && { model }
6286
+ ...model !== void 0 && { model },
6287
+ ...judge !== void 0 && { judge }
6044
6288
  });
6289
+ return { pass: validation.pass, message: validation.message };
6290
+ }
6291
+ async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
6292
+ if (Array.isArray(rubricOrOptions)) {
6293
+ const results = await Promise.all(
6294
+ rubricOrOptions.map(async (judgeConfig) => {
6295
+ const { rubric: r, ...opts } = judgeConfig;
6296
+ return runSingleJudge(received, r, opts);
6297
+ })
6298
+ );
6299
+ const allPassed = results.every((r) => r.pass);
6300
+ const passCount = results.filter((r) => r.pass).length;
6301
+ const summary = `${passCount}/${results.length} judges passed`;
6302
+ const details = results.map((r) => r.message).join("\n");
6303
+ if (this.isNot) {
6304
+ return {
6305
+ pass: !allPassed,
6306
+ message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
6307
+ };
6308
+ }
6309
+ return {
6310
+ pass: allPassed,
6311
+ message: () => `${summary}
6312
+ ${details}`
6313
+ };
6314
+ }
6315
+ let rubric;
6316
+ let options;
6317
+ if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
6318
+ rubric = rubricOrOptions;
6319
+ options = maybeOptions ?? {};
6320
+ } else {
6321
+ options = rubricOrOptions;
6322
+ }
6323
+ const result = await runSingleJudge(received, rubric, options);
6045
6324
  if (this.isNot) {
6046
6325
  return {
6047
- pass: !validation.pass,
6048
- message: () => validation.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6326
+ pass: !result.pass,
6327
+ message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6049
6328
  };
6050
6329
  }
6051
6330
  return {
6052
- pass: validation.pass,
6053
- message: () => validation.message
6331
+ pass: result.pass,
6332
+ message: () => result.message
6054
6333
  };
6055
6334
  }
6056
6335
 
@@ -6334,6 +6613,7 @@ function getAuthConfigFromEnv() {
6334
6613
  return void 0;
6335
6614
  }
6336
6615
  var MCPHostConfigSchema = zod.z.object({
6616
+ hostType: zod.z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
6337
6617
  provider: zod.z.enum([
6338
6618
  "openai",
6339
6619
  "anthropic",
@@ -6344,12 +6624,18 @@ var MCPHostConfigSchema = zod.z.object({
6344
6624
  "openrouter",
6345
6625
  "xai",
6346
6626
  "vertex-anthropic"
6347
- ]),
6627
+ ]).optional(),
6348
6628
  apiKeyEnvVar: zod.z.string().optional(),
6349
6629
  model: zod.z.string().optional(),
6350
6630
  maxTokens: zod.z.number().optional(),
6351
6631
  temperature: zod.z.number().optional(),
6352
- maxToolCalls: zod.z.number().optional()
6632
+ maxToolCalls: zod.z.number().optional(),
6633
+ cli: zod.z.object({
6634
+ command: zod.z.string(),
6635
+ args: zod.z.array(zod.z.string()),
6636
+ outputFormat: zod.z.enum(["stream-json", "json"]).optional(),
6637
+ timeout: zod.z.number().optional()
6638
+ }).optional()
6353
6639
  });
6354
6640
  var SnapshotSanitizerSchema = zod.z.union([
6355
6641
  // Built-in sanitizers
@@ -6364,6 +6650,37 @@ var SnapshotSanitizerSchema = zod.z.union([
6364
6650
  remove: zod.z.array(zod.z.string())
6365
6651
  })
6366
6652
  ]);
6653
+ var JudgeExpectConfigSchema = zod.z.object({
6654
+ judge: zod.z.string().min(1).optional(),
6655
+ rubric: zod.z.union([
6656
+ zod.z.enum([
6657
+ "correctness",
6658
+ "completeness",
6659
+ "groundedness",
6660
+ "instruction-following",
6661
+ "conciseness"
6662
+ ]),
6663
+ zod.z.object({ text: zod.z.string().min(1) })
6664
+ ]).optional(),
6665
+ reference: zod.z.unknown().optional(),
6666
+ threshold: zod.z.number().min(0).max(1).optional(),
6667
+ reps: zod.z.number().int().min(1).optional(),
6668
+ provider: zod.z.enum([
6669
+ "anthropic",
6670
+ "vertex-anthropic",
6671
+ "anthropic-agent-sdk",
6672
+ "openai",
6673
+ "google"
6674
+ ]).optional(),
6675
+ model: zod.z.string().optional(),
6676
+ apiKeyEnvVar: zod.z.string().optional(),
6677
+ maxTokens: zod.z.number().int().positive().optional(),
6678
+ temperature: zod.z.number().min(0).max(1).optional(),
6679
+ maxBudgetUsd: zod.z.number().positive().optional(),
6680
+ maxToolOutputSize: zod.z.number().int().positive().optional()
6681
+ }).refine((data) => data.judge !== void 0 || data.rubric !== void 0, {
6682
+ message: 'Either "judge" or "rubric" must be provided in passesJudge'
6683
+ });
6367
6684
  var EvalExpectBlockSchema = zod.z.object({
6368
6685
  response: zod.z.unknown().optional(),
6369
6686
  schema: zod.z.string().optional(),
@@ -6372,28 +6689,7 @@ var EvalExpectBlockSchema = zod.z.object({
6372
6689
  snapshot: zod.z.string().optional(),
6373
6690
  snapshotSanitizers: zod.z.array(SnapshotSanitizerSchema).optional(),
6374
6691
  isError: zod.z.union([zod.z.boolean(), zod.z.string(), zod.z.array(zod.z.string())]).optional(),
6375
- passesJudge: zod.z.object({
6376
- rubric: zod.z.union([
6377
- zod.z.enum([
6378
- "correctness",
6379
- "completeness",
6380
- "groundedness",
6381
- "instruction-following",
6382
- "conciseness"
6383
- ]),
6384
- zod.z.object({ text: zod.z.string().min(1) })
6385
- ]),
6386
- reference: zod.z.unknown().optional(),
6387
- threshold: zod.z.number().min(0).max(1).optional(),
6388
- reps: zod.z.number().int().min(1).optional(),
6389
- provider: zod.z.enum(["anthropic", "openai", "google"]).optional(),
6390
- model: zod.z.string().optional(),
6391
- apiKeyEnvVar: zod.z.string().optional(),
6392
- maxTokens: zod.z.number().int().positive().optional(),
6393
- temperature: zod.z.number().min(0).max(1).optional(),
6394
- maxBudgetUsd: zod.z.number().positive().optional(),
6395
- maxToolOutputSize: zod.z.number().int().positive().optional()
6396
- }).optional(),
6692
+ passesJudge: zod.z.union([JudgeExpectConfigSchema, zod.z.array(JudgeExpectConfigSchema).min(1)]).optional(),
6397
6693
  responseSize: zod.z.object({
6398
6694
  maxBytes: zod.z.number().optional(),
6399
6695
  minBytes: zod.z.number().optional()
@@ -6566,6 +6862,9 @@ function createVercelOrchestrator() {
6566
6862
  try {
6567
6863
  const { generateText, stepCountIs } = await import('ai');
6568
6864
  const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
6865
+ if (!config.provider) {
6866
+ throw new Error("provider is required for SDK host type");
6867
+ }
6569
6868
  const modelId = config.model ?? defaultModel(config.provider);
6570
6869
  const model = await loadModel(config.provider, modelId);
6571
6870
  const mcpTools = await mcp.listTools();
@@ -6619,13 +6918,233 @@ function createVercelOrchestrator() {
6619
6918
  return {
6620
6919
  success: false,
6621
6920
  toolCalls: [],
6622
- error: enrichErrorMessage(err, config.provider)
6921
+ error: enrichErrorMessage(err, config.provider ?? "unknown")
6623
6922
  };
6624
6923
  }
6625
6924
  }
6626
6925
  };
6627
6926
  }
6628
6927
 
6928
+ // src/evals/mcpHost/adapters/cli/parsers.ts
6929
+ function parseStreamJson(stdout) {
6930
+ const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
6931
+ const toolCalls = [];
6932
+ const textParts = [];
6933
+ const conversationHistory = [];
6934
+ for (const line of lines) {
6935
+ let event;
6936
+ try {
6937
+ event = JSON.parse(line);
6938
+ } catch {
6939
+ continue;
6940
+ }
6941
+ if (event.type === "assistant" && event.message?.content) {
6942
+ for (const block of event.message.content) {
6943
+ if (block.type === "tool_use" && block.name) {
6944
+ const rawName = block.name;
6945
+ const mcpMatch = /^mcp__[^_]+__(.+)$/.exec(rawName);
6946
+ toolCalls.push({
6947
+ name: mcpMatch ? mcpMatch[1] : rawName,
6948
+ arguments: block.input ?? {},
6949
+ id: block.id
6950
+ });
6951
+ }
6952
+ if (block.type === "text" && block.text) {
6953
+ textParts.push(block.text);
6954
+ }
6955
+ }
6956
+ }
6957
+ if (event.type === "user" && event.message?.content) {
6958
+ for (const block of event.message.content) {
6959
+ if (block.type === "tool_result") {
6960
+ const content = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
6961
+ conversationHistory.push({ role: "tool", content });
6962
+ }
6963
+ }
6964
+ }
6965
+ if (event.type === "result" && typeof event.result === "string") {
6966
+ if (textParts.length === 0) {
6967
+ textParts.push(event.result);
6968
+ }
6969
+ }
6970
+ if (event.type === "result" && event.is_error === true) {
6971
+ return {
6972
+ success: false,
6973
+ toolCalls,
6974
+ error: typeof event.result === "string" ? event.result : "CLI host reported an error"
6975
+ };
6976
+ }
6977
+ }
6978
+ const response = textParts.join("");
6979
+ if (response) {
6980
+ conversationHistory.push({ role: "assistant", content: response });
6981
+ }
6982
+ return {
6983
+ success: true,
6984
+ toolCalls,
6985
+ response: response || void 0,
6986
+ conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
6987
+ };
6988
+ }
6989
+ function createJsonParser(paths) {
6990
+ return (stdout) => {
6991
+ const data = JSON.parse(stdout);
6992
+ const rawToolCalls = getNestedValue(data, paths.toolCalls);
6993
+ const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls.map((tc) => ({
6994
+ name: typeof tc.name === "string" ? tc.name : "",
6995
+ arguments: tc.arguments ?? tc.args ?? {}
6996
+ })) : [];
6997
+ const response = getNestedValue(data, paths.response);
6998
+ const success = paths.success ? Boolean(getNestedValue(data, paths.success)) : true;
6999
+ return {
7000
+ success,
7001
+ toolCalls,
7002
+ response: typeof response === "string" ? response : void 0
7003
+ };
7004
+ };
7005
+ }
7006
+ function getNestedValue(obj, path3) {
7007
+ return path3.split(".").reduce((current, key) => {
7008
+ if (current !== null && typeof current === "object") {
7009
+ return current[key];
7010
+ }
7011
+ return void 0;
7012
+ }, obj);
7013
+ }
7014
+
7015
+ // src/evals/mcpHost/adapters/cli/runner.ts
7016
+ var DEFAULT_TIMEOUT = 12e4;
7017
+ var MAX_BUFFER = 10 * 1024 * 1024;
7018
+ function getParser(format) {
7019
+ switch (format ?? "stream-json") {
7020
+ case "stream-json":
7021
+ return parseStreamJson;
7022
+ case "json":
7023
+ return createJsonParser({
7024
+ toolCalls: "toolCalls",
7025
+ response: "response",
7026
+ success: "success"
7027
+ });
7028
+ }
7029
+ }
7030
+ function interpolateArgs(args, scenario) {
7031
+ return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, scenario));
7032
+ }
7033
+ async function runCLIHost(cliConfig, scenario) {
7034
+ const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
7035
+ const args = interpolateArgs(cliConfig.args, scenario);
7036
+ const startTime = Date.now();
7037
+ let stdout;
7038
+ try {
7039
+ const result2 = await spawnProcess(cliConfig.command, args, { timeout });
7040
+ stdout = result2.stdout;
7041
+ } catch (err) {
7042
+ const elapsed = Date.now() - startTime;
7043
+ const message = err instanceof Error ? err.message : String(err);
7044
+ if (message.includes("TIMEOUT") || message.includes("timed out")) {
7045
+ return {
7046
+ success: false,
7047
+ toolCalls: [],
7048
+ error: `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
7049
+ };
7050
+ }
7051
+ return {
7052
+ success: false,
7053
+ toolCalls: [],
7054
+ error: `CLI host process failed: ${message}`
7055
+ };
7056
+ }
7057
+ const parse = getParser(cliConfig.outputFormat);
7058
+ let result;
7059
+ try {
7060
+ result = parse(stdout);
7061
+ } catch (err) {
7062
+ return {
7063
+ success: false,
7064
+ toolCalls: [],
7065
+ error: `Failed to parse CLI host output: ${err instanceof Error ? err.message : String(err)}
7066
+ stdout (first 500 chars): ${stdout.slice(0, 500)}`
7067
+ };
7068
+ }
7069
+ const validationError = validateSimulationResult(result);
7070
+ if (validationError) {
7071
+ return {
7072
+ success: false,
7073
+ toolCalls: [],
7074
+ error: `CLI host returned invalid result: ${validationError}`
7075
+ };
7076
+ }
7077
+ return result;
7078
+ }
7079
+ function validateSimulationResult(result) {
7080
+ if (result === null || typeof result !== "object") {
7081
+ return `Expected object, got ${typeof result}`;
7082
+ }
7083
+ const obj = result;
7084
+ if (typeof obj.success !== "boolean") {
7085
+ return `"success" must be a boolean, got ${typeof obj.success}`;
7086
+ }
7087
+ if (!Array.isArray(obj.toolCalls)) {
7088
+ return `"toolCalls" must be an array, got ${typeof obj.toolCalls}`;
7089
+ }
7090
+ for (let i = 0; i < obj.toolCalls.length; i++) {
7091
+ const tc = obj.toolCalls[i];
7092
+ if (typeof tc.name !== "string") {
7093
+ return `toolCalls[${i}].name must be a string, got ${typeof tc.name}`;
7094
+ }
7095
+ if (typeof tc.arguments !== "object" || tc.arguments === null) {
7096
+ return `toolCalls[${i}].arguments must be an object, got ${typeof tc.arguments}`;
7097
+ }
7098
+ }
7099
+ return null;
7100
+ }
7101
+ function spawnProcess(command, args, options) {
7102
+ return new Promise((resolve2, reject) => {
7103
+ const child = child_process.spawn(command, args, {
7104
+ stdio: ["pipe", "pipe", "pipe"]
7105
+ });
7106
+ child.stdin.end();
7107
+ const stdoutChunks = [];
7108
+ const stderrChunks = [];
7109
+ let totalBytes = 0;
7110
+ child.stdout.on("data", (chunk) => {
7111
+ totalBytes += chunk.length;
7112
+ if (totalBytes <= MAX_BUFFER) {
7113
+ stdoutChunks.push(chunk);
7114
+ }
7115
+ });
7116
+ child.stderr.on("data", (chunk) => {
7117
+ totalBytes += chunk.length;
7118
+ if (totalBytes <= MAX_BUFFER) {
7119
+ stderrChunks.push(chunk);
7120
+ }
7121
+ });
7122
+ const timer = setTimeout(() => {
7123
+ child.kill("SIGTERM");
7124
+ reject(new Error(`Process timed out after ${options.timeout}ms`));
7125
+ }, options.timeout);
7126
+ child.on("error", (err) => {
7127
+ clearTimeout(timer);
7128
+ reject(err);
7129
+ });
7130
+ child.on("close", (code) => {
7131
+ clearTimeout(timer);
7132
+ const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
7133
+ const stderr = Buffer.concat(stderrChunks).toString("utf-8");
7134
+ if (code !== 0) {
7135
+ reject(
7136
+ new Error(
7137
+ `Command failed with exit code ${code ?? "null"}` + (stderr ? `
7138
+ stderr: ${stderr}` : "")
7139
+ )
7140
+ );
7141
+ return;
7142
+ }
7143
+ resolve2({ stdout, stderr });
7144
+ });
7145
+ });
7146
+ }
7147
+
6629
7148
  // src/evals/mcpHost/mcpHostSimulation.ts
6630
7149
  var vercelOrchestrator = createVercelOrchestrator();
6631
7150
  var allProviders = [
@@ -6643,6 +7162,25 @@ var simulatorRegistry = new Map(
6643
7162
  allProviders.map((p) => [p, vercelOrchestrator])
6644
7163
  );
6645
7164
  async function simulateMCPHost(mcp, scenario, config) {
7165
+ const hostType = config.hostType ?? "sdk";
7166
+ if (hostType === "cli") {
7167
+ if (!config.cli) {
7168
+ throw new Error(
7169
+ `mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
7170
+ );
7171
+ }
7172
+ return runCLIHost(config.cli, scenario);
7173
+ }
7174
+ if (hostType === "browser" || hostType === "desktop") {
7175
+ throw new Error(
7176
+ `Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
7177
+ );
7178
+ }
7179
+ if (!config.provider) {
7180
+ throw new Error(
7181
+ `mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
7182
+ );
7183
+ }
6646
7184
  const simulator = simulatorRegistry.get(config.provider);
6647
7185
  if (!simulator) {
6648
7186
  throw new Error(
@@ -6834,17 +7372,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6834
7372
  };
6835
7373
  }
6836
7374
  if (expectBlock.passesJudge !== void 0) {
6837
- const effectiveReps = expectBlock.passesJudge.reps ?? config.judgeReps ?? 1;
6838
- const effectiveReference = expectBlock.passesJudge.reference !== void 0 ? expectBlock.passesJudge.reference : config.canonicalAnswer;
6839
- const validation = await validateJudge(response, {
6840
- ...expectBlock.passesJudge,
6841
- reference: effectiveReference,
6842
- reps: effectiveReps
6843
- });
6844
- results.judge = {
6845
- pass: validation.pass,
6846
- details: validation.message
6847
- };
7375
+ const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
7376
+ const judgeResultEntries = await Promise.all(
7377
+ judgeConfigs.map(async (judgeConfig) => {
7378
+ const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
7379
+ const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
7380
+ const validation = await validateJudge(response, {
7381
+ ...judgeConfig,
7382
+ reference: effectiveReference,
7383
+ reps: effectiveReps
7384
+ });
7385
+ const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
7386
+ return {
7387
+ pass: validation.pass,
7388
+ details: validation.message,
7389
+ score: validation.details?.score,
7390
+ reasoning: validation.details?.reasoning,
7391
+ judgeName,
7392
+ judgeProvider: validation.details?.judgeProvider,
7393
+ judgeModel: validation.details?.judgeModel
7394
+ };
7395
+ })
7396
+ );
7397
+ if (judgeResultEntries.length === 1) {
7398
+ results.judge = judgeResultEntries[0];
7399
+ } else {
7400
+ const allPassed = judgeResultEntries.every((r) => r.pass);
7401
+ const passCount = judgeResultEntries.filter((r) => r.pass).length;
7402
+ results.judge = {
7403
+ pass: allPassed,
7404
+ details: `${passCount}/${judgeResultEntries.length} judges passed`,
7405
+ judgeResults: judgeResultEntries
7406
+ };
7407
+ }
6848
7408
  }
6849
7409
  if (expectBlock.snapshot !== void 0) {
6850
7410
  if (!config.playwrightExpect) {
@@ -6873,6 +7433,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6873
7433
  }
6874
7434
  return { expectations: results, toolPrecision, toolRecall };
6875
7435
  }
7436
+ function buildRequest(evalCase) {
7437
+ const request = {};
7438
+ if (evalCase.description) request.description = evalCase.description;
7439
+ if (evalCase.mode === "mcp_host") {
7440
+ if (evalCase.scenario) request.scenario = evalCase.scenario;
7441
+ if (evalCase.mcpHostConfig) {
7442
+ request.mcpHostConfig = {
7443
+ provider: evalCase.mcpHostConfig.provider,
7444
+ ...evalCase.mcpHostConfig.model !== void 0 && {
7445
+ model: evalCase.mcpHostConfig.model
7446
+ }
7447
+ };
7448
+ }
7449
+ } else {
7450
+ if (evalCase.args) request.args = evalCase.args;
7451
+ }
7452
+ return request;
7453
+ }
6876
7454
  function isMCPHostSimulationResult(value) {
6877
7455
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6878
7456
  }
@@ -6921,6 +7499,7 @@ async function runSingleIteration(evalCase, context, options) {
6921
7499
  toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6922
7500
  source: "eval",
6923
7501
  pass: didCasePass(error, expectationResults),
7502
+ request: buildRequest(evalCase),
6924
7503
  response,
6925
7504
  error,
6926
7505
  expectations: expectationResults,
@@ -7063,8 +7642,13 @@ async function runEvalDataset(options, context) {
7063
7642
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
7064
7643
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
7065
7644
  const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7066
- const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
7067
- return sum + effectiveIterations * judgeReps;
7645
+ if (c.expect?.passesJudge == null) return sum;
7646
+ const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
7647
+ const totalReps = judges.reduce(
7648
+ (r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
7649
+ 0
7650
+ );
7651
+ return sum + effectiveIterations * totalReps;
7068
7652
  }, 0);
7069
7653
  if (estimatedJudgeCalls > 50) {
7070
7654
  debugEval(
@@ -7421,6 +8005,7 @@ exports.EvalDatasetSchema = EvalDatasetSchema;
7421
8005
  exports.MCPConfigSchema = MCPConfigSchema;
7422
8006
  exports.MCP_PROTOCOL_VERSION = MCP_PROTOCOL_VERSION;
7423
8007
  exports.SnapshotSanitizers = SnapshotSanitizers;
8008
+ exports.clearJudgeRegistry = clearJudgeRegistry;
7424
8009
  exports.closeMCPClient = closeMCPClient;
7425
8010
  exports.createJudge = createJudge;
7426
8011
  exports.createMCPClientForConfig = createMCPClientForConfig;
@@ -7431,6 +8016,7 @@ exports.discoverProtectedResource = discoverProtectedResource;
7431
8016
  exports.expect = expect;
7432
8017
  exports.extractText = extractText;
7433
8018
  exports.getMissingDependencyMessage = getMissingDependencyMessage;
8019
+ exports.getRegisteredJudge = getRegisteredJudge;
7434
8020
  exports.getResponseSizeBytes = getResponseSizeBytes;
7435
8021
  exports.hasValidTokens = hasValidTokens;
7436
8022
  exports.injectTokens = injectTokens;
@@ -7451,6 +8037,7 @@ exports.normalizeWhitespace = normalizeWhitespace;
7451
8037
  exports.performClientCredentialsFlow = performClientCredentialsFlow;
7452
8038
  exports.performOAuthSetup = performOAuthSetup;
7453
8039
  exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
8040
+ exports.registerJudge = registerJudge;
7454
8041
  exports.resolveRubric = resolveRubric;
7455
8042
  exports.runConformanceChecks = runConformanceChecks;
7456
8043
  exports.runEvalCase = runEvalCase;