@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -18,7 +18,7 @@ import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
18
18
  import { ProxyAgent, Agent } from 'undici';
19
19
  import { readFileSync } from 'fs';
20
20
  import { query } from '@anthropic-ai/claude-agent-sdk';
21
- import { execFile } from 'child_process';
21
+ import { execFile, spawn } from 'child_process';
22
22
  import { promisify } from 'util';
23
23
 
24
24
  var __defProp = Object.defineProperty;
@@ -3279,7 +3279,11 @@ async function performOAuthSetup(config) {
3279
3279
  const page = await context.newPage();
3280
3280
  page.setDefaultTimeout(timeoutMs);
3281
3281
  await page.goto(authorizationUrl.toString());
3282
- await completeLoginForm(page, config);
3282
+ if ("customLoginFlow" in config && config.customLoginFlow) {
3283
+ await config.customLoginFlow(page);
3284
+ } else {
3285
+ await completeLoginForm(page, config);
3286
+ }
3283
3287
  await page.waitForURL(
3284
3288
  (url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
3285
3289
  { timeout: timeoutMs }
@@ -4380,7 +4384,7 @@ function escapeHtml(text) {
4380
4384
 
4381
4385
  // package.json
4382
4386
  var package_default = {
4383
- version: "1.0.0-beta.6"};
4387
+ version: "1.0.0-beta.8"};
4384
4388
 
4385
4389
  // src/mcp/clientFactory.ts
4386
4390
  function getRetryAfterDelayMs(err) {
@@ -5097,9 +5101,17 @@ function formatBytes(bytes) {
5097
5101
  function isSimulationResult(value) {
5098
5102
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
5099
5103
  }
5104
+ function isPatternMatcher(v) {
5105
+ return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
5106
+ }
5100
5107
  function partialMatch(actual, expected) {
5101
5108
  return Object.entries(expected).every(([k, v]) => {
5102
5109
  const actualVal = actual[k];
5110
+ if (isPatternMatcher(v)) {
5111
+ if (typeof actualVal !== "string") return false;
5112
+ const re = new RegExp(v.$pattern, v.$flags);
5113
+ return re.test(actualVal);
5114
+ }
5103
5115
  if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
5104
5116
  return partialMatch(
5105
5117
  actualVal,
@@ -5238,7 +5250,175 @@ var JudgeResponseSchema = z.object({
5238
5250
  reasoning: z.string()
5239
5251
  });
5240
5252
 
5241
- // src/judge/claudeAgentJudge.ts
5253
+ // src/judge/anthropicJudge.ts
5254
+ function createAnthropicJudge(config = {}) {
5255
+ const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
5256
+ const apiKey = process.env[apiKeyEnvVar];
5257
+ if (!apiKey) {
5258
+ throw new Error(
5259
+ `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
5260
+ );
5261
+ }
5262
+ const model = config.model ?? "claude-sonnet-4-20250514";
5263
+ const maxTokens = config.maxTokens ?? 1e3;
5264
+ const temperature = config.temperature ?? 0;
5265
+ return {
5266
+ async evaluate(candidate, reference, rubric) {
5267
+ let anthropicModule;
5268
+ try {
5269
+ anthropicModule = await import('@anthropic-ai/sdk');
5270
+ } catch (err) {
5271
+ throw new Error(
5272
+ `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
5273
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5274
+ );
5275
+ }
5276
+ const client = new anthropicModule.default({ apiKey });
5277
+ const prompt = buildJudgePrompt(candidate, reference, rubric);
5278
+ const startTime = Date.now();
5279
+ const response = await client.messages.create({
5280
+ model,
5281
+ max_tokens: maxTokens,
5282
+ temperature,
5283
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5284
+ messages: [{ role: "user", content: prompt }]
5285
+ });
5286
+ const durationMs = Date.now() - startTime;
5287
+ const textBlock = response.content.find(
5288
+ (b) => b.type === "text"
5289
+ );
5290
+ const text = textBlock?.text ?? "";
5291
+ const parsed = parseJudgeResponse(text);
5292
+ return {
5293
+ pass: parsed.pass,
5294
+ score: parsed.score,
5295
+ reasoning: parsed.reasoning,
5296
+ usage: {
5297
+ inputTokens: response.usage?.input_tokens ?? 0,
5298
+ outputTokens: response.usage?.output_tokens ?? 0,
5299
+ totalCostUsd: 0,
5300
+ durationMs
5301
+ }
5302
+ };
5303
+ }
5304
+ };
5305
+ }
5306
+ function buildJudgePrompt(candidate, reference, rubric) {
5307
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5308
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5309
+ return `Rubric:
5310
+ ${rubric}
5311
+
5312
+ <candidate_response>
5313
+ ${candidateStr}
5314
+ </candidate_response>
5315
+
5316
+ <reference_answer>
5317
+ ${referenceStr ?? "No reference provided."}
5318
+ </reference_answer>
5319
+
5320
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5321
+ }
5322
+ function parseJudgeResponse(text) {
5323
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5324
+ let parsed;
5325
+ try {
5326
+ parsed = JSON.parse(cleaned);
5327
+ } catch {
5328
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5329
+ }
5330
+ const result = JudgeResponseSchema.safeParse(parsed);
5331
+ if (!result.success) {
5332
+ throw new Error(
5333
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5334
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5335
+ );
5336
+ }
5337
+ return result.data;
5338
+ }
5339
+
5340
+ // src/judge/vertexAnthropicJudge.ts
5341
+ function createVertexAnthropicJudge(config = {}) {
5342
+ const model = config.model ?? "claude-sonnet-4-20250514";
5343
+ const maxTokens = config.maxTokens ?? 1e3;
5344
+ const temperature = config.temperature ?? 0;
5345
+ return {
5346
+ async evaluate(candidate, reference, rubric) {
5347
+ let vertexModule;
5348
+ try {
5349
+ vertexModule = await import('@anthropic-ai/vertex-sdk');
5350
+ } catch (err) {
5351
+ throw new Error(
5352
+ `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
5353
+ Original error: ${err instanceof Error ? err.message : String(err)}`
5354
+ );
5355
+ }
5356
+ const client = new vertexModule.AnthropicVertex({
5357
+ projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
5358
+ region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
5359
+ });
5360
+ const prompt = buildJudgePrompt2(candidate, reference, rubric);
5361
+ const startTime = Date.now();
5362
+ const response = await client.messages.create({
5363
+ model,
5364
+ max_tokens: maxTokens,
5365
+ temperature,
5366
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
5367
+ messages: [{ role: "user", content: prompt }]
5368
+ });
5369
+ const durationMs = Date.now() - startTime;
5370
+ const textBlock = response.content.find(
5371
+ (b) => b.type === "text"
5372
+ );
5373
+ const text = textBlock?.text ?? "";
5374
+ const parsed = parseJudgeResponse2(text);
5375
+ return {
5376
+ pass: parsed.pass,
5377
+ score: parsed.score,
5378
+ reasoning: parsed.reasoning,
5379
+ usage: {
5380
+ inputTokens: response.usage?.input_tokens ?? 0,
5381
+ outputTokens: response.usage?.output_tokens ?? 0,
5382
+ totalCostUsd: 0,
5383
+ durationMs
5384
+ }
5385
+ };
5386
+ }
5387
+ };
5388
+ }
5389
+ function buildJudgePrompt2(candidate, reference, rubric) {
5390
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5391
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5392
+ return `Rubric:
5393
+ ${rubric}
5394
+
5395
+ <candidate_response>
5396
+ ${candidateStr}
5397
+ </candidate_response>
5398
+
5399
+ <reference_answer>
5400
+ ${referenceStr ?? "No reference provided."}
5401
+ </reference_answer>
5402
+
5403
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5404
+ }
5405
+ function parseJudgeResponse2(text) {
5406
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5407
+ let parsed;
5408
+ try {
5409
+ parsed = JSON.parse(cleaned);
5410
+ } catch {
5411
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
5412
+ }
5413
+ const result = JudgeResponseSchema.safeParse(parsed);
5414
+ if (!result.success) {
5415
+ throw new Error(
5416
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
5417
+ Validation errors: ${JSON.stringify(result.error.issues)}`
5418
+ );
5419
+ }
5420
+ return result.data;
5421
+ }
5242
5422
  function createClaudeAgentJudge(config) {
5243
5423
  const model = config.model ?? "claude-sonnet-4-20250514";
5244
5424
  const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
@@ -5256,7 +5436,7 @@ function createClaudeAgentJudge(config) {
5256
5436
  exceedsMaxToolOutputSize: true
5257
5437
  };
5258
5438
  }
5259
- const prompt = buildJudgePrompt(candidate, reference, rubric);
5439
+ const prompt = buildJudgePrompt3(candidate, reference, rubric);
5260
5440
  try {
5261
5441
  let resultMessage;
5262
5442
  for await (const message of query({
@@ -5288,7 +5468,7 @@ function createClaudeAgentJudge(config) {
5288
5468
  );
5289
5469
  }
5290
5470
  const responseText = resultMessage.result ?? "";
5291
- const parsed = parseJudgeResponse(responseText);
5471
+ const parsed = parseJudgeResponse3(responseText);
5292
5472
  const usage = {
5293
5473
  inputTokens: resultMessage.usage?.input_tokens ?? 0,
5294
5474
  outputTokens: resultMessage.usage?.output_tokens ?? 0,
@@ -5317,7 +5497,7 @@ function createClaudeAgentJudge(config) {
5317
5497
  function buildSystemPrompt() {
5318
5498
  return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
5319
5499
  }
5320
- function buildJudgePrompt(candidate, reference, rubric) {
5500
+ function buildJudgePrompt3(candidate, reference, rubric) {
5321
5501
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5322
5502
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5323
5503
  const parts = [];
@@ -5334,7 +5514,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
5334
5514
  );
5335
5515
  return parts.join("");
5336
5516
  }
5337
- function parseJudgeResponse(text) {
5517
+ function parseJudgeResponse3(text) {
5338
5518
  let jsonText = text.trim();
5339
5519
  if (jsonText.startsWith("```json")) {
5340
5520
  jsonText = jsonText.slice(7);
@@ -5391,7 +5571,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5391
5571
  );
5392
5572
  }
5393
5573
  const client = new openaiModule.default({ apiKey });
5394
- const prompt = buildJudgePrompt2(candidate, reference, rubric);
5574
+ const prompt = buildJudgePrompt4(candidate, reference, rubric);
5395
5575
  const startTime = Date.now();
5396
5576
  const completion = await client.chat.completions.create({
5397
5577
  model,
@@ -5407,7 +5587,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5407
5587
  });
5408
5588
  const durationMs = Date.now() - startTime;
5409
5589
  const text = completion.choices[0]?.message.content ?? "";
5410
- const parsed = parseJudgeResponse2(text);
5590
+ const parsed = parseJudgeResponse4(text);
5411
5591
  return {
5412
5592
  pass: parsed.pass,
5413
5593
  score: parsed.score,
@@ -5422,7 +5602,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
5422
5602
  }
5423
5603
  };
5424
5604
  }
5425
- function buildJudgePrompt2(candidate, reference, rubric) {
5605
+ function buildJudgePrompt4(candidate, reference, rubric) {
5426
5606
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
5427
5607
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
5428
5608
  return `Rubric:
@@ -5438,7 +5618,7 @@ ${referenceStr ?? "No reference provided."}
5438
5618
 
5439
5619
  Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
5440
5620
  }
5441
- function parseJudgeResponse2(text) {
5621
+ function parseJudgeResponse4(text) {
5442
5622
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
5443
5623
  let parsed;
5444
5624
  try {
@@ -5540,6 +5720,10 @@ function createJudge(config = {}) {
5540
5720
  const provider = config.provider ?? "anthropic";
5541
5721
  switch (provider) {
5542
5722
  case "anthropic":
5723
+ return createAnthropicJudge(config);
5724
+ case "vertex-anthropic":
5725
+ return createVertexAnthropicJudge(config);
5726
+ case "anthropic-agent-sdk":
5543
5727
  return createClaudeAgentJudge(config);
5544
5728
  case "openai":
5545
5729
  return createOpenAIJudge(config);
@@ -5550,6 +5734,34 @@ function createJudge(config = {}) {
5550
5734
  }
5551
5735
  }
5552
5736
 
5737
+ // src/judge/judgeRegistry.ts
5738
+ var registry = /* @__PURE__ */ new Map();
5739
+ function registerJudge(name15, executor) {
5740
+ const existing = registry.get(name15);
5741
+ if (existing !== void 0) {
5742
+ if (existing === executor) {
5743
+ return;
5744
+ }
5745
+ throw new Error(
5746
+ `Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
5747
+ );
5748
+ }
5749
+ registry.set(name15, executor);
5750
+ }
5751
+ function getRegisteredJudge(name15) {
5752
+ const executor = registry.get(name15);
5753
+ if (!executor) {
5754
+ const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
5755
+ throw new Error(
5756
+ `Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
5757
+ );
5758
+ }
5759
+ return executor;
5760
+ }
5761
+ function clearJudgeRegistry() {
5762
+ registry.clear();
5763
+ }
5764
+
5553
5765
  // src/assertions/validators/judge.ts
5554
5766
  function computeStdDev(scores, mean) {
5555
5767
  if (scores.length <= 1) return 0;
@@ -5558,6 +5770,7 @@ function computeStdDev(scores, mean) {
5558
5770
  }
5559
5771
  async function validateJudge(response, config) {
5560
5772
  const {
5773
+ judge: judgeName,
5561
5774
  rubric,
5562
5775
  reference,
5563
5776
  threshold = 0.7,
@@ -5570,6 +5783,29 @@ async function validateJudge(response, config) {
5570
5783
  maxBudgetUsd,
5571
5784
  maxToolOutputSize
5572
5785
  } = config;
5786
+ if (judgeName !== void 0) {
5787
+ try {
5788
+ const executor = getRegisteredJudge(judgeName);
5789
+ const judgeResult = await executor(response, reference ?? void 0);
5790
+ const score = judgeResult.score;
5791
+ const passed = score >= threshold;
5792
+ return {
5793
+ pass: passed,
5794
+ message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
5795
+ };
5796
+ } catch (err) {
5797
+ return {
5798
+ pass: false,
5799
+ message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
5800
+ };
5801
+ }
5802
+ }
5803
+ if (rubric === void 0) {
5804
+ return {
5805
+ pass: false,
5806
+ message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
5807
+ };
5808
+ }
5573
5809
  const resolvedRubric = resolveRubric(rubric);
5574
5810
  const judgeConfig = {
5575
5811
  ...provider !== void 0 && { provider },
@@ -5616,11 +5852,17 @@ async function validateJudge(response, config) {
5616
5852
  return {
5617
5853
  pass: passed,
5618
5854
  message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
5619
- details: reps > 1 ? {
5620
- scores,
5621
- scoreStdDev: stdDev,
5622
- highVariance
5623
- } : void 0
5855
+ details: {
5856
+ score: meanScore,
5857
+ reasoning: lastReasoning,
5858
+ judgeProvider: provider ?? "anthropic",
5859
+ judgeModel: model,
5860
+ ...reps > 1 && {
5861
+ scores,
5862
+ scoreStdDev: stdDev,
5863
+ highVariance
5864
+ }
5865
+ }
5624
5866
  };
5625
5867
  } catch (err) {
5626
5868
  return {
@@ -5999,31 +6241,68 @@ function toBeToolError(received, expected = true) {
5999
6241
 
6000
6242
  // src/assertions/matchers/toPassToolJudge.ts
6001
6243
  var DEFAULT_PASSING_THRESHOLD = 0.7;
6002
- async function toPassToolJudge(received, rubric, options = {}) {
6244
+ async function runSingleJudge(received, rubric, options) {
6003
6245
  const {
6004
6246
  reference = null,
6005
6247
  passingThreshold = DEFAULT_PASSING_THRESHOLD,
6006
6248
  reps,
6007
6249
  provider,
6008
- model
6250
+ model,
6251
+ judge
6009
6252
  } = options;
6010
6253
  const validation = await validateJudge(received, {
6011
- rubric,
6254
+ ...rubric !== void 0 && { rubric },
6012
6255
  reference: reference ?? void 0,
6013
6256
  threshold: passingThreshold,
6014
6257
  ...reps !== void 0 && { reps },
6015
6258
  ...provider !== void 0 && { provider },
6016
- ...model !== void 0 && { model }
6259
+ ...model !== void 0 && { model },
6260
+ ...judge !== void 0 && { judge }
6017
6261
  });
6262
+ return { pass: validation.pass, message: validation.message };
6263
+ }
6264
+ async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
6265
+ if (Array.isArray(rubricOrOptions)) {
6266
+ const results = await Promise.all(
6267
+ rubricOrOptions.map(async (judgeConfig) => {
6268
+ const { rubric: r, ...opts } = judgeConfig;
6269
+ return runSingleJudge(received, r, opts);
6270
+ })
6271
+ );
6272
+ const allPassed = results.every((r) => r.pass);
6273
+ const passCount = results.filter((r) => r.pass).length;
6274
+ const summary = `${passCount}/${results.length} judges passed`;
6275
+ const details = results.map((r) => r.message).join("\n");
6276
+ if (this.isNot) {
6277
+ return {
6278
+ pass: !allPassed,
6279
+ message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
6280
+ };
6281
+ }
6282
+ return {
6283
+ pass: allPassed,
6284
+ message: () => `${summary}
6285
+ ${details}`
6286
+ };
6287
+ }
6288
+ let rubric;
6289
+ let options;
6290
+ if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
6291
+ rubric = rubricOrOptions;
6292
+ options = maybeOptions ?? {};
6293
+ } else {
6294
+ options = rubricOrOptions;
6295
+ }
6296
+ const result = await runSingleJudge(received, rubric, options);
6018
6297
  if (this.isNot) {
6019
6298
  return {
6020
- pass: !validation.pass,
6021
- message: () => validation.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6299
+ pass: !result.pass,
6300
+ message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
6022
6301
  };
6023
6302
  }
6024
6303
  return {
6025
- pass: validation.pass,
6026
- message: () => validation.message
6304
+ pass: result.pass,
6305
+ message: () => result.message
6027
6306
  };
6028
6307
  }
6029
6308
 
@@ -6307,6 +6586,7 @@ function getAuthConfigFromEnv() {
6307
6586
  return void 0;
6308
6587
  }
6309
6588
  var MCPHostConfigSchema = z.object({
6589
+ hostType: z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
6310
6590
  provider: z.enum([
6311
6591
  "openai",
6312
6592
  "anthropic",
@@ -6317,12 +6597,18 @@ var MCPHostConfigSchema = z.object({
6317
6597
  "openrouter",
6318
6598
  "xai",
6319
6599
  "vertex-anthropic"
6320
- ]),
6600
+ ]).optional(),
6321
6601
  apiKeyEnvVar: z.string().optional(),
6322
6602
  model: z.string().optional(),
6323
6603
  maxTokens: z.number().optional(),
6324
6604
  temperature: z.number().optional(),
6325
- maxToolCalls: z.number().optional()
6605
+ maxToolCalls: z.number().optional(),
6606
+ cli: z.object({
6607
+ command: z.string(),
6608
+ args: z.array(z.string()),
6609
+ outputFormat: z.enum(["stream-json", "json"]).optional(),
6610
+ timeout: z.number().optional()
6611
+ }).optional()
6326
6612
  });
6327
6613
  var SnapshotSanitizerSchema = z.union([
6328
6614
  // Built-in sanitizers
@@ -6337,6 +6623,37 @@ var SnapshotSanitizerSchema = z.union([
6337
6623
  remove: z.array(z.string())
6338
6624
  })
6339
6625
  ]);
6626
+ var JudgeExpectConfigSchema = z.object({
6627
+ judge: z.string().min(1).optional(),
6628
+ rubric: z.union([
6629
+ z.enum([
6630
+ "correctness",
6631
+ "completeness",
6632
+ "groundedness",
6633
+ "instruction-following",
6634
+ "conciseness"
6635
+ ]),
6636
+ z.object({ text: z.string().min(1) })
6637
+ ]).optional(),
6638
+ reference: z.unknown().optional(),
6639
+ threshold: z.number().min(0).max(1).optional(),
6640
+ reps: z.number().int().min(1).optional(),
6641
+ provider: z.enum([
6642
+ "anthropic",
6643
+ "vertex-anthropic",
6644
+ "anthropic-agent-sdk",
6645
+ "openai",
6646
+ "google"
6647
+ ]).optional(),
6648
+ model: z.string().optional(),
6649
+ apiKeyEnvVar: z.string().optional(),
6650
+ maxTokens: z.number().int().positive().optional(),
6651
+ temperature: z.number().min(0).max(1).optional(),
6652
+ maxBudgetUsd: z.number().positive().optional(),
6653
+ maxToolOutputSize: z.number().int().positive().optional()
6654
+ }).refine((data) => data.judge !== void 0 || data.rubric !== void 0, {
6655
+ message: 'Either "judge" or "rubric" must be provided in passesJudge'
6656
+ });
6340
6657
  var EvalExpectBlockSchema = z.object({
6341
6658
  response: z.unknown().optional(),
6342
6659
  schema: z.string().optional(),
@@ -6345,28 +6662,7 @@ var EvalExpectBlockSchema = z.object({
6345
6662
  snapshot: z.string().optional(),
6346
6663
  snapshotSanitizers: z.array(SnapshotSanitizerSchema).optional(),
6347
6664
  isError: z.union([z.boolean(), z.string(), z.array(z.string())]).optional(),
6348
- passesJudge: z.object({
6349
- rubric: z.union([
6350
- z.enum([
6351
- "correctness",
6352
- "completeness",
6353
- "groundedness",
6354
- "instruction-following",
6355
- "conciseness"
6356
- ]),
6357
- z.object({ text: z.string().min(1) })
6358
- ]),
6359
- reference: z.unknown().optional(),
6360
- threshold: z.number().min(0).max(1).optional(),
6361
- reps: z.number().int().min(1).optional(),
6362
- provider: z.enum(["anthropic", "openai", "google"]).optional(),
6363
- model: z.string().optional(),
6364
- apiKeyEnvVar: z.string().optional(),
6365
- maxTokens: z.number().int().positive().optional(),
6366
- temperature: z.number().min(0).max(1).optional(),
6367
- maxBudgetUsd: z.number().positive().optional(),
6368
- maxToolOutputSize: z.number().int().positive().optional()
6369
- }).optional(),
6665
+ passesJudge: z.union([JudgeExpectConfigSchema, z.array(JudgeExpectConfigSchema).min(1)]).optional(),
6370
6666
  responseSize: z.object({
6371
6667
  maxBytes: z.number().optional(),
6372
6668
  minBytes: z.number().optional()
@@ -6539,6 +6835,9 @@ function createVercelOrchestrator() {
6539
6835
  try {
6540
6836
  const { generateText, stepCountIs } = await import('ai');
6541
6837
  const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
6838
+ if (!config.provider) {
6839
+ throw new Error("provider is required for SDK host type");
6840
+ }
6542
6841
  const modelId = config.model ?? defaultModel(config.provider);
6543
6842
  const model = await loadModel(config.provider, modelId);
6544
6843
  const mcpTools = await mcp.listTools();
@@ -6592,13 +6891,233 @@ function createVercelOrchestrator() {
6592
6891
  return {
6593
6892
  success: false,
6594
6893
  toolCalls: [],
6595
- error: enrichErrorMessage(err, config.provider)
6894
+ error: enrichErrorMessage(err, config.provider ?? "unknown")
6596
6895
  };
6597
6896
  }
6598
6897
  }
6599
6898
  };
6600
6899
  }
6601
6900
 
6901
+ // src/evals/mcpHost/adapters/cli/parsers.ts
6902
/**
 * Parse newline-delimited JSON ("stream-json") CLI output into a simulation result.
 *
 * Each non-blank line of `stdout` is parsed as one JSON event:
 *  - `assistant` events contribute `tool_use` blocks (collected as tool calls)
 *    and `text` blocks (concatenated into the response);
 *  - `user` events contribute `tool_result` blocks, recorded in the
 *    conversation history with role `"tool"`;
 *  - a `result` event supplies the response text when no assistant text was
 *    seen, and aborts with a failure result when `is_error` is `true`.
 *
 * Lines that are not valid JSON, or that parse to something other than an
 * object, are skipped so that stray log output does not abort parsing.
 *
 * @param {string} stdout - Raw standard output captured from the CLI host.
 * @returns {{success: boolean, toolCalls: Array<{name: string, arguments: object, id: *}>,
 *   response?: string, conversationHistory?: Array<{role: string, content: *}>, error?: string}}
 */
function parseStreamJson(stdout) {
  // Tool names are presumably emitted as `mcp__<server>__<tool>`. The server
  // part is matched lazily up to the first `__` so that server names which
  // themselves contain a single underscore (e.g. `my_server`) are still stripped.
  const mcpToolName = /^mcp__.+?__(.+)$/;

  // Returns the event's content blocks, ignoring non-array content and
  // null/primitive entries that would otherwise throw on property access.
  const contentBlocks = (event) => {
    const content = event.message?.content;
    if (!Array.isArray(content)) {
      return [];
    }
    return content.filter((block) => block !== null && typeof block === "object");
  };

  const toolCalls = [];
  const textParts = [];
  const conversationHistory = [];

  for (const line of stdout.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Deliberate best-effort: non-JSON lines (banners, logs) are ignored.
      continue;
    }
    if (event === null || typeof event !== "object") {
      // Valid JSON but not an event object (e.g. `null`, a number).
      continue;
    }

    if (event.type === "assistant") {
      for (const block of contentBlocks(event)) {
        if (block.type === "tool_use" && block.name) {
          const rawName = block.name;
          const mcpMatch = mcpToolName.exec(rawName);
          toolCalls.push({
            name: mcpMatch ? mcpMatch[1] : rawName,
            arguments: block.input ?? {},
            id: block.id
          });
        }
        if (block.type === "text" && block.text) {
          textParts.push(block.text);
        }
      }
    }

    if (event.type === "user") {
      for (const block of contentBlocks(event)) {
        if (block.type === "tool_result") {
          const content = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
          conversationHistory.push({ role: "tool", content });
        }
      }
    }

    if (event.type === "result") {
      // The final result text is only a fallback for runs with no assistant text.
      if (typeof event.result === "string" && textParts.length === 0) {
        textParts.push(event.result);
      }
      if (event.is_error === true) {
        return {
          success: false,
          toolCalls,
          error: typeof event.result === "string" ? event.result : "CLI host reported an error"
        };
      }
    }
  }

  const response = textParts.join("");
  if (response) {
    conversationHistory.push({ role: "assistant", content: response });
  }
  return {
    success: true,
    toolCalls,
    response: response || void 0,
    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
  };
}
6962
+ function createJsonParser(paths) {
6963
+ return (stdout) => {
6964
+ const data = JSON.parse(stdout);
6965
+ const rawToolCalls = getNestedValue(data, paths.toolCalls);
6966
+ const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls.map((tc) => ({
6967
+ name: typeof tc.name === "string" ? tc.name : "",
6968
+ arguments: tc.arguments ?? tc.args ?? {}
6969
+ })) : [];
6970
+ const response = getNestedValue(data, paths.response);
6971
+ const success = paths.success ? Boolean(getNestedValue(data, paths.success)) : true;
6972
+ return {
6973
+ success,
6974
+ toolCalls,
6975
+ response: typeof response === "string" ? response : void 0
6976
+ };
6977
+ };
6978
+ }
6979
+ function getNestedValue(obj, path3) {
6980
+ return path3.split(".").reduce((current, key) => {
6981
+ if (current !== null && typeof current === "object") {
6982
+ return current[key];
6983
+ }
6984
+ return void 0;
6985
+ }, obj);
6986
+ }
6987
+
6988
+ // src/evals/mcpHost/adapters/cli/runner.ts
6989
+ var DEFAULT_TIMEOUT = 12e4;
6990
+ var MAX_BUFFER = 10 * 1024 * 1024;
6991
+ function getParser(format) {
6992
+ switch (format ?? "stream-json") {
6993
+ case "stream-json":
6994
+ return parseStreamJson;
6995
+ case "json":
6996
+ return createJsonParser({
6997
+ toolCalls: "toolCalls",
6998
+ response: "response",
6999
+ success: "success"
7000
+ });
7001
+ }
7002
+ }
7003
/**
 * Substitute the scenario text into CLI arguments.
 *
 * Every occurrence of the literal placeholder `{{scenario}}` in each argument
 * is replaced with `scenario`; arguments without the placeholder are returned
 * unchanged. The input array is not modified.
 *
 * @param {string[]} args - Argument templates, possibly containing `{{scenario}}`.
 * @param {string} scenario - Text to insert in place of each placeholder.
 * @returns {string[]} A new array of interpolated arguments.
 */
function interpolateArgs(args, scenario) {
  // A replacer function is required here: when the replacement is passed as a
  // string, sequences such as `$&`, `$1`, `$$`, "$`" and `$'` inside the
  // scenario are interpreted as replacement patterns and corrupt the prompt.
  return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, () => scenario));
}
7006
+ async function runCLIHost(cliConfig, scenario) {
7007
+ const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
7008
+ const args = interpolateArgs(cliConfig.args, scenario);
7009
+ const startTime = Date.now();
7010
+ let stdout;
7011
+ try {
7012
+ const result2 = await spawnProcess(cliConfig.command, args, { timeout });
7013
+ stdout = result2.stdout;
7014
+ } catch (err) {
7015
+ const elapsed = Date.now() - startTime;
7016
+ const message = err instanceof Error ? err.message : String(err);
7017
+ if (message.includes("TIMEOUT") || message.includes("timed out")) {
7018
+ return {
7019
+ success: false,
7020
+ toolCalls: [],
7021
+ error: `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
7022
+ };
7023
+ }
7024
+ return {
7025
+ success: false,
7026
+ toolCalls: [],
7027
+ error: `CLI host process failed: ${message}`
7028
+ };
7029
+ }
7030
+ const parse = getParser(cliConfig.outputFormat);
7031
+ let result;
7032
+ try {
7033
+ result = parse(stdout);
7034
+ } catch (err) {
7035
+ return {
7036
+ success: false,
7037
+ toolCalls: [],
7038
+ error: `Failed to parse CLI host output: ${err instanceof Error ? err.message : String(err)}
7039
+ stdout (first 500 chars): ${stdout.slice(0, 500)}`
7040
+ };
7041
+ }
7042
+ const validationError = validateSimulationResult(result);
7043
+ if (validationError) {
7044
+ return {
7045
+ success: false,
7046
+ toolCalls: [],
7047
+ error: `CLI host returned invalid result: ${validationError}`
7048
+ };
7049
+ }
7050
+ return result;
7051
+ }
7052
/**
 * Check that a parsed CLI host result has the shape of a simulation result.
 *
 * The value must be a non-null object with a boolean `success` and an array
 * `toolCalls`, where every tool call is an object with a string `name` and a
 * non-null object `arguments`.
 *
 * @param {*} result - Value produced by an output parser.
 * @returns {string|null} A description of the first problem found, or `null`
 *   when the value is valid.
 */
function validateSimulationResult(result) {
  if (result === null || typeof result !== "object") {
    return `Expected object, got ${typeof result}`;
  }
  if (typeof result.success !== "boolean") {
    return `"success" must be a boolean, got ${typeof result.success}`;
  }
  if (!Array.isArray(result.toolCalls)) {
    return `"toolCalls" must be an array, got ${typeof result.toolCalls}`;
  }
  // `entries()` also visits holes in sparse arrays (as undefined), so every
  // index is checked.
  for (const [i, tc] of result.toolCalls.entries()) {
    // Guard before touching properties: a null/undefined/primitive entry must
    // yield a validation message rather than a TypeError.
    if (tc === null || typeof tc !== "object") {
      return `toolCalls[${i}] must be an object, got ${tc === null ? "null" : typeof tc}`;
    }
    if (typeof tc.name !== "string") {
      return `toolCalls[${i}].name must be a string, got ${typeof tc.name}`;
    }
    if (typeof tc.arguments !== "object" || tc.arguments === null) {
      return `toolCalls[${i}].arguments must be an object, got ${typeof tc.arguments}`;
    }
  }
  return null;
}
7074
+ function spawnProcess(command, args, options) {
7075
+ return new Promise((resolve2, reject) => {
7076
+ const child = spawn(command, args, {
7077
+ stdio: ["pipe", "pipe", "pipe"]
7078
+ });
7079
+ child.stdin.end();
7080
+ const stdoutChunks = [];
7081
+ const stderrChunks = [];
7082
+ let totalBytes = 0;
7083
+ child.stdout.on("data", (chunk) => {
7084
+ totalBytes += chunk.length;
7085
+ if (totalBytes <= MAX_BUFFER) {
7086
+ stdoutChunks.push(chunk);
7087
+ }
7088
+ });
7089
+ child.stderr.on("data", (chunk) => {
7090
+ totalBytes += chunk.length;
7091
+ if (totalBytes <= MAX_BUFFER) {
7092
+ stderrChunks.push(chunk);
7093
+ }
7094
+ });
7095
+ const timer = setTimeout(() => {
7096
+ child.kill("SIGTERM");
7097
+ reject(new Error(`Process timed out after ${options.timeout}ms`));
7098
+ }, options.timeout);
7099
+ child.on("error", (err) => {
7100
+ clearTimeout(timer);
7101
+ reject(err);
7102
+ });
7103
+ child.on("close", (code) => {
7104
+ clearTimeout(timer);
7105
+ const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
7106
+ const stderr = Buffer.concat(stderrChunks).toString("utf-8");
7107
+ if (code !== 0) {
7108
+ reject(
7109
+ new Error(
7110
+ `Command failed with exit code ${code ?? "null"}` + (stderr ? `
7111
+ stderr: ${stderr}` : "")
7112
+ )
7113
+ );
7114
+ return;
7115
+ }
7116
+ resolve2({ stdout, stderr });
7117
+ });
7118
+ });
7119
+ }
7120
+
6602
7121
  // src/evals/mcpHost/mcpHostSimulation.ts
6603
7122
  var vercelOrchestrator = createVercelOrchestrator();
6604
7123
  var allProviders = [
@@ -6616,6 +7135,25 @@ var simulatorRegistry = new Map(
6616
7135
  allProviders.map((p) => [p, vercelOrchestrator])
6617
7136
  );
6618
7137
  async function simulateMCPHost(mcp, scenario, config) {
7138
+ const hostType = config.hostType ?? "sdk";
7139
+ if (hostType === "cli") {
7140
+ if (!config.cli) {
7141
+ throw new Error(
7142
+ `mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
7143
+ );
7144
+ }
7145
+ return runCLIHost(config.cli, scenario);
7146
+ }
7147
+ if (hostType === "browser" || hostType === "desktop") {
7148
+ throw new Error(
7149
+ `Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
7150
+ );
7151
+ }
7152
+ if (!config.provider) {
7153
+ throw new Error(
7154
+ `mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
7155
+ );
7156
+ }
6619
7157
  const simulator = simulatorRegistry.get(config.provider);
6620
7158
  if (!simulator) {
6621
7159
  throw new Error(
@@ -6807,17 +7345,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6807
7345
  };
6808
7346
  }
6809
7347
  if (expectBlock.passesJudge !== void 0) {
6810
- const effectiveReps = expectBlock.passesJudge.reps ?? config.judgeReps ?? 1;
6811
- const effectiveReference = expectBlock.passesJudge.reference !== void 0 ? expectBlock.passesJudge.reference : config.canonicalAnswer;
6812
- const validation = await validateJudge(response, {
6813
- ...expectBlock.passesJudge,
6814
- reference: effectiveReference,
6815
- reps: effectiveReps
6816
- });
6817
- results.judge = {
6818
- pass: validation.pass,
6819
- details: validation.message
6820
- };
7348
+ const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
7349
+ const judgeResultEntries = await Promise.all(
7350
+ judgeConfigs.map(async (judgeConfig) => {
7351
+ const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
7352
+ const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
7353
+ const validation = await validateJudge(response, {
7354
+ ...judgeConfig,
7355
+ reference: effectiveReference,
7356
+ reps: effectiveReps
7357
+ });
7358
+ const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
7359
+ return {
7360
+ pass: validation.pass,
7361
+ details: validation.message,
7362
+ score: validation.details?.score,
7363
+ reasoning: validation.details?.reasoning,
7364
+ judgeName,
7365
+ judgeProvider: validation.details?.judgeProvider,
7366
+ judgeModel: validation.details?.judgeModel
7367
+ };
7368
+ })
7369
+ );
7370
+ if (judgeResultEntries.length === 1) {
7371
+ results.judge = judgeResultEntries[0];
7372
+ } else {
7373
+ const allPassed = judgeResultEntries.every((r) => r.pass);
7374
+ const passCount = judgeResultEntries.filter((r) => r.pass).length;
7375
+ results.judge = {
7376
+ pass: allPassed,
7377
+ details: `${passCount}/${judgeResultEntries.length} judges passed`,
7378
+ judgeResults: judgeResultEntries
7379
+ };
7380
+ }
6821
7381
  }
6822
7382
  if (expectBlock.snapshot !== void 0) {
6823
7383
  if (!config.playwrightExpect) {
@@ -6846,6 +7406,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6846
7406
  }
6847
7407
  return { expectations: results, toolPrecision, toolRecall };
6848
7408
  }
7409
/**
 * Builds the `request` summary recorded alongside an eval case result.
 *
 * Only truthy fields are copied. For `mode === "mcp_host"` cases the summary
 * carries the scenario and a reduced `mcpHostConfig` (provider and model only);
 * for every other mode it carries the tool `args`.
 *
 * `provider` and `model` are each included only when defined, so a host config
 * without a provider (e.g. a CLI host) yields no explicit `provider: undefined`
 * key.
 *
 * @param {object} evalCase - Eval case; reads `description`, `mode`,
 *   `scenario`, `mcpHostConfig` and `args`.
 * @returns {object} A new plain object; `evalCase` is not modified.
 */
function buildRequest(evalCase) {
  const request = {};
  if (evalCase.description) request.description = evalCase.description;
  if (evalCase.mode === "mcp_host") {
    if (evalCase.scenario) request.scenario = evalCase.scenario;
    if (evalCase.mcpHostConfig) {
      const { provider, model } = evalCase.mcpHostConfig;
      request.mcpHostConfig = {
        ...provider !== void 0 && { provider },
        ...model !== void 0 && { model }
      };
    }
  } else if (evalCase.args) {
    request.args = evalCase.args;
  }
  return request;
}
6849
7427
  function isMCPHostSimulationResult(value) {
6850
7428
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6851
7429
  }
@@ -6894,6 +7472,7 @@ async function runSingleIteration(evalCase, context, options) {
6894
7472
  toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6895
7473
  source: "eval",
6896
7474
  pass: didCasePass(error, expectationResults),
7475
+ request: buildRequest(evalCase),
6897
7476
  response,
6898
7477
  error,
6899
7478
  expectations: expectationResults,
@@ -7036,8 +7615,13 @@ async function runEvalDataset(options, context) {
7036
7615
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
7037
7616
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
7038
7617
  const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7039
- const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
7040
- return sum + effectiveIterations * judgeReps;
7618
+ if (c.expect?.passesJudge == null) return sum;
7619
+ const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
7620
+ const totalReps = judges.reduce(
7621
+ (r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
7622
+ 0
7623
+ );
7624
+ return sum + effectiveIterations * totalReps;
7041
7625
  }, 0);
7042
7626
  if (estimatedJudgeCalls > 50) {
7043
7627
  debugEval(
@@ -7385,6 +7969,6 @@ function formatCapabilities(capabilities) {
7385
7969
  return parts.length > 0 ? parts.join(", ") : "none declared";
7386
7970
  }
7387
7971
 
7388
- export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7972
+ export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7389
7973
  //# sourceMappingURL=index.js.map
7390
7974
  //# sourceMappingURL=index.js.map