npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0-beta.7 → 1.0.0 - Mend

@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/README.md +20 -1
package/dist/cli/index.js +12 -1
package/dist/fixtures/mcp.d.ts +33 -8
package/dist/fixtures/mcp.js +354 -37
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +721 -76
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +533 -116
package/dist/index.d.ts +533 -116
package/dist/index.js +719 -78
package/dist/index.js.map +1 -1
package/dist/reporters/ui-dist/app.js +8 -134
package/dist/reporters/ui-dist/styles.css +1 -1
package/package.json +11 -6
package/dist/reporters/mcpReporter.d.cts +0 -90
package/dist/reporters/mcpReporter.d.ts +0 -90

package/dist/index.cjs (changed)

@@ -3306,7 +3306,11 @@ async function performOAuthSetup(config) {
     const page = await context.newPage();
     page.setDefaultTimeout(timeoutMs);
     await page.goto(authorizationUrl.toString());
-    await completeLoginForm(page, config);
+    if ("customLoginFlow" in config && config.customLoginFlow) {
+      await config.customLoginFlow(page);
+    } else {
+      await completeLoginForm(page, config);
+    }
     await page.waitForURL(
       (url) => url.href.startsWith(redirectUri) && url.searchParams.has("code"),
       { timeout: timeoutMs }
@@ -4407,7 +4411,7 @@ function escapeHtml(text) {
 // package.json
 var package_default = {
-  version: "1.0.0-beta.7"};
+  version: "1.0.0"};
 // src/mcp/clientFactory.ts
 function getRetryAfterDelayMs(err) {
@@ -4626,6 +4630,17 @@ async function createMCPClientForConfig(config, options) {
 }
 async function closeMCPClient(client) {
   try {
+    const transport = client.transport;
+    if (transport instanceof streamableHttp_js.StreamableHTTPClientTransport) {
+      try {
+        await transport.terminateSession();
+      } catch (sessionError) {
+        debugClient(
+          "Error terminating session: %s",
+          sessionError instanceof Error ? sessionError.message : String(sessionError)
+        );
+      }
+    }
     await client.close();
   } catch (error) {
     debugClient(
@@ -4854,11 +4869,13 @@ function validateSchema(response, schema, options = {}) {
   } catch (error) {
     const zodError = error;
     const issues = formatZodIssues(zodError);
+    const text = stringifyResponse(response);
     return {
       pass: false,
       message: `Response does not match schema: ${issues}`,
       details: {
-        issues: zodError.issues
+        issues: zodError.issues,
+        textPreview: truncateForDisplay2(text)
       }
     };
   }
@@ -4911,6 +4928,12 @@ function formatZodIssues(error) {
   });
   return issues.join("; ");
 }
+function truncateForDisplay2(str, maxLength = 200) {
+  if (str.length <= maxLength) {
+    return str;
+  }
+  return str.slice(0, maxLength) + "... (truncated)";
+}
 // src/assertions/validators/text.ts
 function validateText(response, expected, options = {}) {
@@ -4937,11 +4960,11 @@ function validateText(response, expected, options = {}) {
     details: {
       missing,
       textLength: text.length,
-      textPreview: truncateForDisplay2(text)
+      textPreview: truncateForDisplay3(text)
     }
   };
 }
-function truncateForDisplay2(str, maxLength = 200) {
+function truncateForDisplay3(str, maxLength = 200) {
   if (str.length <= maxLength) {
     return str;
   }
@@ -4973,7 +4996,7 @@ function validatePattern(response, patterns, options = {}) {
     details: {
       unmatched,
       textLength: text.length,
-      textPreview: truncateForDisplay3(text)
+      textPreview: truncateForDisplay4(text)
     }
   };
 }
@@ -4993,7 +5016,7 @@ function patternToString(pattern) {
   }
   return `/${pattern}/`;
 }
-function truncateForDisplay3(str, maxLength = 200) {
+function truncateForDisplay4(str, maxLength = 200) {
   if (str.length <= maxLength) {
     return str;
   }
@@ -5016,7 +5039,7 @@ function validateError(response, expected = true) {
         pass: false,
         message: "Expected an error response but got success",
         details: {
-          textPreview: truncateForDisplay4(extractText2(response))
+          textPreview: truncateForDisplay5(extractText2(response))
         }
       };
     } else {
@@ -5028,7 +5051,7 @@ function validateError(response, expected = true) {
       }
       return {
         pass: false,
-        message: `Expected a success response but got error: "${truncateForDisplay4(errorMessage)}"`,
+        message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
         details: {
           errorMessage
         }
@@ -5041,7 +5064,7 @@ function validateError(response, expected = true) {
       pass: false,
       message: `Expected an error containing "${expectedMessages[0]}" but got success`,
       details: {
-        textPreview: truncateForDisplay4(extractText2(response))
+        textPreview: truncateForDisplay5(extractText2(response))
       }
     };
   }
@@ -5063,7 +5086,7 @@ function validateError(response, expected = true) {
     }
   };
 }
-function truncateForDisplay4(str, maxLength = 200) {
+function truncateForDisplay5(str, maxLength = 200) {
   if (str.length <= maxLength) {
     return str;
   }
@@ -5124,9 +5147,17 @@ function formatBytes(bytes) {
 function isSimulationResult(value) {
   return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
 }
+function isPatternMatcher(v) {
+  return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
+}
 function partialMatch(actual, expected) {
   return Object.entries(expected).every(([k, v]) => {
     const actualVal = actual[k];
+    if (isPatternMatcher(v)) {
+      if (typeof actualVal !== "string") return false;
+      const re = new RegExp(v.$pattern, v.$flags);
+      return re.test(actualVal);
+    }
     if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
       return partialMatch(
         actualVal,
@@ -5173,6 +5204,10 @@ function validateToolCalls(response, expectation) {
           return {
             pass: false,
             message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
+            details: {
+              actual: actual.map((c) => c.name),
+              expected: expected.name
+            },
             metrics
           };
         }
@@ -5189,6 +5224,10 @@ function validateToolCalls(response, expectation) {
         return {
           pass: false,
           message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
+          details: {
+            actual: actual.map((c) => c.name),
+            expected: expected.name
+          },
           metrics
         };
       }
@@ -5201,6 +5240,10 @@ function validateToolCalls(response, expectation) {
       return {
         pass: false,
         message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
+        details: {
+          actual: actual.map((c) => c.name),
+          unexpected: unexpected.map((c) => c.name)
+        },
         metrics
       };
     }
@@ -5219,19 +5262,22 @@ function validateToolCallCount(response, options) {
   if (exact !== void 0 && count !== exact) {
     return {
       pass: false,
-      message: `Expected exactly ${exact} tool call(s), but got ${count}`
+      message: `Expected exactly ${exact} tool call(s), but got ${count}`,
+      details: { actual: count, expected: exact }
     };
   }
   if (min !== void 0 && count < min) {
     return {
       pass: false,
-      message: `Expected at least ${min} tool call(s), but got ${count}`
+      message: `Expected at least ${min} tool call(s), but got ${count}`,
+      details: { actual: count, min }
     };
   }
   if (max !== void 0 && count > max) {
     return {
       pass: false,
-      message: `Expected at most ${max} tool call(s), but got ${count}`
+      message: `Expected at most ${max} tool call(s), but got ${count}`,
+      details: { actual: count, max }
     };
   }
   return {
@@ -5265,7 +5311,175 @@ var JudgeResponseSchema = zod.z.object({
   reasoning: zod.z.string()
 });
-// src/judge/claudeAgentJudge.ts
+// src/judge/anthropicJudge.ts
+function createAnthropicJudge(config = {}) {
+  const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
+  const apiKey = process.env[apiKeyEnvVar];
+  if (!apiKey) {
+    throw new Error(
+      `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
+    );
+  }
+  const model = config.model ?? "claude-sonnet-4-20250514";
+  const maxTokens = config.maxTokens ?? 1e3;
+  const temperature = config.temperature ?? 0;
+  return {
+    async evaluate(candidate, reference, rubric) {
+      let anthropicModule;
+      try {
+        anthropicModule = await import('@anthropic-ai/sdk');
+      } catch (err) {
+        throw new Error(
+          `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
+Original error: ${err instanceof Error ? err.message : String(err)}`
+        );
+      }
+      const client = new anthropicModule.default({ apiKey });
+      const prompt = buildJudgePrompt(candidate, reference, rubric);
+      const startTime = Date.now();
+      const response = await client.messages.create({
+        model,
+        max_tokens: maxTokens,
+        temperature,
+        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
+        messages: [{ role: "user", content: prompt }]
+      });
+      const durationMs = Date.now() - startTime;
+      const textBlock = response.content.find(
+        (b) => b.type === "text"
+      );
+      const text = textBlock?.text ?? "";
+      const parsed = parseJudgeResponse(text);
+      return {
+        pass: parsed.pass,
+        score: parsed.score,
+        reasoning: parsed.reasoning,
+        usage: {
+          inputTokens: response.usage?.input_tokens ?? 0,
+          outputTokens: response.usage?.output_tokens ?? 0,
+          totalCostUsd: 0,
+          durationMs
+        }
+      };
+    }
+  };
+}
+function buildJudgePrompt(candidate, reference, rubric) {
+  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
+  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
+  return `Rubric:
+${rubric}
+<candidate_response>
+${candidateStr}
+</candidate_response>
+<reference_answer>
+${referenceStr ?? "No reference provided."}
+</reference_answer>
+Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
+}
+function parseJudgeResponse(text) {
+  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
+  let parsed;
+  try {
+    parsed = JSON.parse(cleaned);
+  } catch {
+    throw new Error(`Failed to parse judge response as JSON: ${text}`);
+  }
+  const result = JudgeResponseSchema.safeParse(parsed);
+  if (!result.success) {
+    throw new Error(
+      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
+Validation errors: ${JSON.stringify(result.error.issues)}`
+    );
+  }
+  return result.data;
+}
+// src/judge/vertexAnthropicJudge.ts
+function createVertexAnthropicJudge(config = {}) {
+  const model = config.model ?? "claude-sonnet-4-20250514";
+  const maxTokens = config.maxTokens ?? 1e3;
+  const temperature = config.temperature ?? 0;
+  return {
+    async evaluate(candidate, reference, rubric) {
+      let vertexModule;
+      try {
+        vertexModule = await import('@anthropic-ai/vertex-sdk');
+      } catch (err) {
+        throw new Error(
+          `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
+Original error: ${err instanceof Error ? err.message : String(err)}`
+        );
+      }
+      const client = new vertexModule.AnthropicVertex({
+        projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
+        region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
+      });
+      const prompt = buildJudgePrompt2(candidate, reference, rubric);
+      const startTime = Date.now();
+      const response = await client.messages.create({
+        model,
+        max_tokens: maxTokens,
+        temperature,
+        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
+        messages: [{ role: "user", content: prompt }]
+      });
+      const durationMs = Date.now() - startTime;
+      const textBlock = response.content.find(
+        (b) => b.type === "text"
+      );
+      const text = textBlock?.text ?? "";
+      const parsed = parseJudgeResponse2(text);
+      return {
+        pass: parsed.pass,
+        score: parsed.score,
+        reasoning: parsed.reasoning,
+        usage: {
+          inputTokens: response.usage?.input_tokens ?? 0,
+          outputTokens: response.usage?.output_tokens ?? 0,
+          totalCostUsd: 0,
+          durationMs
+        }
+      };
+    }
+  };
+}
+function buildJudgePrompt2(candidate, reference, rubric) {
+  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
+  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
+  return `Rubric:
+${rubric}
+<candidate_response>
+${candidateStr}
+</candidate_response>
+<reference_answer>
+${referenceStr ?? "No reference provided."}
+</reference_answer>
+Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
+}
+function parseJudgeResponse2(text) {
+  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
+  let parsed;
+  try {
+    parsed = JSON.parse(cleaned);
+  } catch {
+    throw new Error(`Failed to parse judge response as JSON: ${text}`);
+  }
+  const result = JudgeResponseSchema.safeParse(parsed);
+  if (!result.success) {
+    throw new Error(
+      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
+Validation errors: ${JSON.stringify(result.error.issues)}`
+    );
+  }
+  return result.data;
+}
 function createClaudeAgentJudge(config) {
   const model = config.model ?? "claude-sonnet-4-20250514";
   const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
@@ -5283,7 +5497,7 @@ function createClaudeAgentJudge(config) {
           exceedsMaxToolOutputSize: true
         };
       }
-      const prompt = buildJudgePrompt(candidate, reference, rubric);
+      const prompt = buildJudgePrompt3(candidate, reference, rubric);
       try {
         let resultMessage;
         for await (const message of claudeAgentSdk.query({
@@ -5315,7 +5529,7 @@ function createClaudeAgentJudge(config) {
           );
         }
         const responseText = resultMessage.result ?? "";
-        const parsed = parseJudgeResponse(responseText);
+        const parsed = parseJudgeResponse3(responseText);
         const usage = {
           inputTokens: resultMessage.usage?.input_tokens ?? 0,
           outputTokens: resultMessage.usage?.output_tokens ?? 0,
@@ -5344,7 +5558,7 @@ function createClaudeAgentJudge(config) {
 function buildSystemPrompt() {
   return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
 }
-function buildJudgePrompt(candidate, reference, rubric) {
+function buildJudgePrompt3(candidate, reference, rubric) {
   const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
   const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
   const parts = [];
@@ -5361,7 +5575,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
   );
   return parts.join("");
 }
-function parseJudgeResponse(text) {
+function parseJudgeResponse3(text) {
   let jsonText = text.trim();
   if (jsonText.startsWith("```json")) {
     jsonText = jsonText.slice(7);
@@ -5418,7 +5632,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
         );
       }
       const client = new openaiModule.default({ apiKey });
-      const prompt = buildJudgePrompt2(candidate, reference, rubric);
+      const prompt = buildJudgePrompt4(candidate, reference, rubric);
       const startTime = Date.now();
       const completion = await client.chat.completions.create({
         model,
@@ -5434,7 +5648,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
       });
       const durationMs = Date.now() - startTime;
       const text = completion.choices[0]?.message.content ?? "";
-      const parsed = parseJudgeResponse2(text);
+      const parsed = parseJudgeResponse4(text);
       return {
         pass: parsed.pass,
         score: parsed.score,
@@ -5449,7 +5663,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
     }
   };
 }
-function buildJudgePrompt2(candidate, reference, rubric) {
+function buildJudgePrompt4(candidate, reference, rubric) {
   const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
   const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
   return `Rubric:
@@ -5465,7 +5679,7 @@ ${referenceStr ?? "No reference provided."}
 Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
 }
-function parseJudgeResponse2(text) {
+function parseJudgeResponse4(text) {
   const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
   let parsed;
   try {
@@ -5567,14 +5781,48 @@ function createJudge(config = {}) {
   const provider = config.provider ?? "anthropic";
   switch (provider) {
     case "anthropic":
+      return createAnthropicJudge(config);
+    case "vertex-anthropic":
+      return createVertexAnthropicJudge(config);
+    case "anthropic-agent-sdk":
       return createClaudeAgentJudge(config);
     case "openai":
       return createOpenAIJudge(config);
     case "google":
       return createGoogleJudge(config);
     default:
-      throw new Error(`Unsupported LLM provider: ${String(provider)}`);
+      throw new Error(
+        `Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
+      );
+  }
+}
+// src/judge/judgeRegistry.ts
+var registry = /* @__PURE__ */ new Map();
+function registerJudge(name15, executor) {
+  const existing = registry.get(name15);
+  if (existing !== void 0) {
+    if (existing === executor) {
+      return;
+    }
+    throw new Error(
+      `Judge "${name15}" is already registered with a different executor. Use clearJudgeRegistry() first if you need to replace it.`
+    );
+  }
+  registry.set(name15, executor);
+}
+function getRegisteredJudge(name15) {
+  const executor = registry.get(name15);
+  if (!executor) {
+    const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
+    throw new Error(
+      `Judge "${name15}" is not registered.${available} Register it with registerJudge() before tests run.`
+    );
   }
+  return executor;
+}
+function clearJudgeRegistry() {
+  registry.clear();
 }
 // src/assertions/validators/judge.ts
@@ -5585,6 +5833,7 @@ function computeStdDev(scores, mean) {
 }
 async function validateJudge(response, config) {
   const {
+    judge: judgeName,
     rubric,
     reference,
     threshold = 0.7,
@@ -5597,6 +5846,29 @@ async function validateJudge(response, config) {
     maxBudgetUsd,
     maxToolOutputSize
   } = config;
+  if (judgeName !== void 0) {
+    try {
+      const executor = getRegisteredJudge(judgeName);
+      const judgeResult = await executor(response, reference ?? void 0);
+      const score = judgeResult.score;
+      const passed = score >= threshold;
+      return {
+        pass: passed,
+        message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
+      };
+    } catch (err) {
+      return {
+        pass: false,
+        message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
+      };
+    }
+  }
+  if (rubric === void 0) {
+    return {
+      pass: false,
+      message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
+    };
+  }
   const resolvedRubric = resolveRubric(rubric);
   const judgeConfig = {
     ...provider !== void 0 && { provider },
@@ -5643,11 +5915,17 @@ async function validateJudge(response, config) {
     return {
       pass: passed,
       message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
-      details: reps > 1 ? {
-        scores,
-        scoreStdDev: stdDev,
-        highVariance
-      } : void 0
+      details: {
+        score: meanScore,
+        reasoning: lastReasoning,
+        judgeProvider: provider ?? "anthropic",
+        judgeModel: model,
+        ...reps > 1 && {
+          scores,
+          scoreStdDev: stdDev,
+          highVariance
+        }
+      }
     };
   } catch (err) {
     return {
@@ -5840,12 +6118,19 @@ function toMatchToolResponse(received, expected) {
 // src/assertions/matchers/toMatchToolSchema.ts
 function toMatchToolSchema(received, schema, options = {}) {
   const result = validateSchema(received, schema, options);
+  const preview = result.details?.textPreview;
   return {
     pass: result.pass,
     message: () => {
       if (this.isNot) {
         return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
       }
+      if (!result.pass && preview) {
+        return `${result.message}
+Actual response (truncated):
+${preview}`;
+      }
       return result.message;
     }
   };
@@ -5854,6 +6139,7 @@ function toMatchToolSchema(received, schema, options = {}) {
 // src/assertions/matchers/toContainToolText.ts
 function toContainToolText(received, expected, options = {}) {
   const result = validateText(received, expected, options);
+  const preview = result.details?.textPreview;
   return {
     pass: result.pass,
     message: () => {
@@ -5861,6 +6147,12 @@ function toContainToolText(received, expected, options = {}) {
         const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
         return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
       }
+      if (!result.pass && preview) {
+        return `${result.message}
+Actual response (truncated):
+${preview}`;
+      }
       return result.message;
     }
   };
@@ -5869,12 +6161,19 @@ function toContainToolText(received, expected, options = {}) {
 // src/assertions/matchers/toMatchToolPattern.ts
 function toMatchToolPattern(received, patterns, options = {}) {
   const result = validatePattern(received, patterns, options);
+  const preview = result.details?.textPreview;
   return {
     pass: result.pass,
     message: () => {
       if (this.isNot) {
         return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
       }
+      if (!result.pass && preview) {
+        return `${result.message}
+Actual response (truncated):
+${preview}`;
+      }
       return result.message;
     }
   };
@@ -6026,31 +6325,68 @@ function toBeToolError(received, expected = true) {
 // src/assertions/matchers/toPassToolJudge.ts
 var DEFAULT_PASSING_THRESHOLD = 0.7;
-async function toPassToolJudge(received, rubric, options = {}) {
+async function runSingleJudge(received, rubric, options) {
   const {
     reference = null,
     passingThreshold = DEFAULT_PASSING_THRESHOLD,
     reps,
     provider,
-    model
+    model,
+    judge
   } = options;
   const validation = await validateJudge(received, {
-    rubric,
+    ...rubric !== void 0 && { rubric },
     reference: reference ?? void 0,
     threshold: passingThreshold,
     ...reps !== void 0 && { reps },
     ...provider !== void 0 && { provider },
-    ...model !== void 0 && { model }
+    ...model !== void 0 && { model },
+    ...judge !== void 0 && { judge }
   });
+  return { pass: validation.pass, message: validation.message };
+}
+async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
+  if (Array.isArray(rubricOrOptions)) {
+    const results = await Promise.all(
+      rubricOrOptions.map(async (judgeConfig) => {
+        const { rubric: r, ...opts } = judgeConfig;
+        return runSingleJudge(received, r, opts);
+      })
+    );
+    const allPassed = results.every((r) => r.pass);
+    const passCount = results.filter((r) => r.pass).length;
+    const summary = `${passCount}/${results.length} judges passed`;
+    const details = results.map((r) => r.message).join("\n");
+    if (this.isNot) {
+      return {
+        pass: !allPassed,
+        message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
+      };
+    }
+    return {
+      pass: allPassed,
+      message: () => `${summary}
+${details}`
+    };
+  }
+  let rubric;
+  let options;
+  if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
+    rubric = rubricOrOptions;
+    options = maybeOptions ?? {};
+  } else {
+    options = rubricOrOptions;
+  }
+  const result = await runSingleJudge(received, rubric, options);
   if (this.isNot) {
     return {
-      pass: !validation.pass,
-      message: () => validation.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
+      pass: !result.pass,
+      message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
     };
   }
   return {
-    pass: validation.pass,
-    message: () => validation.message
+    pass: result.pass,
+    message: () => result.message
   };
 }
@@ -6334,6 +6670,7 @@ function getAuthConfigFromEnv() {
   return void 0;
 }
 var MCPHostConfigSchema = zod.z.object({
+  hostType: zod.z.enum(["sdk", "cli", "browser", "desktop"]).optional(),
   provider: zod.z.enum([
     "openai",
     "anthropic",
@@ -6344,12 +6681,18 @@ var MCPHostConfigSchema = zod.z.object({
     "openrouter",
     "xai",
     "vertex-anthropic"
-  ]),
+  ]).optional(),
   apiKeyEnvVar: zod.z.string().optional(),
   model: zod.z.string().optional(),
   maxTokens: zod.z.number().optional(),
   temperature: zod.z.number().optional(),
-  maxToolCalls: zod.z.number().optional()
+  maxToolCalls: zod.z.number().optional(),
+  cli: zod.z.object({
+    command: zod.z.string(),
+    args: zod.z.array(zod.z.string()),
+    outputFormat: zod.z.enum(["stream-json", "json"]).optional(),
+    timeout: zod.z.number().optional()
+  }).optional()
 });
 var SnapshotSanitizerSchema = zod.z.union([
   // Built-in sanitizers
@@ -6364,6 +6707,37 @@ var SnapshotSanitizerSchema = zod.z.union([
     remove: zod.z.array(zod.z.string())
   })
 ]);
+var JudgeExpectConfigSchema = zod.z.object({
+  judge: zod.z.string().min(1).optional(),
+  rubric: zod.z.union([
+    zod.z.enum([
+      "correctness",
+      "completeness",
+      "groundedness",
+      "instruction-following",
+      "conciseness"
+    ]),
+    zod.z.object({ text: zod.z.string().min(1) })
+  ]).optional(),
+  reference: zod.z.unknown().optional(),
+  threshold: zod.z.number().min(0).max(1).optional(),
+  reps: zod.z.number().int().min(1).optional(),
+  provider: zod.z.enum([
+    "anthropic",
+    "vertex-anthropic",
+    "anthropic-agent-sdk",
+    "openai",
+    "google"
+  ]).optional(),
+  model: zod.z.string().optional(),
+  apiKeyEnvVar: zod.z.string().optional(),
+  maxTokens: zod.z.number().int().positive().optional(),
+  temperature: zod.z.number().min(0).max(1).optional(),
+  maxBudgetUsd: zod.z.number().positive().optional(),
+  maxToolOutputSize: zod.z.number().int().positive().optional()
+}).refine((data) => data.judge !== void 0 || data.rubric !== void 0, {
+  message: 'Either "judge" or "rubric" must be provided in passesJudge'
+});
 var EvalExpectBlockSchema = zod.z.object({
   response: zod.z.unknown().optional(),
   schema: zod.z.string().optional(),
@@ -6372,28 +6746,7 @@ var EvalExpectBlockSchema = zod.z.object({
   snapshot: zod.z.string().optional(),
   snapshotSanitizers: zod.z.array(SnapshotSanitizerSchema).optional(),
   isError: zod.z.union([zod.z.boolean(), zod.z.string(), zod.z.array(zod.z.string())]).optional(),
-  passesJudge: zod.z.object({
-    rubric: zod.z.union([
-      zod.z.enum([
-        "correctness",
-        "completeness",
-        "groundedness",
-        "instruction-following",
-        "conciseness"
-      ]),
-      zod.z.object({ text: zod.z.string().min(1) })
-    ]),
-    reference: zod.z.unknown().optional(),
-    threshold: zod.z.number().min(0).max(1).optional(),
-    reps: zod.z.number().int().min(1).optional(),
-    provider: zod.z.enum(["anthropic", "openai", "google"]).optional(),
-    model: zod.z.string().optional(),
-    apiKeyEnvVar: zod.z.string().optional(),
-    maxTokens: zod.z.number().int().positive().optional(),
-    temperature: zod.z.number().min(0).max(1).optional(),
-    maxBudgetUsd: zod.z.number().positive().optional(),
-    maxToolOutputSize: zod.z.number().int().positive().optional()
-  }).optional(),
+  passesJudge: zod.z.union([JudgeExpectConfigSchema, zod.z.array(JudgeExpectConfigSchema).min(1)]).optional(),
   responseSize: zod.z.object({
     maxBytes: zod.z.number().optional(),
     minBytes: zod.z.number().optional()
@@ -6566,6 +6919,9 @@ function createVercelOrchestrator() {
       try {
         const { generateText, stepCountIs } = await import('ai');
         const { jsonSchema: jsonSchema2 } = await Promise.resolve().then(() => (init_dist3(), dist_exports));
+        if (!config.provider) {
+          throw new Error("provider is required for SDK host type");
+        }
         const modelId = config.model ?? defaultModel(config.provider);
         const model = await loadModel(config.provider, modelId);
         const mcpTools = await mcp.listTools();
@@ -6619,13 +6975,233 @@ function createVercelOrchestrator() {
         return {
           success: false,
           toolCalls: [],
-          error: enrichErrorMessage(err, config.provider)
+          error: enrichErrorMessage(err, config.provider ?? "unknown")
         };
       }
     }
   };
 }
+// src/evals/mcpHost/adapters/cli/parsers.ts
/**
 * Parses newline-delimited "stream-json" CLI output into a simulation result.
 *
 * Every non-blank line is decoded as one JSON event. Lines that are not valid
 * JSON, or that decode to something other than an object, are ignored.
 *
 * - `assistant` events contribute tool calls (`tool_use` blocks) and response
 *   text (`text` blocks).
 * - `user` events contribute `tool_result` blocks to the conversation history.
 * - A `result` event supplies the response text only when no assistant text
 *   was collected, and ends parsing with a failure when `is_error` is true.
 *
 * Tool names of the form `mcp__<server>__<tool>` are reduced to `<tool>`.
 * The server part may itself contain single underscores; the split happens at
 * the first double underscore that follows it.
 *
 * @param {string} stdout - Raw stdout captured from the CLI host.
 * @returns {{success: boolean, toolCalls: Array<object>, response?: string,
 *   conversationHistory?: Array<object>, error?: string}} The parsed result.
 */
function parseStreamJson(stdout) {
  const mcpToolPattern = /^mcp__.+?__(.+)$/;
  const toolCalls = [];
  const textParts = [];
  const conversationHistory = [];
  for (const line of stdout.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Non-JSON lines (banners, log noise) are not events.
      continue;
    }
    // JSON.parse also accepts `null`, numbers and strings; only objects are events.
    if (event === null || typeof event !== "object") {
      continue;
    }
    const content = event.message?.content;
    // Only arrays of block objects are inspected; anything else carries no blocks.
    const blocks = Array.isArray(content)
      ? content.filter((block) => block !== null && typeof block === "object")
      : [];
    if (event.type === "assistant") {
      for (const block of blocks) {
        if (block.type === "tool_use" && block.name) {
          const rawName = block.name;
          const mcpMatch = mcpToolPattern.exec(rawName);
          toolCalls.push({
            name: mcpMatch ? mcpMatch[1] : rawName,
            arguments: block.input ?? {},
            id: block.id
          });
        }
        if (block.type === "text" && block.text) {
          textParts.push(block.text);
        }
      }
    }
    if (event.type === "user") {
      for (const block of blocks) {
        if (block.type === "tool_result") {
          const resultContent = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
          conversationHistory.push({ role: "tool", content: resultContent });
        }
      }
    }
    if (event.type === "result") {
      // The final result text is only a fallback for runs without assistant text.
      if (typeof event.result === "string" && textParts.length === 0) {
        textParts.push(event.result);
      }
      if (event.is_error === true) {
        return {
          success: false,
          toolCalls,
          error: typeof event.result === "string" ? event.result : "CLI host reported an error"
        };
      }
    }
  }
  const response = textParts.join("");
  if (response) {
    conversationHistory.push({ role: "assistant", content: response });
  }
  return {
    success: true,
    toolCalls,
    response: response || void 0,
    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
  };
}
/**
 * Builds a parser for CLI hosts that print a single JSON document.
 *
 * @param {{toolCalls: string, response: string, success?: string}} paths -
 *   Dot-separated paths locating each field inside the parsed document. When
 *   `success` is omitted the parsed result always reports success.
 * @returns {(stdout: string) => {success: boolean, toolCalls: Array<object>,
 *   response?: string}} Parser function; it throws when `stdout` is not valid
 *   JSON (the error comes from `JSON.parse`).
 */
function createJsonParser(paths) {
  return (stdout) => {
    const data = JSON.parse(stdout);
    const rawToolCalls = getNestedValue(data, paths.toolCalls);
    const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls.map((tc) => {
      // Entries such as `null` would throw on property access; treat them as empty calls.
      const entry = tc !== null && typeof tc === "object" ? tc : {};
      return {
        name: typeof entry.name === "string" ? entry.name : "",
        arguments: entry.arguments ?? entry.args ?? {}
      };
    }) : [];
    const response = getNestedValue(data, paths.response);
    const success = paths.success ? Boolean(getNestedValue(data, paths.success)) : true;
    return {
      success,
      toolCalls,
      response: typeof response === "string" ? response : void 0
    };
  };
}
/**
 * Reads a value from nested objects by following a dot-separated path.
 *
 * @param {unknown} obj - Root value to start from.
 * @param {string} path3 - Dot-separated property path, e.g. `"a.b.c"`.
 * @returns {unknown} The value at the path, or `undefined` as soon as a
 *   segment is reached on something that is not a non-null object.
 */
function getNestedValue(obj, path3) {
  return path3.split(".").reduce((current, key) => {
    if (current !== null && typeof current === "object") {
      return current[key];
    }
    return void 0;
  }, obj);
}
+// src/evals/mcpHost/adapters/cli/runner.ts
+var DEFAULT_TIMEOUT = 12e4;
+var MAX_BUFFER = 10 * 1024 * 1024;
/**
 * Selects the stdout parser for a CLI host output format.
 *
 * @param {string} [format] - Output format name; `"stream-json"` is used when
 *   the value is `null` or `undefined`.
 * @returns {(stdout: string) => object} Parser for the requested format.
 * @throws {Error} When the format is not one of the supported names, instead
 *   of returning `undefined` and failing later at the call site.
 */
function getParser(format) {
  const effectiveFormat = format ?? "stream-json";
  switch (effectiveFormat) {
    case "stream-json":
      return parseStreamJson;
    case "json":
      return createJsonParser({
        toolCalls: "toolCalls",
        response: "response",
        success: "success"
      });
    default:
      throw new Error(
        `Unsupported CLI output format: ${String(effectiveFormat)}. Supported formats: 'stream-json', 'json'.`
      );
  }
}
/**
 * Substitutes every `{{scenario}}` placeholder in the argument list.
 *
 * @param {string[]} args - Command-line arguments that may contain the placeholder.
 * @param {string} scenario - Text inserted verbatim for each placeholder.
 * @returns {string[]} New array with the placeholders replaced.
 */
function interpolateArgs(args, scenario) {
  // A replacer function is used so `$&`, `$1`, `$$` and similar sequences in the
  // scenario text are inserted literally rather than read as replacement patterns.
  return args.map((arg) => arg.replace(/\{\{scenario\}\}/g, () => scenario));
}
/**
 * Runs a CLI-based MCP host for one scenario and converts its stdout into a
 * simulation result.
 *
 * Process failures, timeouts, unparseable output and structurally invalid
 * results are all reported as `{ success: false, toolCalls: [], error }`
 * rather than thrown.
 *
 * @param {{command: string, args?: string[], timeout?: number,
 *   outputFormat?: string}} cliConfig - CLI host settings; `timeout` falls
 *   back to `DEFAULT_TIMEOUT`.
 * @param {string} scenario - Scenario text substituted for `{{scenario}}` in `args`.
 * @returns {Promise<object>} The parsed simulation result or a failure result.
 */
async function runCLIHost(cliConfig, scenario) {
  const timeout = cliConfig.timeout ?? DEFAULT_TIMEOUT;
  // Tolerate a config without `args` instead of failing on `undefined.map`.
  const args = interpolateArgs(cliConfig.args ?? [], scenario);
  const startTime = Date.now();
  let stdout;
  try {
    const spawned = await spawnProcess(cliConfig.command, args, { timeout });
    stdout = spawned.stdout;
  } catch (err) {
    const elapsed = Date.now() - startTime;
    const message = err instanceof Error ? err.message : String(err);
    // Classify on the first line only: spawnProcess appends the child's stderr
    // on later lines, and stderr text mentioning "timed out" is not a timeout
    // of this runner.
    const headline = message.split("\n", 1)[0];
    if (headline.includes("TIMEOUT") || headline.includes("timed out")) {
      return {
        success: false,
        toolCalls: [],
        error: `CLI host timed out after ${elapsed}ms (limit: ${timeout}ms). Increase timeout via mcpHostConfig.cli.timeout.`
      };
    }
    return {
      success: false,
      toolCalls: [],
      error: `CLI host process failed: ${message}`
    };
  }
  const parse = getParser(cliConfig.outputFormat);
  let result;
  try {
    result = parse(stdout);
  } catch (err) {
    return {
      success: false,
      toolCalls: [],
      error: `Failed to parse CLI host output: ${err instanceof Error ? err.message : String(err)}\nstdout (first 500 chars): ${stdout.slice(0, 500)}`
    };
  }
  const validationError = validateSimulationResult(result);
  if (validationError) {
    return {
      success: false,
      toolCalls: [],
      error: `CLI host returned invalid result: ${validationError}`
    };
  }
  return result;
}
/**
 * Checks that a parsed CLI host result has the expected structure.
 *
 * @param {unknown} result - Value produced by an output parser.
 * @returns {string|null} A description of the first problem found, or `null`
 *   when `success` is a boolean, `toolCalls` is an array, and every tool call
 *   is an object with a string `name` and a non-null object `arguments`.
 */
function validateSimulationResult(result) {
  // `typeof null` is "object", which would make the messages misleading.
  const describe = (value) => value === null ? "null" : typeof value;
  if (result === null || typeof result !== "object") {
    return `Expected object, got ${describe(result)}`;
  }
  const obj = result;
  if (typeof obj.success !== "boolean") {
    return `"success" must be a boolean, got ${describe(obj.success)}`;
  }
  if (!Array.isArray(obj.toolCalls)) {
    return `"toolCalls" must be an array, got ${describe(obj.toolCalls)}`;
  }
  for (let i = 0; i < obj.toolCalls.length; i++) {
    const tc = obj.toolCalls[i];
    // Guard before property access so a null/primitive entry is reported, not thrown on.
    if (tc === null || typeof tc !== "object") {
      return `toolCalls[${i}] must be an object, got ${describe(tc)}`;
    }
    if (typeof tc.name !== "string") {
      return `toolCalls[${i}].name must be a string, got ${describe(tc.name)}`;
    }
    if (typeof tc.arguments !== "object" || tc.arguments === null) {
      return `toolCalls[${i}].arguments must be an object, got ${describe(tc.arguments)}`;
    }
  }
  return null;
}
/**
 * Spawns a command, closes its stdin, and collects stdout and stderr.
 *
 * The promise settles exactly once:
 * - resolves with `{ stdout, stderr }` when the process exits with code 0;
 * - rejects when the process cannot be spawned, exits with a non-zero or
 *   null code, exceeds `options.timeout`, or produces more than `MAX_BUFFER`
 *   bytes of combined stdout and stderr. In the last two cases the child is
 *   sent SIGTERM.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{timeout: number}} options - `timeout` is in milliseconds.
 * @returns {Promise<{stdout: string, stderr: string}>} Captured output, decoded as UTF-8.
 */
function spawnProcess(command, args, options) {
  return new Promise((resolve2, reject) => {
    const child = child_process.spawn(command, args, {
      stdio: ["pipe", "pipe", "pipe"]
    });
    child.stdin.end();
    const stdoutChunks = [];
    const stderrChunks = [];
    let totalBytes = 0;
    let settled = false;
    let timer;
    // Funnel every outcome through one place so the timer is always cleared and
    // later events (e.g. "close" after a kill) cannot settle the promise again.
    const settle = (finish, value) => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timer);
      finish(value);
    };
    const collectInto = (chunks) => (chunk) => {
      if (settled) {
        return;
      }
      totalBytes += chunk.length;
      if (totalBytes > MAX_BUFFER) {
        // Fail loudly instead of handing truncated output to the parser.
        child.kill("SIGTERM");
        settle(
          reject,
          new Error(`Process output exceeded ${MAX_BUFFER} bytes (stdout and stderr combined)`)
        );
        return;
      }
      chunks.push(chunk);
    };
    child.stdout.on("data", collectInto(stdoutChunks));
    child.stderr.on("data", collectInto(stderrChunks));
    timer = setTimeout(() => {
      child.kill("SIGTERM");
      settle(reject, new Error(`Process timed out after ${options.timeout}ms`));
    }, options.timeout);
    child.on("error", (err) => {
      settle(reject, err);
    });
    child.on("close", (code) => {
      const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
      const stderr = Buffer.concat(stderrChunks).toString("utf-8");
      if (code !== 0) {
        settle(
          reject,
          new Error(
            `Command failed with exit code ${code ?? "null"}` + (stderr ? `\nstderr: ${stderr}` : "")
          )
        );
        return;
      }
      settle(resolve2, { stdout, stderr });
    });
  });
}
 // src/evals/mcpHost/mcpHostSimulation.ts
 var vercelOrchestrator = createVercelOrchestrator();
 var allProviders = [
@@ -6643,6 +7219,25 @@ var simulatorRegistry = new Map(
   allProviders.map((p) => [p, vercelOrchestrator])
 );
 async function simulateMCPHost(mcp, scenario, config) {
+  const hostType = config.hostType ?? "sdk";
+  if (hostType === "cli") {
+    if (!config.cli) {
+      throw new Error(
+        `mcpHostConfig.cli is required when hostType is 'cli'. Provide { command } with a shell command containing {{scenario}}.`
+      );
+    }
+    return runCLIHost(config.cli, scenario);
+  }
+  if (hostType === "browser" || hostType === "desktop") {
+    throw new Error(
+      `Host type '${hostType}' is not yet implemented. Supported host types: 'sdk', 'cli'.`
+    );
+  }
+  if (!config.provider) {
+    throw new Error(
+      `mcpHostConfig.provider is required for 'sdk' host type. Supported: ${allProviders.join(", ")}`
+    );
+  }
   const simulator = simulatorRegistry.get(config.provider);
   if (!simulator) {
     throw new Error(
@@ -6834,17 +7429,39 @@ async function runExpectBlockValidations(expectBlock, response, config) {
     };
   }
   if (expectBlock.passesJudge !== void 0) {
-    const effectiveReps = expectBlock.passesJudge.reps ?? config.judgeReps ?? 1;
-    const effectiveReference = expectBlock.passesJudge.reference !== void 0 ? expectBlock.passesJudge.reference : config.canonicalAnswer;
-    const validation = await validateJudge(response, {
-      ...expectBlock.passesJudge,
-      reference: effectiveReference,
-      reps: effectiveReps
-    });
-    results.judge = {
-      pass: validation.pass,
-      details: validation.message
-    };
+    const judgeConfigs = Array.isArray(expectBlock.passesJudge) ? expectBlock.passesJudge : [expectBlock.passesJudge];
+    const judgeResultEntries = await Promise.all(
+      judgeConfigs.map(async (judgeConfig) => {
+        const effectiveReps = judgeConfig.reps ?? config.judgeReps ?? 1;
+        const effectiveReference = judgeConfig.reference !== void 0 ? judgeConfig.reference : config.canonicalAnswer;
+        const validation = await validateJudge(response, {
+          ...judgeConfig,
+          reference: effectiveReference,
+          reps: effectiveReps
+        });
+        const judgeName = judgeConfig.judge ?? (typeof judgeConfig.rubric === "string" ? judgeConfig.rubric : void 0);
+        return {
+          pass: validation.pass,
+          details: validation.message,
+          score: validation.details?.score,
+          reasoning: validation.details?.reasoning,
+          judgeName,
+          judgeProvider: validation.details?.judgeProvider,
+          judgeModel: validation.details?.judgeModel
+        };
+      })
+    );
+    if (judgeResultEntries.length === 1) {
+      results.judge = judgeResultEntries[0];
+    } else {
+      const allPassed = judgeResultEntries.every((r) => r.pass);
+      const passCount = judgeResultEntries.filter((r) => r.pass).length;
+      results.judge = {
+        pass: allPassed,
+        details: `${passCount}/${judgeResultEntries.length} judges passed`,
+        judgeResults: judgeResultEntries
+      };
+    }
   }
   if (expectBlock.snapshot !== void 0) {
     if (!config.playwrightExpect) {
@@ -6873,6 +7490,24 @@ async function runExpectBlockValidations(expectBlock, response, config) {
   }
   return { expectations: results, toolPrecision, toolRecall };
 }
/**
 * Builds the `request` summary recorded alongside an eval case result.
 *
 * For `mcp_host` cases it carries the scenario and the host's provider/model;
 * for every other mode it carries the tool arguments. Fields that are not set
 * on the case are left out of the summary.
 *
 * @param {object} evalCase - The eval case being run.
 * @returns {object} Summary containing only the fields present on the case.
 */
function buildRequest(evalCase) {
  const request = {};
  if (evalCase.description) request.description = evalCase.description;
  if (evalCase.mode === "mcp_host") {
    if (evalCase.scenario) request.scenario = evalCase.scenario;
    const hostConfig = evalCase.mcpHostConfig;
    if (hostConfig) {
      // `provider` is optional for non-SDK hosts, so it is handled like `model`:
      // included only when defined rather than recorded as `provider: undefined`.
      request.mcpHostConfig = {
        ...hostConfig.provider !== void 0 && { provider: hostConfig.provider },
        ...hostConfig.model !== void 0 && { model: hostConfig.model }
      };
    }
  } else {
    if (evalCase.args) request.args = evalCase.args;
  }
  return request;
}
 /**
  * Tells whether a value looks like an MCP host simulation result: a non-null
  * object that has a `success` property and a `toolCalls` property holding an
  * array.
  *
  * @param {unknown} value - Value to inspect.
  * @returns {boolean} `true` when the value has that shape.
  */
 function isMCPHostSimulationResult(value) {
   if (typeof value !== "object" || value === null) {
     return false;
   }
   const hasRequiredKeys = "success" in value && "toolCalls" in value;
   return hasRequiredKeys && Array.isArray(value.toolCalls);
 }
@@ -6921,6 +7556,7 @@ async function runSingleIteration(evalCase, context, options) {
     toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
     source: "eval",
     pass: didCasePass(error, expectationResults),
+    request: buildRequest(evalCase),
     response,
     error,
     expectations: expectationResults,
@@ -6946,7 +7582,7 @@ function isInfrastructureError(err) {
   } else {
     return false;
   }
-  return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
+  return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
   msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
 }
 async function runEvalCase(evalCase, context, options = {}) {
@@ -7063,8 +7699,13 @@ async function runEvalDataset(options, context) {
   const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
   const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
     const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
-    const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
-    return sum + effectiveIterations * judgeReps;
+    if (c.expect?.passesJudge == null) return sum;
+    const judges = Array.isArray(c.expect.passesJudge) ? c.expect.passesJudge : [c.expect.passesJudge];
+    const totalReps = judges.reduce(
+      (r, j) => r + (j.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1),
+      0
+    );
+    return sum + effectiveIterations * totalReps;
   }, 0);
   if (estimatedJudgeCalls > 50) {
     debugEval(
@@ -7421,6 +8062,7 @@ exports.EvalDatasetSchema = EvalDatasetSchema;
 exports.MCPConfigSchema = MCPConfigSchema;
 exports.MCP_PROTOCOL_VERSION = MCP_PROTOCOL_VERSION;
 exports.SnapshotSanitizers = SnapshotSanitizers;
+exports.clearJudgeRegistry = clearJudgeRegistry;
 exports.closeMCPClient = closeMCPClient;
 exports.createJudge = createJudge;
 exports.createMCPClientForConfig = createMCPClientForConfig;
@@ -7431,6 +8073,7 @@ exports.discoverProtectedResource = discoverProtectedResource;
 exports.expect = expect;
 exports.extractText = extractText;
 exports.getMissingDependencyMessage = getMissingDependencyMessage;
+exports.getRegisteredJudge = getRegisteredJudge;
 exports.getResponseSizeBytes = getResponseSizeBytes;
 exports.hasValidTokens = hasValidTokens;
 exports.injectTokens = injectTokens;
@@ -7451,6 +8094,8 @@ exports.normalizeWhitespace = normalizeWhitespace;
 exports.performClientCredentialsFlow = performClientCredentialsFlow;
 exports.performOAuthSetup = performOAuthSetup;
 exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
+exports.refreshAccessToken = refreshAccessToken;
+exports.registerJudge = registerJudge;
 exports.resolveRubric = resolveRubric;
 exports.runConformanceChecks = runConformanceChecks;
 exports.runEvalCase = runEvalCase;