npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0-beta.7 → 1.0.0-beta.8 - Mend

@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0-beta.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/cli/index.js +1 -1
package/dist/fixtures/mcp.d.ts +33 -8
package/dist/fixtures/mcp.js +284 -24
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +649 -62
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +504 -115
package/dist/index.d.ts +504 -115
package/dist/index.js +648 -64
package/dist/index.js.map +1 -1
package/dist/reporters/ui-dist/app.js +8 -134
package/dist/reporters/ui-dist/styles.css +1 -1
package/package.json +11 -6
package/dist/reporters/mcpReporter.d.cts +0 -90
package/dist/reporters/mcpReporter.d.ts +0 -90

package/dist/cli/index.js CHANGED Viewed

@@ -80,7 +80,7 @@ function JsonPreview({ data, maxLines = 15 }) {
 // package.json
 var package_default = {
-  version: "1.0.0-beta.7"};
+  version: "1.0.0-beta.8"};
 // src/cli/templates/index.ts
 function getPlaywrightConfigTemplate(answers) {

package/dist/fixtures/mcp.d.ts CHANGED Viewed

@@ -214,7 +214,7 @@ type RubricSpec = BuiltInRubric | {
 };
 /** Valid LLM judge provider kinds. */
-type ProviderKind = 'anthropic' | 'openai' | 'google';
+type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
 /**
  * Tool call validators for mcp_host simulation results.
@@ -258,6 +258,12 @@ interface JudgeMatcherOptions {
     provider?: ProviderKind;
     /** Override the judge model */
     model?: string;
+    /**
+     * Name of a registered custom judge executor.
+     * When set, the named judge handles the entire evaluation pipeline;
+     * the matcher passes when the judge's returned `score` meets `passingThreshold`.
+     */
+    judge?: string;
 }
 /**
  * Declaration merging for Playwright matchers
@@ -348,21 +354,30 @@ declare global {
              */
             toBeToolError(expected?: boolean | string | string[]): R;
             /**
-             * Validates that a response passes LLM-as-judge evaluation
+             * Validates that a response passes LLM-as-judge evaluation.
              *
-             * @param rubric - Evaluation rubric/criteria
-             * @param options - Judge options
+             * Three call signatures:
+             * - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
+             * - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
+             * - With judge list: `toPassToolJudge([...judges])` — multi-judge (all must pass)
              *
              * @example
              * ```typescript
+             * // Built-in LLM judge with rubric
              * expect(result).toPassToolJudge('Response should be helpful and accurate');
-             * expect(result).toPassToolJudge('Response should match reference', {
+             * expect(result).toPassToolJudge('correctness', {
              *   reference: expectedOutput,
              *   passingThreshold: 0.8,
              * });
+             *
+             * // Named custom judge (registered via registerJudge)
+             * expect(result).toPassToolJudge({ judge: 'glean-completeness' });
              * ```
              */
             toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
+            toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
+            toPassToolJudge(judges: Array<JudgeMatcherOptions & {
+                rubric?: RubricSpec;
+            }>): Promise<R>;
             /**
              * Validates that a response meets size constraints
              *
@@ -452,16 +467,26 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
  * Validates that a response passes LLM-as-judge evaluation.
  * Delegates evaluation logic to validateJudge() for consistency
  * with the validator/matcher duality pattern.
+ *
+ * Supports three call signatures:
+ *   - toPassToolJudge(rubric, options?)        — built-in LLM judge with rubric
+ *   - toPassToolJudge({ judge: 'name', ... })  — named custom judge
+ *   - toPassToolJudge([...judges])             — multi-judge (all must pass)
  */
 /**
- * Creates the toPassToolJudge matcher function
+ * The toPassToolJudge matcher function.
  *
- * Note: This is an async matcher that calls an LLM for evaluation.
+ * Accepts one of:
+ *   (received, rubric, options?) — rubric-based LLM judge
+ *   (received, options)          — named custom judge (options.judge required)
+ *   (received, judges[])         — multi-judge (all must pass)
  */
 declare function toPassToolJudge(this: {
     isNot: boolean;
-}, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
+}, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
+    rubric?: RubricSpec;
+}>, maybeOptions?: JudgeMatcherOptions): Promise<{
     pass: boolean;
     message: () => string;
 }>;

package/dist/fixtures/mcp.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import { expect as expect$1, test as test$1 } from '@playwright/test';
-import { query } from '@anthropic-ai/claude-agent-sdk';
 import { z } from 'zod';
+import { query } from '@anthropic-ai/claude-agent-sdk';
 import { Client } from '@modelcontextprotocol/sdk/client/index.js';
 import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
 import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
@@ -662,7 +662,175 @@ var JudgeResponseSchema = z.object({
   reasoning: z.string()
 });
-// src/judge/claudeAgentJudge.ts
+// src/judge/anthropicJudge.ts
+function createAnthropicJudge(config = {}) {
+  const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
+  const apiKey = process.env[apiKeyEnvVar];
+  if (!apiKey) {
+    throw new Error(
+      `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
+    );
+  }
+  const model = config.model ?? "claude-sonnet-4-20250514";
+  const maxTokens = config.maxTokens ?? 1e3;
+  const temperature = config.temperature ?? 0;
+  return {
+    async evaluate(candidate, reference, rubric) {
+      let anthropicModule;
+      try {
+        anthropicModule = await import('@anthropic-ai/sdk');
+      } catch (err) {
+        throw new Error(
+          `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
+Original error: ${err instanceof Error ? err.message : String(err)}`
+        );
+      }
+      const client = new anthropicModule.default({ apiKey });
+      const prompt = buildJudgePrompt(candidate, reference, rubric);
+      const startTime = Date.now();
+      const response = await client.messages.create({
+        model,
+        max_tokens: maxTokens,
+        temperature,
+        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
+        messages: [{ role: "user", content: prompt }]
+      });
+      const durationMs = Date.now() - startTime;
+      const textBlock = response.content.find(
+        (b) => b.type === "text"
+      );
+      const text = textBlock?.text ?? "";
+      const parsed = parseJudgeResponse(text);
+      return {
+        pass: parsed.pass,
+        score: parsed.score,
+        reasoning: parsed.reasoning,
+        usage: {
+          inputTokens: response.usage?.input_tokens ?? 0,
+          outputTokens: response.usage?.output_tokens ?? 0,
+          totalCostUsd: 0,
+          durationMs
+        }
+      };
+    }
+  };
+}
+function buildJudgePrompt(candidate, reference, rubric) {
+  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
+  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
+  return `Rubric:
+${rubric}
+<candidate_response>
+${candidateStr}
+</candidate_response>
+<reference_answer>
+${referenceStr ?? "No reference provided."}
+</reference_answer>
+Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
+}
+function parseJudgeResponse(text) {
+  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
+  let parsed;
+  try {
+    parsed = JSON.parse(cleaned);
+  } catch {
+    throw new Error(`Failed to parse judge response as JSON: ${text}`);
+  }
+  const result = JudgeResponseSchema.safeParse(parsed);
+  if (!result.success) {
+    throw new Error(
+      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
+Validation errors: ${JSON.stringify(result.error.issues)}`
+    );
+  }
+  return result.data;
+}
+// src/judge/vertexAnthropicJudge.ts
+function createVertexAnthropicJudge(config = {}) {
+  const model = config.model ?? "claude-sonnet-4-20250514";
+  const maxTokens = config.maxTokens ?? 1e3;
+  const temperature = config.temperature ?? 0;
+  return {
+    async evaluate(candidate, reference, rubric) {
+      let vertexModule;
+      try {
+        vertexModule = await import('@anthropic-ai/vertex-sdk');
+      } catch (err) {
+        throw new Error(
+          `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
+Original error: ${err instanceof Error ? err.message : String(err)}`
+        );
+      }
+      const client = new vertexModule.AnthropicVertex({
+        projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
+        region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
+      });
+      const prompt = buildJudgePrompt2(candidate, reference, rubric);
+      const startTime = Date.now();
+      const response = await client.messages.create({
+        model,
+        max_tokens: maxTokens,
+        temperature,
+        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
+        messages: [{ role: "user", content: prompt }]
+      });
+      const durationMs = Date.now() - startTime;
+      const textBlock = response.content.find(
+        (b) => b.type === "text"
+      );
+      const text = textBlock?.text ?? "";
+      const parsed = parseJudgeResponse2(text);
+      return {
+        pass: parsed.pass,
+        score: parsed.score,
+        reasoning: parsed.reasoning,
+        usage: {
+          inputTokens: response.usage?.input_tokens ?? 0,
+          outputTokens: response.usage?.output_tokens ?? 0,
+          totalCostUsd: 0,
+          durationMs
+        }
+      };
+    }
+  };
+}
+function buildJudgePrompt2(candidate, reference, rubric) {
+  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
+  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
+  return `Rubric:
+${rubric}
+<candidate_response>
+${candidateStr}
+</candidate_response>
+<reference_answer>
+${referenceStr ?? "No reference provided."}
+</reference_answer>
+Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
+}
+function parseJudgeResponse2(text) {
+  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
+  let parsed;
+  try {
+    parsed = JSON.parse(cleaned);
+  } catch {
+    throw new Error(`Failed to parse judge response as JSON: ${text}`);
+  }
+  const result = JudgeResponseSchema.safeParse(parsed);
+  if (!result.success) {
+    throw new Error(
+      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
+Validation errors: ${JSON.stringify(result.error.issues)}`
+    );
+  }
+  return result.data;
+}
 function createClaudeAgentJudge(config) {
   const model = config.model ?? "claude-sonnet-4-20250514";
   const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
@@ -680,7 +848,7 @@ function createClaudeAgentJudge(config) {
           exceedsMaxToolOutputSize: true
         };
       }
-      const prompt = buildJudgePrompt(candidate, reference, rubric);
+      const prompt = buildJudgePrompt3(candidate, reference, rubric);
       try {
         let resultMessage;
         for await (const message of query({
@@ -712,7 +880,7 @@ function createClaudeAgentJudge(config) {
           );
         }
         const responseText = resultMessage.result ?? "";
-        const parsed = parseJudgeResponse(responseText);
+        const parsed = parseJudgeResponse3(responseText);
         const usage = {
           inputTokens: resultMessage.usage?.input_tokens ?? 0,
           outputTokens: resultMessage.usage?.output_tokens ?? 0,
@@ -741,7 +909,7 @@ function createClaudeAgentJudge(config) {
 function buildSystemPrompt() {
   return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
 }
-function buildJudgePrompt(candidate, reference, rubric) {
+function buildJudgePrompt3(candidate, reference, rubric) {
   const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
   const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
   const parts = [];
@@ -758,7 +926,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
   );
   return parts.join("");
 }
-function parseJudgeResponse(text) {
+function parseJudgeResponse3(text) {
   let jsonText = text.trim();
   if (jsonText.startsWith("```json")) {
     jsonText = jsonText.slice(7);
@@ -815,7 +983,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
         );
       }
       const client = new openaiModule.default({ apiKey });
-      const prompt = buildJudgePrompt2(candidate, reference, rubric);
+      const prompt = buildJudgePrompt4(candidate, reference, rubric);
       const startTime = Date.now();
       const completion = await client.chat.completions.create({
         model,
@@ -831,7 +999,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
       });
       const durationMs = Date.now() - startTime;
       const text = completion.choices[0]?.message.content ?? "";
-      const parsed = parseJudgeResponse2(text);
+      const parsed = parseJudgeResponse4(text);
       return {
         pass: parsed.pass,
         score: parsed.score,
@@ -846,7 +1014,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
     }
   };
 }
-function buildJudgePrompt2(candidate, reference, rubric) {
+function buildJudgePrompt4(candidate, reference, rubric) {
   const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
   const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
   return `Rubric:
@@ -862,7 +1030,7 @@ ${referenceStr ?? "No reference provided."}
 Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
 }
-function parseJudgeResponse2(text) {
+function parseJudgeResponse4(text) {
   const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
   let parsed;
   try {
@@ -964,6 +1132,10 @@ function createJudge(config = {}) {
   const provider = config.provider ?? "anthropic";
   switch (provider) {
     case "anthropic":
+      return createAnthropicJudge(config);
+    case "vertex-anthropic":
+      return createVertexAnthropicJudge(config);
+    case "anthropic-agent-sdk":
       return createClaudeAgentJudge(config);
     case "openai":
       return createOpenAIJudge(config);
@@ -974,6 +1146,19 @@ function createJudge(config = {}) {
   }
 }
+// src/judge/judgeRegistry.ts
+var registry = /* @__PURE__ */ new Map();
+function getRegisteredJudge(name) {
+  const executor = registry.get(name);
+  if (!executor) {
+    const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
+    throw new Error(
+      `Judge "${name}" is not registered.${available} Register it with registerJudge() before tests run.`
+    );
+  }
+  return executor;
+}
 // src/assertions/validators/judge.ts
 function computeStdDev(scores, mean) {
   if (scores.length <= 1) return 0;
@@ -982,6 +1167,7 @@ function computeStdDev(scores, mean) {
 }
 async function validateJudge(response, config) {
   const {
+    judge: judgeName,
     rubric,
     reference,
     threshold = 0.7,
@@ -994,6 +1180,29 @@ async function validateJudge(response, config) {
     maxBudgetUsd,
     maxToolOutputSize
   } = config;
+  if (judgeName !== void 0) {
+    try {
+      const executor = getRegisteredJudge(judgeName);
+      const judgeResult = await executor(response, reference ?? void 0);
+      const score = judgeResult.score;
+      const passed = score >= threshold;
+      return {
+        pass: passed,
+        message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
+      };
+    } catch (err) {
+      return {
+        pass: false,
+        message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
+      };
+    }
+  }
+  if (rubric === void 0) {
+    return {
+      pass: false,
+      message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
+    };
+  }
   const resolvedRubric = resolveRubric(rubric);
   const judgeConfig = {
     ...provider !== void 0 && { provider },
@@ -1040,11 +1249,17 @@ async function validateJudge(response, config) {
     return {
       pass: passed,
       message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
-      details: reps > 1 ? {
-        scores,
-        scoreStdDev: stdDev,
-        highVariance
-      } : void 0
+      details: {
+        score: meanScore,
+        reasoning: lastReasoning,
+        judgeProvider: provider ?? "anthropic",
+        judgeModel: model,
+        ...reps > 1 && {
+          scores,
+          scoreStdDev: stdDev,
+          highVariance
+        }
+      }
     };
   } catch (err) {
     return {
@@ -1056,31 +1271,68 @@ async function validateJudge(response, config) {
 // src/assertions/matchers/toPassToolJudge.ts
 var DEFAULT_PASSING_THRESHOLD = 0.7;
-async function toPassToolJudge(received, rubric, options = {}) {
+async function runSingleJudge(received, rubric, options) {
   const {
     reference = null,
     passingThreshold = DEFAULT_PASSING_THRESHOLD,
     reps,
     provider,
-    model
+    model,
+    judge
   } = options;
   const validation = await validateJudge(received, {
-    rubric,
+    ...rubric !== void 0 && { rubric },
     reference: reference ?? void 0,
     threshold: passingThreshold,
     ...reps !== void 0 && { reps },
     ...provider !== void 0 && { provider },
-    ...model !== void 0 && { model }
+    ...model !== void 0 && { model },
+    ...judge !== void 0 && { judge }
   });
+  return { pass: validation.pass, message: validation.message };
+}
+async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
+  if (Array.isArray(rubricOrOptions)) {
+    const results = await Promise.all(
+      rubricOrOptions.map(async (judgeConfig) => {
+        const { rubric: r, ...opts } = judgeConfig;
+        return runSingleJudge(received, r, opts);
+      })
+    );
+    const allPassed = results.every((r) => r.pass);
+    const passCount = results.filter((r) => r.pass).length;
+    const summary = `${passCount}/${results.length} judges passed`;
+    const details = results.map((r) => r.message).join("\n");
+    if (this.isNot) {
+      return {
+        pass: !allPassed,
+        message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
+      };
+    }
+    return {
+      pass: allPassed,
+      message: () => `${summary}
+${details}`
+    };
+  }
+  let rubric;
+  let options;
+  if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
+    rubric = rubricOrOptions;
+    options = maybeOptions ?? {};
+  } else {
+    options = rubricOrOptions;
+  }
+  const result = await runSingleJudge(received, rubric, options);
   if (this.isNot) {
     return {
-      pass: !validation.pass,
-      message: () => validation.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
+      pass: !result.pass,
+      message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
     };
   }
   return {
-    pass: validation.pass,
-    message: () => validation.message
+    pass: result.pass,
+    message: () => result.message
   };
 }
@@ -1188,9 +1440,17 @@ async function toSatisfyToolPredicate(received, predicate, description) {
 function isSimulationResult(value) {
   return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
 }
+function isPatternMatcher(v) {
+  return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
+}
 function partialMatch(actual, expected) {
   return Object.entries(expected).every(([k, v]) => {
     const actualVal = actual[k];
+    if (isPatternMatcher(v)) {
+      if (typeof actualVal !== "string") return false;
+      const re = new RegExp(v.$pattern, v.$flags);
+      return re.test(actualVal);
+    }
     if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
       return partialMatch(
         actualVal,
@@ -1434,7 +1694,7 @@ var debugHttp = createDebug(`${NAMESPACE}:http`);
 // package.json
 var package_default = {
-  version: "1.0.0-beta.7"};
+  version: "1.0.0-beta.8"};
 var debug = createDebug("mcp-server-tester:oauth-flow");
 async function generatePKCE() {
   const codeVerifier = oauth.generateRandomCodeVerifier();