npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0-beta.3 → 1.0.0-beta.5 - Mend

@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.5

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (19)

package/README.md +11 -10
package/dist/cli/index.js +34 -11
package/dist/fixtures/mcp.d.ts +6 -6
package/dist/fixtures/mcp.js +5 -5
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +79 -45
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +215 -1168
package/dist/index.d.ts +215 -1168
package/dist/index.js +79 -45
package/dist/index.js.map +1 -1
package/dist/reporters/mcpReporter.cjs.map +1 -1
package/dist/reporters/mcpReporter.js.map +1 -1
package/dist/reporters/ui-dist/app.js +107 -7
package/dist/reporters/ui-dist/styles.css +1 -1
package/package.json +9 -6
package/src/reporters/ui-dist/app.js +0 -174
package/src/reporters/ui-dist/index.html +0 -28
package/src/reporters/ui-dist/styles.css +0 -1

package/dist/index.cjs CHANGED

@@ -3127,7 +3127,7 @@ var init_dist3 = __esm({
   }
 });
 var MCPHostCapabilitiesSchema = zod.z.object({
-  sampling: zod.z.record(zod.z.unknown()).optional(),
+  sampling: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
   roots: zod.z.object({
     listChanged: zod.z.boolean()
   }).optional()
@@ -3186,7 +3186,7 @@ var HttpConfigSchema = zod.z.object({
     }
     return true;
   }),
-  headers: zod.z.record(zod.z.string()).optional(),
+  headers: zod.z.record(zod.z.string(), zod.z.string()).optional(),
   capabilities: MCPHostCapabilitiesSchema.optional(),
   connectTimeoutMs: zod.z.number().positive().optional(),
   requestTimeoutMs: zod.z.number().positive().optional(),
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
 // package.json
 var package_default = {
-  version: "1.0.0-beta.3"};
+  version: "1.0.0-beta.5"};
 // src/mcp/clientFactory.ts
 function getRetryAfterDelayMs(err) {
@@ -5151,7 +5151,7 @@ function validateToolCalls(response, expectation) {
   if (!isSimulationResult(response)) {
     return {
       pass: false,
-      message: "toolsTriggered expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
+      message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
     };
   }
   const actual = response.toolCalls;
@@ -5211,7 +5211,7 @@ function validateToolCallCount(response, options) {
   if (!isSimulationResult(response)) {
     return {
       pass: false,
-      message: "toolCallCount expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
+      message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
     };
   }
   const count = response.toolCalls.length;
@@ -6333,7 +6333,7 @@ function getAuthConfigFromEnv() {
   }
   return void 0;
 }
-var LLMHostConfigSchema = zod.z.object({
+var MCPHostConfigSchema = zod.z.object({
   provider: zod.z.enum([
     "openai",
     "anthropic",
@@ -6402,7 +6402,7 @@ var EvalExpectBlockSchema = zod.z.object({
     calls: zod.z.array(
       zod.z.object({
         name: zod.z.string(),
-        arguments: zod.z.record(zod.z.unknown()).optional(),
+        arguments: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
         required: zod.z.boolean().optional()
       })
     ),
@@ -6418,12 +6418,12 @@ var EvalExpectBlockSchema = zod.z.object({
 var EvalCaseSchema = zod.z.object({
   id: zod.z.string().min(1, "id must not be empty"),
   description: zod.z.string().optional(),
-  mode: zod.z.enum(["direct", "llm_host"]).optional(),
+  mode: zod.z.enum(["direct", "mcp_host"]).optional(),
   toolName: zod.z.string().min(1, "toolName must not be empty").optional(),
-  args: zod.z.record(zod.z.unknown()).optional(),
+  args: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
   scenario: zod.z.string().optional(),
-  llmHostConfig: LLMHostConfigSchema.optional(),
-  metadata: zod.z.record(zod.z.unknown()).optional(),
+  mcpHostConfig: MCPHostConfigSchema.optional(),
+  metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
   iterations: zod.z.number().int().min(1).optional(),
   accuracyThreshold: zod.z.number().min(0).max(1).optional(),
   judgeReps: zod.z.number().int().min(1).optional(),
@@ -6435,7 +6435,7 @@ var EvalDatasetSchema = zod.z.object({
   name: zod.z.string().min(1, "name must not be empty"),
   description: zod.z.string().optional(),
   cases: zod.z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
-  metadata: zod.z.record(zod.z.unknown()).optional()
+  metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional()
 });
 function validateEvalCase(evalCase) {
   return EvalCaseSchema.parse(evalCase);
@@ -6473,30 +6473,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
   return dataset;
 }
-// src/evals/llmHost/adapters/vercel.ts
+// src/evals/mcpHost/adapters/vercel.ts
 function enrichErrorMessage(err, provider) {
   const raw = err instanceof Error ? err.message : String(err);
   if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
-    return `LLM host simulation failed: required package not installed.
-Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/llm-host.md for install instructions.`;
+    return `MCP host simulation failed: required package not installed.
+Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
   }
   if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
-    return `LLM host simulation failed: authentication error.
+    return `MCP host simulation failed: authentication error.
 Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
   }
   if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
-    return `LLM host simulation failed: model not found.
+    return `MCP host simulation failed: model not found.
 Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
   }
   if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
-    return `LLM host simulation failed: network error.
+    return `MCP host simulation failed: network error.
 Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
   }
   if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
-    return `LLM host simulation failed: rate limited.
+    return `MCP host simulation failed: rate limited.
 Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
   }
-  return `LLM host simulation failed: ${raw}`;
+  return `MCP host simulation failed: ${raw}`;
 }
 async function loadModel(provider, model) {
   switch (provider) {
@@ -6626,7 +6626,7 @@ function createVercelOrchestrator() {
   };
 }
-// src/evals/llmHost/llmHostSimulation.ts
+// src/evals/mcpHost/mcpHostSimulation.ts
 var vercelOrchestrator = createVercelOrchestrator();
 var allProviders = [
   "openai",
@@ -6642,7 +6642,7 @@ var allProviders = [
 var simulatorRegistry = new Map(
   allProviders.map((p) => [p, vercelOrchestrator])
 );
-async function simulateLLMHost(mcp, scenario, config) {
+async function simulateMCPHost(mcp, scenario, config) {
   const simulator = simulatorRegistry.get(config.provider);
   if (!simulator) {
     throw new Error(
@@ -6664,7 +6664,7 @@ function getMissingDependencyMessage(provider) {
     deepseek: "npm install ai @ai-sdk/deepseek",
     openrouter: "npm install ai @openrouter/ai-sdk-provider",
     xai: "npm install ai @ai-sdk/xai",
-    "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/llm-host.md)"
+    "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
   };
   const pkg = packageMap[provider];
   return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
@@ -6707,24 +6707,24 @@ async function execFileNoThrow(file, args) {
 async function executeToolCall(evalCase, mcp) {
   const mode = evalCase.mode || "direct";
   try {
-    if (mode === "llm_host") {
+    if (mode === "mcp_host") {
       if (!evalCase.scenario) {
         throw new Error(
-          `Eval case ${evalCase.id}: scenario is required for llm_host mode`
+          `Eval case ${evalCase.id}: scenario is required for mcp_host mode`
         );
       }
-      if (!evalCase.llmHostConfig) {
+      if (!evalCase.mcpHostConfig) {
         throw new Error(
-          `Eval case ${evalCase.id}: llmHostConfig is required for llm_host mode`
+          `Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
         );
       }
-      const simulationResult = await simulateLLMHost(
+      const simulationResult = await simulateMCPHost(
         mcp,
         evalCase.scenario,
-        evalCase.llmHostConfig
+        evalCase.mcpHostConfig
       );
       if (!simulationResult.success) {
-        throw new Error(simulationResult.error || "LLM host simulation failed");
+        throw new Error(simulationResult.error || "MCP host simulation failed");
       }
       return { response: simulationResult };
     } else {
@@ -6866,12 +6866,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
   }
   return { expectations: results, toolPrecision, toolRecall };
 }
+function isMCPHostSimulationResult(value) {
+  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
+}
 async function runSingleIteration(evalCase, context, options) {
   const startTime = Date.now();
   const { response, error } = await executeToolCall(evalCase, context.mcp);
   let expectationResults = {};
   let toolPrecision;
   let toolRecall;
+  let mcpHostTrace;
   if (!error && evalCase.expect) {
     const {
       expectations,
@@ -6886,11 +6890,28 @@ async function runSingleIteration(evalCase, context, options) {
     expectationResults = expectations;
     toolPrecision = tp;
     toolRecall = tr;
+    if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
+      const expectedNames = new Set(
+        evalCase.expect.toolsTriggered.calls.map((c) => c.name)
+      );
+      const requiredNames = new Set(
+        evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
+      );
+      const calledNames = new Set(response.toolCalls.map((c) => c.name));
+      mcpHostTrace = {
+        calls: response.toolCalls.map((call) => ({
+          name: call.name,
+          arguments: call.arguments,
+          status: expectedNames.has(call.name) ? "expected" : "unexpected"
+        })),
+        missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
+      };
+    }
   }
   return {
     id: evalCase.id,
     datasetName: options.datasetName ?? "single-case",
-    toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
+    toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
     source: "eval",
     pass: didCasePass(error, expectationResults),
     response,
@@ -6901,7 +6922,8 @@ async function runSingleIteration(evalCase, context, options) {
     durationMs: Date.now() - startTime,
     tags: evalCase.tags,
     toolPrecision,
-    toolRecall
+    toolRecall,
+    mcpHostTrace
   };
 }
 function isInfrastructureError(err) {
@@ -6958,7 +6980,7 @@ async function runEvalCase(evalCase, context, options = {}) {
   const baseResult = lastResult ?? {
     id: evalCase.id,
     datasetName: options.datasetName ?? "single-case",
-    toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
+    toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
     source: "eval",
     pass: false,
     error: iterationResults[0]?.error,
@@ -6972,12 +6994,25 @@ async function runEvalCase(evalCase, context, options = {}) {
     ...baseResult,
     pass: assertionPassRate >= threshold,
     assertionPassRate,
+    assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
     infrastructureErrorRate,
     iterationResults,
     infrastructureErrorCount: infraErrors.length,
     durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
   };
 }
+function wilsonCI(k, n) {
+  if (n < 2) return void 0;
+  const z5 = 1.96;
+  const z22 = z5 * z5;
+  const \u00F1 = n + z22;
+  const p\u0303 = (k + z22 / 2) / \u00F1;
+  const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
+  return {
+    lower: Math.max(0, p\u0303 - margin),
+    upper: Math.min(1, p\u0303 + margin)
+  };
+}
 async function runWithConcurrency(tasks, limit) {
   const results = new Array(tasks.length);
   let index = 0;
@@ -7007,7 +7042,7 @@ async function runEvalDataset(options, context) {
     filterTags,
     saveResultsTo,
     baselineResultsFrom,
-    llmHostModel,
+    mcpHostModel,
     judgeModel
   } = options;
   const startTime = Date.now();
@@ -7017,7 +7052,7 @@ async function runEvalDataset(options, context) {
   };
   const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
   const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
-    const effectiveIterations = c.mode === "llm_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
+    const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
     const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
     return sum + effectiveIterations * judgeReps;
   }, 0);
@@ -7027,12 +7062,12 @@ async function runEvalDataset(options, context) {
     );
   }
   const tasks = casesToRun.map((evalCase) => async () => {
-    const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
-    if (evalCase.mode === "llm_host") {
+    const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
+    if (evalCase.mode === "mcp_host") {
       const effectiveIterations = withIterations.iterations ?? 1;
       if (effectiveIterations > 1 && effectiveIterations < 10) {
         console.warn(
-          `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
+          `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
         );
       }
     }
@@ -7064,7 +7099,7 @@ async function runEvalDataset(options, context) {
     gitHash,
     timestamp: (/* @__PURE__ */ new Date()).toISOString(),
     packageVersion: package_default.version,
-    ...llmHostModel !== void 0 && { llmHostModel },
+    ...mcpHostModel !== void 0 && { mcpHostModel },
     ...judgeModel !== void 0 && { judgeModel }
   };
   const result = {
@@ -7109,12 +7144,12 @@ async function runEvalDataset(options, context) {
       );
     }
   }
-  const llmHostCases = caseResults.filter(
+  const mcpHostCases = caseResults.filter(
     (r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
   );
-  if (llmHostCases.length > 0) {
-    const avgPrec = llmHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / llmHostCases.length;
-    const avgRecall = llmHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / llmHostCases.length;
+  if (mcpHostCases.length > 0) {
+    const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
+    const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
     result.datasetToolPrecision = avgPrec;
     result.datasetToolRecall = avgRecall;
     result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
@@ -7182,7 +7217,6 @@ async function runServerComparison(options, contextA, contextB) {
     bWins,
     ties,
     bothFail,
-    bothFailCount: bothFail,
     decidedCases,
     failureAlignment: total > 0 ? bothFail / total : 0,
     aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
@@ -7411,7 +7445,7 @@ exports.runEvalCase = runEvalCase;
 exports.runEvalDataset = runEvalDataset;
 exports.runServerComparison = runServerComparison;
 exports.saveBaseline = saveBaseline;
-exports.simulateLLMHost = simulateLLMHost;
+exports.simulateMCPHost = simulateMCPHost;
 exports.test = test;
 exports.validateAccessToken = validateAccessToken;
 exports.validateError = validateError;