npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0 → 1.0.1-beta.0 - Mend

@gleanwork/mcp-server-tester 1.0.0 → 1.0.1-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/cli/index.js +1 -1
package/dist/fixtures/mcp.js +1 -1
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +70 -10
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +88 -14
package/dist/index.d.ts +88 -14
package/dist/index.js +70 -10
package/dist/index.js.map +1 -1
package/dist/reporters/mcpReporter.cjs +34 -1
package/dist/reporters/mcpReporter.cjs.map +1 -1
package/dist/reporters/mcpReporter.d.cts +90 -0
package/dist/reporters/mcpReporter.d.ts +90 -0
package/dist/reporters/mcpReporter.js +34 -1
package/dist/reporters/mcpReporter.js.map +1 -1
package/package.json +1 -1

package/dist/index.d.cts CHANGED Viewed

@@ -2684,6 +2684,56 @@ interface CLIConfig {
      */
     timeout?: number;
 }
+/**
+ * Cookie injected into the browser context before the script runs; only `name` and `value` are required.
+ * Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
+ */
+interface BrowserCookie {
+    name: string;
+    value: string;
+    url?: string; // NOTE(review): Playwright expects either `url` or `domain` + `path`; this type does not enforce it — confirm
+    domain?: string;
+    path?: string;
+    expires?: number; // presumably Unix time in seconds, as in Playwright — confirm
+    httpOnly?: boolean;
+    secure?: boolean;
+    sameSite?: 'Strict' | 'Lax' | 'None';
+    partitionKey?: string;
+}
+/**
+ * Configuration for a browser-based host (the `browser` field of `MCPHostConfig`).
+ *
+ * Uses Playwright to launch a Chromium instance, inject auth state
+ * (presumably `storageState` and `cookies` — confirm), and execute a
+ * user-provided script that drives a web-based MCP host (e.g., claude.ai).
+ */
+interface BrowserConfig {
+    /**
+     * Path to the browser script (resolved relative to cwd). The only required field.
+     * The script must default-export an async function
+     * `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
+     */
+    script: string;
+    /**
+     * Timeout in milliseconds for the browser script.
+     * @default 120000 (2 minutes)
+     */
+    timeout?: number;
+    /**
+     * Whether to launch in headless mode.
+     * @default true
+     */
+    headless?: boolean;
+    /**
+     * Path to a Playwright storage state JSON file (cookies + localStorage).
+     * Resolved relative to cwd.
+     */
+    storageState?: string;
+    /**
+     * Extra cookies to inject into the browser context (presumably on top of `storageState` — confirm).
+     */
+    cookies?: BrowserCookie[];
+}
 /**
  * Configuration for MCP host simulation
  */
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
      * CLI host configuration (required for 'cli' host type).
      */
     cli?: CLIConfig;
+    /**
+     * Browser host configuration (required for 'browser' host type).
+     */
+    browser?: BrowserConfig;
 }
 /**
  * A tool call made by the LLM
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
      * (excludes LLM response time)
      */
     mcpDurationMs?: number;
+    /**
+     * Token usage from the LLM during simulation. Populated by SDK-based hosts from the
+     * AI SDK response and by the CLI stream-json parser from the `result` event's `usage`.
+     */
+    usage?: UsageMetrics;
 }
 /**
  * Interface for MCP host simulators.
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
             desktop: "desktop";
         }>>;
         provider: z.ZodOptional<z.ZodEnum<{
-            openai: "openai";
             anthropic: "anthropic";
-            azure: "azure";
+            "vertex-anthropic": "vertex-anthropic";
+            openai: "openai";
             google: "google";
+            azure: "azure";
             mistral: "mistral";
             deepseek: "deepseek";
             openrouter: "openrouter";
             xai: "xai";
-            "vertex-anthropic": "vertex-anthropic";
         }>>;
         apiKeyEnvVar: z.ZodOptional<z.ZodString>;
         model: z.ZodOptional<z.ZodString>;
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
             threshold: z.ZodOptional<z.ZodNumber>;
             reps: z.ZodOptional<z.ZodNumber>;
             provider: z.ZodOptional<z.ZodEnum<{
-                openai: "openai";
                 anthropic: "anthropic";
-                google: "google";
                 "vertex-anthropic": "vertex-anthropic";
                 "anthropic-agent-sdk": "anthropic-agent-sdk";
+                openai: "openai";
+                google: "google";
             }>>;
             model: z.ZodOptional<z.ZodString>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
             threshold: z.ZodOptional<z.ZodNumber>;
             reps: z.ZodOptional<z.ZodNumber>;
             provider: z.ZodOptional<z.ZodEnum<{
-                openai: "openai";
                 anthropic: "anthropic";
-                google: "google";
                 "vertex-anthropic": "vertex-anthropic";
                 "anthropic-agent-sdk": "anthropic-agent-sdk";
+                openai: "openai";
+                google: "google";
             }>>;
             model: z.ZodOptional<z.ZodString>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 desktop: "desktop";
             }>>;
             provider: z.ZodOptional<z.ZodEnum<{
-                openai: "openai";
                 anthropic: "anthropic";
-                azure: "azure";
+                "vertex-anthropic": "vertex-anthropic";
+                openai: "openai";
                 google: "google";
+                azure: "azure";
                 mistral: "mistral";
                 deepseek: "deepseek";
                 openrouter: "openrouter";
                 xai: "xai";
-                "vertex-anthropic": "vertex-anthropic";
             }>>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
             model: z.ZodOptional<z.ZodString>;
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 threshold: z.ZodOptional<z.ZodNumber>;
                 reps: z.ZodOptional<z.ZodNumber>;
                 provider: z.ZodOptional<z.ZodEnum<{
-                    openai: "openai";
                     anthropic: "anthropic";
-                    google: "google";
                     "vertex-anthropic": "vertex-anthropic";
                     "anthropic-agent-sdk": "anthropic-agent-sdk";
+                    openai: "openai";
+                    google: "google";
                 }>>;
                 model: z.ZodOptional<z.ZodString>;
                 apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 threshold: z.ZodOptional<z.ZodNumber>;
                 reps: z.ZodOptional<z.ZodNumber>;
                 provider: z.ZodOptional<z.ZodEnum<{
-                    openai: "openai";
                     anthropic: "anthropic";
-                    google: "google";
                     "vertex-anthropic": "vertex-anthropic";
                     "anthropic-agent-sdk": "anthropic-agent-sdk";
+                    openai: "openai";
+                    google: "google";
                 }>>;
                 model: z.ZodOptional<z.ZodString>;
                 apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3562,6 +3621,8 @@ interface IterationResult {
             name: string;
         }>;
     };
+    /** Token usage from mcp_host LLM simulation in this iteration */
+    hostUsage?: UsageMetrics;
 }
 /**
  * Request data captured from the eval case input.
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
             name: string;
         }>;
     };
+    /**
+     * Aggregate token usage from mcp_host LLM simulation for this case.
+     * Summed across all iterations. Only populated for mcp_host mode cases.
+     */
+    hostUsage?: UsageMetrics;
 }
 /**
  * Aggregated MCP eval run data
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
          * Expectation type breakdown
          */
         expectationBreakdown: ExpectationBreakdown;
+        /**
+         * Aggregate token usage from all mcp_host LLM simulations in this run.
+         */
+        totalHostUsage?: UsageMetrics;
     };
     /**
      * All eval results from this run
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
      * Experiment tracking metadata captured at run time.
      */
     metadata?: EvalRunMetadata;
+    /**
+     * Aggregate token usage from all mcp_host LLM simulations across all cases.
+     */
+    totalHostUsage?: UsageMetrics;
 }
 /**
  * Options for running eval dataset

package/dist/index.d.ts CHANGED Viewed

@@ -2684,6 +2684,56 @@ interface CLIConfig {
      */
     timeout?: number;
 }
+/**
+ * Cookie injected into the browser context before the script runs; only `name` and `value` are required.
+ * Matches the shape expected by Playwright's `BrowserContext.addCookies()`.
+ */
+interface BrowserCookie {
+    name: string;
+    value: string;
+    url?: string; // NOTE(review): Playwright expects either `url` or `domain` + `path`; this type does not enforce it — confirm
+    domain?: string;
+    path?: string;
+    expires?: number; // presumably Unix time in seconds, as in Playwright — confirm
+    httpOnly?: boolean;
+    secure?: boolean;
+    sameSite?: 'Strict' | 'Lax' | 'None';
+    partitionKey?: string;
+}
+/**
+ * Configuration for a browser-based host (the `browser` field of `MCPHostConfig`).
+ *
+ * Uses Playwright to launch a Chromium instance, inject auth state
+ * (presumably `storageState` and `cookies` — confirm), and execute a
+ * user-provided script that drives a web-based MCP host (e.g., claude.ai).
+ */
+interface BrowserConfig {
+    /**
+     * Path to the browser script (resolved relative to cwd). The only required field.
+     * The script must default-export an async function
+     * `(page: Page, scenario: string) => Promise<MCPHostSimulationResult>`.
+     */
+    script: string;
+    /**
+     * Timeout in milliseconds for the browser script.
+     * @default 120000 (2 minutes)
+     */
+    timeout?: number;
+    /**
+     * Whether to launch in headless mode.
+     * @default true
+     */
+    headless?: boolean;
+    /**
+     * Path to a Playwright storage state JSON file (cookies + localStorage).
+     * Resolved relative to cwd.
+     */
+    storageState?: string;
+    /**
+     * Extra cookies to inject into the browser context (presumably on top of `storageState` — confirm).
+     */
+    cookies?: BrowserCookie[];
+}
 /**
  * Configuration for MCP host simulation
  */
@@ -2729,6 +2779,10 @@ interface MCPHostConfig {
      * CLI host configuration (required for 'cli' host type).
      */
     cli?: CLIConfig;
+    /**
+     * Browser host configuration (required for 'browser' host type).
+     */
+    browser?: BrowserConfig;
 }
 /**
  * A tool call made by the LLM
@@ -2770,6 +2824,11 @@ interface MCPHostSimulationResult {
      * (excludes LLM response time)
      */
     mcpDurationMs?: number;
+    /**
+     * Token usage from the LLM during simulation. Populated by SDK-based hosts from the
+     * AI SDK response and by the CLI stream-json parser from the `result` event's `usage`.
+     */
+    usage?: UsageMetrics;
 }
 /**
  * Interface for MCP host simulators.
@@ -3071,15 +3130,15 @@ declare const EvalCaseSchema: z.ZodObject<{
             desktop: "desktop";
         }>>;
         provider: z.ZodOptional<z.ZodEnum<{
-            openai: "openai";
             anthropic: "anthropic";
-            azure: "azure";
+            "vertex-anthropic": "vertex-anthropic";
+            openai: "openai";
             google: "google";
+            azure: "azure";
             mistral: "mistral";
             deepseek: "deepseek";
             openrouter: "openrouter";
             xai: "xai";
-            "vertex-anthropic": "vertex-anthropic";
         }>>;
         apiKeyEnvVar: z.ZodOptional<z.ZodString>;
         model: z.ZodOptional<z.ZodString>;
@@ -3136,11 +3195,11 @@ declare const EvalCaseSchema: z.ZodObject<{
             threshold: z.ZodOptional<z.ZodNumber>;
             reps: z.ZodOptional<z.ZodNumber>;
             provider: z.ZodOptional<z.ZodEnum<{
-                openai: "openai";
                 anthropic: "anthropic";
-                google: "google";
                 "vertex-anthropic": "vertex-anthropic";
                 "anthropic-agent-sdk": "anthropic-agent-sdk";
+                openai: "openai";
+                google: "google";
             }>>;
             model: z.ZodOptional<z.ZodString>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3163,11 +3222,11 @@ declare const EvalCaseSchema: z.ZodObject<{
             threshold: z.ZodOptional<z.ZodNumber>;
             reps: z.ZodOptional<z.ZodNumber>;
             provider: z.ZodOptional<z.ZodEnum<{
-                openai: "openai";
                 anthropic: "anthropic";
-                google: "google";
                 "vertex-anthropic": "vertex-anthropic";
                 "anthropic-agent-sdk": "anthropic-agent-sdk";
+                openai: "openai";
+                google: "google";
             }>>;
             model: z.ZodOptional<z.ZodString>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3223,15 +3282,15 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 desktop: "desktop";
             }>>;
             provider: z.ZodOptional<z.ZodEnum<{
-                openai: "openai";
                 anthropic: "anthropic";
-                azure: "azure";
+                "vertex-anthropic": "vertex-anthropic";
+                openai: "openai";
                 google: "google";
+                azure: "azure";
                 mistral: "mistral";
                 deepseek: "deepseek";
                 openrouter: "openrouter";
                 xai: "xai";
-                "vertex-anthropic": "vertex-anthropic";
             }>>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
             model: z.ZodOptional<z.ZodString>;
@@ -3288,11 +3347,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 threshold: z.ZodOptional<z.ZodNumber>;
                 reps: z.ZodOptional<z.ZodNumber>;
                 provider: z.ZodOptional<z.ZodEnum<{
-                    openai: "openai";
                     anthropic: "anthropic";
-                    google: "google";
                     "vertex-anthropic": "vertex-anthropic";
                     "anthropic-agent-sdk": "anthropic-agent-sdk";
+                    openai: "openai";
+                    google: "google";
                 }>>;
                 model: z.ZodOptional<z.ZodString>;
                 apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3315,11 +3374,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 threshold: z.ZodOptional<z.ZodNumber>;
                 reps: z.ZodOptional<z.ZodNumber>;
                 provider: z.ZodOptional<z.ZodEnum<{
-                    openai: "openai";
                     anthropic: "anthropic";
-                    google: "google";
                     "vertex-anthropic": "vertex-anthropic";
                     "anthropic-agent-sdk": "anthropic-agent-sdk";
+                    openai: "openai";
+                    google: "google";
                 }>>;
                 model: z.ZodOptional<z.ZodString>;
                 apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3562,6 +3621,8 @@ interface IterationResult {
             name: string;
         }>;
     };
+    /** Token usage from mcp_host LLM simulation in this iteration */
+    hostUsage?: UsageMetrics;
 }
 /**
  * Request data captured from the eval case input.
@@ -3710,6 +3771,11 @@ interface EvalCaseResult {
             name: string;
         }>;
     };
+    /**
+     * Aggregate token usage from mcp_host LLM simulation for this case.
+     * Summed across all iterations. Only populated for mcp_host mode cases.
+     */
+    hostUsage?: UsageMetrics;
 }
 /**
  * Aggregated MCP eval run data
@@ -3759,6 +3825,10 @@ interface MCPEvalRunData {
          * Expectation type breakdown
          */
         expectationBreakdown: ExpectationBreakdown;
+        /**
+         * Aggregate token usage from all mcp_host LLM simulations in this run.
+         */
+        totalHostUsage?: UsageMetrics;
     };
     /**
      * All eval results from this run
@@ -3873,6 +3943,10 @@ interface EvalRunnerResult {
      * Experiment tracking metadata captured at run time.
      */
     metadata?: EvalRunMetadata;
+    /**
+     * Aggregate token usage from all mcp_host LLM simulations across all cases.
+     */
+    totalHostUsage?: UsageMetrics;
 }
 /**
  * Options for running eval dataset

package/dist/index.js CHANGED Viewed

@@ -4384,7 +4384,7 @@ function escapeHtml(text) {
 // package.json
 var package_default = {
-  version: "1.0.0"};
+  version: "1.0.1-beta.0"};
 // src/mcp/clientFactory.ts
 function getRetryAfterDelayMs(err) {
@@ -6931,6 +6931,12 @@ function createVercelOrchestrator() {
         });
         const totalDurationMs = Date.now() - llmStart;
         const llmDurationMs = totalDurationMs - mcpDurationMs;
+        const hostUsage = result.usage ? {
+          inputTokens: result.usage.promptTokens ?? 0,
+          outputTokens: result.usage.completionTokens ?? 0,
+          totalCostUsd: 0,
+          durationMs: llmDurationMs
+        } : void 0;
         const conversationHistory = (result.steps ?? []).map((step) => ({
           role: step.toolCalls?.length > 0 ? "tool" : "assistant",
           content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6942,7 +6948,8 @@ function createVercelOrchestrator() {
           scenario,
           llmDurationMs,
           mcpDurationMs,
-          conversationHistory
+          conversationHistory,
+          usage: hostUsage
         };
       } catch (err) {
         return {
@@ -6960,6 +6967,7 @@ function parseStreamJson(stdout) {
   const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
   const toolCalls = [];
   const textParts = [];
+  let usage;
   const conversationHistory = [];
   for (const line of lines) {
     let event;
@@ -6992,16 +7000,28 @@ function parseStreamJson(stdout) {
         }
       }
     }
-    if (event.type === "result" && typeof event.result === "string") {
-      if (textParts.length === 0) {
+    if (event.type === "result") {
+      if (typeof event.result === "string" && textParts.length === 0) {
         textParts.push(event.result);
       }
+      if (event.usage) {
+        usage = {
+          inputTokens: event.usage.input_tokens ?? 0,
+          outputTokens: event.usage.output_tokens ?? 0,
+          totalCostUsd: event.total_cost_usd ?? 0,
+          durationMs: event.duration_ms ?? 0,
+          durationApiMs: event.duration_api_ms,
+          cacheReadInputTokens: event.usage.cache_read_input_tokens,
+          cacheCreationInputTokens: event.usage.cache_creation_input_tokens
+        };
+      }
     }
     if (event.type === "result" && event.is_error === true) {
       return {
         success: false,
         toolCalls,
-        error: typeof event.result === "string" ? event.result : "CLI host reported an error"
+        error: typeof event.result === "string" ? event.result : "CLI host reported an error",
+        usage
       };
     }
   }
@@ -7013,7 +7033,8 @@ function parseStreamJson(stdout) {
     success: true,
     toolCalls,
     response: response || void 0,
-    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
+    conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
+    usage
   };
 }
 function createJsonParser(paths) {
@@ -7278,6 +7299,32 @@ async function execFileNoThrow(file, args) {
   }
 }
+// src/utils/usageUtils.ts
+function optionalSum(a, b) { // Adds two optional numbers; a missing operand counts as 0.
+  if (a === void 0 && b === void 0) return void 0; // both undefined -> result stays undefined instead of becoming 0
+  return (a ?? 0) + (b ?? 0); // `??` maps null as well as undefined to 0; two nulls therefore yield 0, not undefined
+}
+function sumUsage(a, b) { // Field-wise sum of two usage records; either argument may be missing.
+  if (!a && !b) return void 0; // nothing to aggregate
+  if (!a) return b ? { ...b } : void 0; // only b present -> shallow copy; b is always truthy here, so the ternary is purely defensive
+  if (!b) return { ...a }; // only a present -> shallow copy rather than the same object
+  return {
+    inputTokens: a.inputTokens + b.inputTokens, // these four fields are added directly; an undefined value would produce NaN
+    outputTokens: a.outputTokens + b.outputTokens,
+    totalCostUsd: a.totalCostUsd + b.totalCostUsd,
+    durationMs: a.durationMs + b.durationMs,
+    durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs), // remaining fields go through optionalSum: undefined only when both sides are undefined
+    cacheReadInputTokens: optionalSum(
+      a.cacheReadInputTokens,
+      b.cacheReadInputTokens
+    ),
+    cacheCreationInputTokens: optionalSum(
+      a.cacheCreationInputTokens,
+      b.cacheCreationInputTokens
+    )
+  };
+}
 // src/evals/evalRunner.ts
 async function executeToolCall(evalCase, mcp) {
   const mode = evalCase.mode || "direct";
@@ -7523,6 +7570,7 @@ async function runSingleIteration(evalCase, context, options) {
       };
     }
   }
+  const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
   return {
     id: evalCase.id,
     datasetName: options.datasetName ?? "single-case",
@@ -7539,7 +7587,8 @@ async function runSingleIteration(evalCase, context, options) {
     tags: evalCase.tags,
     toolPrecision,
     toolRecall,
-    mcpHostTrace
+    mcpHostTrace,
+    hostUsage
   };
 }
 function isInfrastructureError(err) {
@@ -7575,7 +7624,8 @@ async function runEvalCase(evalCase, context, options = {}) {
         durationMs: result.durationMs,
         error: result.error,
         isInfrastructureError: infraError,
-        mcpHostTrace: result.mcpHostTrace
+        mcpHostTrace: result.mcpHostTrace,
+        hostUsage: result.hostUsage
       });
     } catch (err) {
       const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7608,6 +7658,10 @@ async function runEvalCase(evalCase, context, options = {}) {
     durationMs: 0,
     tags: evalCase.tags
   };
+  const totalHostUsage = iterationResults.reduce(
+    (acc, r) => sumUsage(acc, r.hostUsage),
+    void 0
+  );
   return {
     ...baseResult,
     pass: assertionPassRate >= threshold,
@@ -7616,7 +7670,8 @@ async function runEvalCase(evalCase, context, options = {}) {
     infrastructureErrorRate,
     iterationResults,
     infrastructureErrorCount: infraErrors.length,
-    durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
+    durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
+    hostUsage: totalHostUsage
   };
 }
 function wilsonCI(k, n) {
@@ -7726,13 +7781,18 @@ async function runEvalDataset(options, context) {
     ...mcpHostModel !== void 0 && { mcpHostModel },
     ...judgeModel !== void 0 && { judgeModel }
   };
+  const runHostUsage = caseResults.reduce(
+    (acc, r) => sumUsage(acc, r.hostUsage),
+    void 0
+  );
   const result = {
     total,
     passed,
     failed: total - passed,
     caseResults,
     durationMs: Date.now() - startTime,
-    metadata
+    metadata,
+    totalHostUsage: runHostUsage
   };
   if (baselineResultsFrom) {
     try {