npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0-beta.5 → 1.0.0-beta.6 - Mend

@gleanwork/mcp-server-tester 1.0.0-beta.5 → 1.0.0-beta.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (11)

package/dist/cli/index.js +1 -1
package/dist/fixtures/mcp.js +1 -1
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +18 -6
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +43 -2
package/dist/index.d.ts +43 -2
package/dist/index.js +18 -6
package/dist/index.js.map +1 -1
package/dist/reporters/ui-dist/app.js +5 -5
package/package.json +1 -1

package/dist/index.d.cts CHANGED Viewed

@@ -3252,6 +3252,21 @@ interface IterationResult {
     error?: string;
     /** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
     isInfrastructureError?: boolean;
+    /**
+     * Ordered trace of tool calls made by the LLM during this iteration (mcp_host mode only).
+     * Captures what was actually called so you can distinguish "LLM didn't call the tool"
+     * from "LLM called the wrong tool" from "tool was called but assertion failed".
+     */
+    mcpHostTrace?: {
+        calls: Array<{
+            name: string;
+            arguments: Record<string, unknown>;
+            status: 'expected' | 'unexpected';
+        }>;
+        missed: Array<{
+            name: string;
+        }>;
+    };
 }
 /**
  * Result of a single eval case
@@ -3619,6 +3634,16 @@ interface EvalRunnerOptions {
      * @example '.mcp-test-results/baseline.json'
      */
     saveResultsTo?: string;
+    /**
+     * When true (default), strips the `response` field from each case result
+     * before saving the baseline file. Keeps baseline files small and git-friendly —
+     * the full tool response is not needed for pass/fail regression detection.
+     *
+     * Set to false to preserve complete responses in the saved file.
+     *
+     * @default true
+     */
+    omitResponsesFromBaseline?: boolean;
     /**
      * If set, loads this file as the baseline and computes delta metrics vs the current run.
      * Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
@@ -3676,13 +3701,29 @@ interface EvalCaseOptions {
 declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
 declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
+/**
+ * Options for saveBaseline
+ */
+interface SaveBaselineOptions {
+    /**
+     * When true (default), strips the `response` field from each case result
+     * before saving. Keeps baseline files small and git-friendly — the baseline
+     * is a pass/fail record and the full response is not needed for comparison.
+     *
+     * Set to false to preserve the complete response in the saved file.
+     *
+     * @default true
+     */
+    omitResponses?: boolean;
+}
 /**
  * Saves eval results to a JSON file for use as a baseline in future runs.
  *
  * @param result - The eval run result to save
  * @param filePath - Path to write the JSON file (parent dirs created automatically)
+ * @param options - Save options
  */
-declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
+declare function saveBaseline(result: EvalRunnerResult, filePath: string, options?: SaveBaselineOptions): Promise<void>;
 /**
  * Loads a previously saved baseline from a JSON file.
  *
@@ -4025,4 +4066,4 @@ interface MCPEvalReporterConfig {
     includeAutoTracking?: boolean;
 }
-export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
+export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };

package/dist/index.d.ts CHANGED Viewed

@@ -3252,6 +3252,21 @@ interface IterationResult {
     error?: string;
     /** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
     isInfrastructureError?: boolean;
+    /**
+     * Ordered trace of tool calls made by the LLM during this iteration (mcp_host mode only).
+     * Captures what was actually called so you can distinguish "LLM didn't call the tool"
+     * from "LLM called the wrong tool" from "tool was called but assertion failed".
+     */
+    mcpHostTrace?: {
+        calls: Array<{
+            name: string;
+            arguments: Record<string, unknown>;
+            status: 'expected' | 'unexpected';
+        }>;
+        missed: Array<{
+            name: string;
+        }>;
+    };
 }
 /**
  * Result of a single eval case
@@ -3619,6 +3634,16 @@ interface EvalRunnerOptions {
      * @example '.mcp-test-results/baseline.json'
      */
     saveResultsTo?: string;
+    /**
+     * When true (default), strips the `response` field from each case result
+     * before saving the baseline file. Keeps baseline files small and git-friendly —
+     * the full tool response is not needed for pass/fail regression detection.
+     *
+     * Set to false to preserve complete responses in the saved file.
+     *
+     * @default true
+     */
+    omitResponsesFromBaseline?: boolean;
     /**
      * If set, loads this file as the baseline and computes delta metrics vs the current run.
      * Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
@@ -3676,13 +3701,29 @@ interface EvalCaseOptions {
 declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
 declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
+/**
+ * Options for saveBaseline
+ */
+interface SaveBaselineOptions {
+    /**
+     * When true (default), strips the `response` field from each case result
+     * before saving. Keeps baseline files small and git-friendly — the baseline
+     * is a pass/fail record and the full response is not needed for comparison.
+     *
+     * Set to false to preserve the complete response in the saved file.
+     *
+     * @default true
+     */
+    omitResponses?: boolean;
+}
 /**
  * Saves eval results to a JSON file for use as a baseline in future runs.
  *
  * @param result - The eval run result to save
  * @param filePath - Path to write the JSON file (parent dirs created automatically)
+ * @param options - Save options
  */
-declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
+declare function saveBaseline(result: EvalRunnerResult, filePath: string, options?: SaveBaselineOptions): Promise<void>;
 /**
  * Loads a previously saved baseline from a JSON file.
  *
@@ -4025,4 +4066,4 @@ interface MCPEvalReporterConfig {
     includeAutoTracking?: boolean;
 }
-export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
+export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };

package/dist/index.js CHANGED Viewed

@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
 // package.json
 var package_default = {
-  version: "1.0.0-beta.5"};
+  version: "1.0.0-beta.6"};
 // src/mcp/clientFactory.ts
 function getRetryAfterDelayMs(err) {
@@ -6642,9 +6642,16 @@ function getMissingDependencyMessage(provider) {
   const pkg = packageMap[provider];
   return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
 }
-async function saveBaseline(result, filePath) {
+async function saveBaseline(result, filePath, options = {}) {
+  const { omitResponses = true } = options;
+  const toSave = omitResponses ? {
+    ...result,
+    caseResults: result.caseResults.map(
+      ({ response: _response, ...rest }) => rest
+    )
+  } : result;
   await mkdir(dirname(filePath), { recursive: true });
-  await writeFile(filePath, JSON.stringify(result, null, 2), "utf8");
+  await writeFile(filePath, JSON.stringify(toSave, null, 2), "utf8");
 }
 async function loadBaseline(filePath) {
   const raw = await readFile(filePath, "utf8");
@@ -6912,7 +6919,8 @@ function isInfrastructureError(err) {
   } else {
     return false;
   }
-  return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
+  return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
+  msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
 }
 async function runEvalCase(evalCase, context, options = {}) {
   const iterations = evalCase.iterations ?? 1;
@@ -6930,7 +6938,8 @@ async function runEvalCase(evalCase, context, options = {}) {
         pass: result.pass,
         durationMs: result.durationMs,
         error: result.error,
-        isInfrastructureError: infraError
+        isInfrastructureError: infraError,
+        mcpHostTrace: result.mcpHostTrace
       });
     } catch (err) {
       const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7014,6 +7023,7 @@ async function runEvalDataset(options, context) {
     onCaseComplete,
     filterTags,
     saveResultsTo,
+    omitResponsesFromBaseline = true,
     baselineResultsFrom,
     mcpHostModel,
     judgeModel
@@ -7128,7 +7138,9 @@ async function runEvalDataset(options, context) {
     result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
   }
   if (saveResultsTo) {
-    await saveBaseline(result, saveResultsTo);
+    await saveBaseline(result, saveResultsTo, {
+      omitResponses: omitResponsesFromBaseline
+    });
   }
   if (context.testInfo) {
     await context.testInfo.attach("mcp-test-results", {