@gleanwork/mcp-server-tester 1.0.0-beta.5 → 1.0.0-beta.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -3252,6 +3252,21 @@ interface IterationResult {
3252
3252
  error?: string;
3253
3253
  /** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
3254
3254
  isInfrastructureError?: boolean;
3255
+ /**
3256
+ * Ordered trace of tool calls made by the LLM during this iteration (mcp_host mode only).
3257
+ * Captures what was actually called so you can distinguish "LLM didn't call the tool"
3258
+ * from "LLM called the wrong tool" from "tool was called but assertion failed".
3259
+ */
3260
+ mcpHostTrace?: {
3261
+ calls: Array<{
3262
+ name: string;
3263
+ arguments: Record<string, unknown>;
3264
+ status: 'expected' | 'unexpected';
3265
+ }>;
3266
+ missed: Array<{
3267
+ name: string;
3268
+ }>;
3269
+ };
3255
3270
  }
3256
3271
  /**
3257
3272
  * Result of a single eval case
@@ -3619,6 +3634,16 @@ interface EvalRunnerOptions {
3619
3634
  * @example '.mcp-test-results/baseline.json'
3620
3635
  */
3621
3636
  saveResultsTo?: string;
3637
+ /**
3638
+ * When true (default), strips the `response` field from each case result
3639
+ * before saving the baseline file. Keeps baseline files small and git-friendly —
3640
+ * the full tool response is not needed for pass/fail regression detection.
3641
+ *
3642
+ * Set to false to preserve complete responses in the saved file.
3643
+ *
3644
+ * @default true
3645
+ */
3646
+ omitResponsesFromBaseline?: boolean;
3622
3647
  /**
3623
3648
  * If set, loads this file as the baseline and computes delta metrics vs the current run.
3624
3649
  * Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
@@ -3676,13 +3701,29 @@ interface EvalCaseOptions {
3676
3701
  declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
3677
3702
  declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
3678
3703
 
3704
+ /**
3705
+ * Options for saveBaseline
3706
+ */
3707
+ interface SaveBaselineOptions {
3708
+ /**
3709
+ * When true (default), strips the `response` field from each case result
3710
+ * before saving. Keeps baseline files small and git-friendly — the baseline
3711
+ * is a pass/fail record and the full response is not needed for comparison.
3712
+ *
3713
+ * Set to false to preserve the complete response in the saved file.
3714
+ *
3715
+ * @default true
3716
+ */
3717
+ omitResponses?: boolean;
3718
+ }
3679
3719
  /**
3680
3720
  * Saves eval results to a JSON file for use as a baseline in future runs.
3681
3721
  *
3682
3722
  * @param result - The eval run result to save
3683
3723
  * @param filePath - Path to write the JSON file (parent dirs created automatically)
3724
+ * @param options - Save options
3684
3725
  */
3685
- declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
3726
+ declare function saveBaseline(result: EvalRunnerResult, filePath: string, options?: SaveBaselineOptions): Promise<void>;
3686
3727
  /**
3687
3728
  * Loads a previously saved baseline from a JSON file.
3688
3729
  *
@@ -4025,4 +4066,4 @@ interface MCPEvalReporterConfig {
4025
4066
  includeAutoTracking?: boolean;
4026
4067
  }
4027
4068
 
4028
- export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
4069
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
package/dist/index.d.ts CHANGED
@@ -3252,6 +3252,21 @@ interface IterationResult {
3252
3252
  error?: string;
3253
3253
  /** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
3254
3254
  isInfrastructureError?: boolean;
3255
+ /**
3256
+ * Ordered trace of tool calls made by the LLM during this iteration (mcp_host mode only).
3257
+ * Captures what was actually called so you can distinguish "LLM didn't call the tool"
3258
+ * from "LLM called the wrong tool" from "tool was called but assertion failed".
3259
+ */
3260
+ mcpHostTrace?: {
3261
+ calls: Array<{
3262
+ name: string;
3263
+ arguments: Record<string, unknown>;
3264
+ status: 'expected' | 'unexpected';
3265
+ }>;
3266
+ missed: Array<{
3267
+ name: string;
3268
+ }>;
3269
+ };
3255
3270
  }
3256
3271
  /**
3257
3272
  * Result of a single eval case
@@ -3619,6 +3634,16 @@ interface EvalRunnerOptions {
3619
3634
  * @example '.mcp-test-results/baseline.json'
3620
3635
  */
3621
3636
  saveResultsTo?: string;
3637
+ /**
3638
+ * When true (default), strips the `response` field from each case result
3639
+ * before saving the baseline file. Keeps baseline files small and git-friendly —
3640
+ * the full tool response is not needed for pass/fail regression detection.
3641
+ *
3642
+ * Set to false to preserve complete responses in the saved file.
3643
+ *
3644
+ * @default true
3645
+ */
3646
+ omitResponsesFromBaseline?: boolean;
3622
3647
  /**
3623
3648
  * If set, loads this file as the baseline and computes delta metrics vs the current run.
3624
3649
  * Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
@@ -3676,13 +3701,29 @@ interface EvalCaseOptions {
3676
3701
  declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
3677
3702
  declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
3678
3703
 
3704
+ /**
3705
+ * Options for saveBaseline
3706
+ */
3707
+ interface SaveBaselineOptions {
3708
+ /**
3709
+ * When true (default), strips the `response` field from each case result
3710
+ * before saving. Keeps baseline files small and git-friendly — the baseline
3711
+ * is a pass/fail record and the full response is not needed for comparison.
3712
+ *
3713
+ * Set to false to preserve the complete response in the saved file.
3714
+ *
3715
+ * @default true
3716
+ */
3717
+ omitResponses?: boolean;
3718
+ }
3679
3719
  /**
3680
3720
  * Saves eval results to a JSON file for use as a baseline in future runs.
3681
3721
  *
3682
3722
  * @param result - The eval run result to save
3683
3723
  * @param filePath - Path to write the JSON file (parent dirs created automatically)
3724
+ * @param options - Save options
3684
3725
  */
3685
- declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
3726
+ declare function saveBaseline(result: EvalRunnerResult, filePath: string, options?: SaveBaselineOptions): Promise<void>;
3686
3727
  /**
3687
3728
  * Loads a previously saved baseline from a JSON file.
3688
3729
  *
@@ -4025,4 +4066,4 @@ interface MCPEvalReporterConfig {
4025
4066
  includeAutoTracking?: boolean;
4026
4067
  }
4027
4068
 
4028
- export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
4069
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
package/dist/index.js CHANGED
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
4380
4380
 
4381
4381
  // package.json
4382
4382
  var package_default = {
4383
- version: "1.0.0-beta.5"};
4383
+ version: "1.0.0-beta.6"};
4384
4384
 
4385
4385
  // src/mcp/clientFactory.ts
4386
4386
  function getRetryAfterDelayMs(err) {
@@ -6642,9 +6642,16 @@ function getMissingDependencyMessage(provider) {
6642
6642
  const pkg = packageMap[provider];
6643
6643
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
6644
6644
  }
6645
- async function saveBaseline(result, filePath) {
6645
+ async function saveBaseline(result, filePath, options = {}) {
6646
+ const { omitResponses = true } = options;
6647
+ const toSave = omitResponses ? {
6648
+ ...result,
6649
+ caseResults: result.caseResults.map(
6650
+ ({ response: _response, ...rest }) => rest
6651
+ )
6652
+ } : result;
6646
6653
  await mkdir(dirname(filePath), { recursive: true });
6647
- await writeFile(filePath, JSON.stringify(result, null, 2), "utf8");
6654
+ await writeFile(filePath, JSON.stringify(toSave, null, 2), "utf8");
6648
6655
  }
6649
6656
  async function loadBaseline(filePath) {
6650
6657
  const raw = await readFile(filePath, "utf8");
@@ -6912,7 +6919,8 @@ function isInfrastructureError(err) {
6912
6919
  } else {
6913
6920
  return false;
6914
6921
  }
6915
- return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
6922
+ return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow LLM couldn't run, not a tool discoverability failure
6923
+ msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
6916
6924
  }
6917
6925
  async function runEvalCase(evalCase, context, options = {}) {
6918
6926
  const iterations = evalCase.iterations ?? 1;
@@ -6930,7 +6938,8 @@ async function runEvalCase(evalCase, context, options = {}) {
6930
6938
  pass: result.pass,
6931
6939
  durationMs: result.durationMs,
6932
6940
  error: result.error,
6933
- isInfrastructureError: infraError
6941
+ isInfrastructureError: infraError,
6942
+ mcpHostTrace: result.mcpHostTrace
6934
6943
  });
6935
6944
  } catch (err) {
6936
6945
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7014,6 +7023,7 @@ async function runEvalDataset(options, context) {
7014
7023
  onCaseComplete,
7015
7024
  filterTags,
7016
7025
  saveResultsTo,
7026
+ omitResponsesFromBaseline = true,
7017
7027
  baselineResultsFrom,
7018
7028
  mcpHostModel,
7019
7029
  judgeModel
@@ -7128,7 +7138,9 @@ async function runEvalDataset(options, context) {
7128
7138
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
7129
7139
  }
7130
7140
  if (saveResultsTo) {
7131
- await saveBaseline(result, saveResultsTo);
7141
+ await saveBaseline(result, saveResultsTo, {
7142
+ omitResponses: omitResponsesFromBaseline
7143
+ });
7132
7144
  }
7133
7145
  if (context.testInfo) {
7134
7146
  await context.testInfo.attach("mcp-test-results", {