@gleanwork/mcp-server-tester 1.0.0-beta.5 → 1.0.0-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.js +1 -1
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +18 -6
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +43 -2
- package/dist/index.d.ts +43 -2
- package/dist/index.js +18 -6
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +5 -5
- package/package.json +2 -2
package/dist/index.d.cts
CHANGED
|
@@ -3252,6 +3252,21 @@ interface IterationResult {
|
|
|
3252
3252
|
error?: string;
|
|
3253
3253
|
/** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
|
|
3254
3254
|
isInfrastructureError?: boolean;
|
|
3255
|
+
/**
|
|
3256
|
+
* Ordered trace of tool calls made by the LLM during this iteration (mcp_host mode only).
|
|
3257
|
+
* Captures what was actually called so you can distinguish "LLM didn't call the tool"
|
|
3258
|
+
* from "LLM called the wrong tool" from "tool was called but assertion failed".
|
|
3259
|
+
*/
|
|
3260
|
+
mcpHostTrace?: {
|
|
3261
|
+
calls: Array<{
|
|
3262
|
+
name: string;
|
|
3263
|
+
arguments: Record<string, unknown>;
|
|
3264
|
+
status: 'expected' | 'unexpected';
|
|
3265
|
+
}>;
|
|
3266
|
+
missed: Array<{
|
|
3267
|
+
name: string;
|
|
3268
|
+
}>;
|
|
3269
|
+
};
|
|
3255
3270
|
}
|
|
3256
3271
|
/**
|
|
3257
3272
|
* Result of a single eval case
|
|
@@ -3619,6 +3634,16 @@ interface EvalRunnerOptions {
|
|
|
3619
3634
|
* @example '.mcp-test-results/baseline.json'
|
|
3620
3635
|
*/
|
|
3621
3636
|
saveResultsTo?: string;
|
|
3637
|
+
/**
|
|
3638
|
+
* When true (default), strips the `response` field from each case result
|
|
3639
|
+
* before saving the baseline file. Keeps baseline files small and git-friendly —
|
|
3640
|
+
* the full tool response is not needed for pass/fail regression detection.
|
|
3641
|
+
*
|
|
3642
|
+
* Set to false to preserve complete responses in the saved file.
|
|
3643
|
+
*
|
|
3644
|
+
* @default true
|
|
3645
|
+
*/
|
|
3646
|
+
omitResponsesFromBaseline?: boolean;
|
|
3622
3647
|
/**
|
|
3623
3648
|
* If set, loads this file as the baseline and computes delta metrics vs the current run.
|
|
3624
3649
|
* Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
|
|
@@ -3676,13 +3701,29 @@ interface EvalCaseOptions {
|
|
|
3676
3701
|
declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
|
|
3677
3702
|
declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
|
|
3678
3703
|
|
|
3704
|
+
/**
|
|
3705
|
+
* Options for saveBaseline
|
|
3706
|
+
*/
|
|
3707
|
+
interface SaveBaselineOptions {
|
|
3708
|
+
/**
|
|
3709
|
+
* When true (default), strips the `response` field from each case result
|
|
3710
|
+
* before saving. Keeps baseline files small and git-friendly — the baseline
|
|
3711
|
+
* is a pass/fail record and the full response is not needed for comparison.
|
|
3712
|
+
*
|
|
3713
|
+
* Set to false to preserve the complete response in the saved file.
|
|
3714
|
+
*
|
|
3715
|
+
* @default true
|
|
3716
|
+
*/
|
|
3717
|
+
omitResponses?: boolean;
|
|
3718
|
+
}
|
|
3679
3719
|
/**
|
|
3680
3720
|
* Saves eval results to a JSON file for use as a baseline in future runs.
|
|
3681
3721
|
*
|
|
3682
3722
|
* @param result - The eval run result to save
|
|
3683
3723
|
* @param filePath - Path to write the JSON file (parent dirs created automatically)
|
|
3724
|
+
* @param options - Save options
|
|
3684
3725
|
*/
|
|
3685
|
-
declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
|
|
3726
|
+
declare function saveBaseline(result: EvalRunnerResult, filePath: string, options?: SaveBaselineOptions): Promise<void>;
|
|
3686
3727
|
/**
|
|
3687
3728
|
* Loads a previously saved baseline from a JSON file.
|
|
3688
3729
|
*
|
|
@@ -4025,4 +4066,4 @@ interface MCPEvalReporterConfig {
|
|
|
4025
4066
|
includeAutoTracking?: boolean;
|
|
4026
4067
|
}
|
|
4027
4068
|
|
|
4028
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type 
StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
4069
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
package/dist/index.d.ts
CHANGED
|
@@ -3252,6 +3252,21 @@ interface IterationResult {
|
|
|
3252
3252
|
error?: string;
|
|
3253
3253
|
/** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
|
|
3254
3254
|
isInfrastructureError?: boolean;
|
|
3255
|
+
/**
|
|
3256
|
+
* Ordered trace of tool calls made by the LLM during this iteration (mcp_host mode only).
|
|
3257
|
+
* Captures what was actually called so you can distinguish "LLM didn't call the tool"
|
|
3258
|
+
* from "LLM called the wrong tool" from "tool was called but assertion failed".
|
|
3259
|
+
*/
|
|
3260
|
+
mcpHostTrace?: {
|
|
3261
|
+
calls: Array<{
|
|
3262
|
+
name: string;
|
|
3263
|
+
arguments: Record<string, unknown>;
|
|
3264
|
+
status: 'expected' | 'unexpected';
|
|
3265
|
+
}>;
|
|
3266
|
+
missed: Array<{
|
|
3267
|
+
name: string;
|
|
3268
|
+
}>;
|
|
3269
|
+
};
|
|
3255
3270
|
}
|
|
3256
3271
|
/**
|
|
3257
3272
|
* Result of a single eval case
|
|
@@ -3619,6 +3634,16 @@ interface EvalRunnerOptions {
|
|
|
3619
3634
|
* @example '.mcp-test-results/baseline.json'
|
|
3620
3635
|
*/
|
|
3621
3636
|
saveResultsTo?: string;
|
|
3637
|
+
/**
|
|
3638
|
+
* When true (default), strips the `response` field from each case result
|
|
3639
|
+
* before saving the baseline file. Keeps baseline files small and git-friendly —
|
|
3640
|
+
* the full tool response is not needed for pass/fail regression detection.
|
|
3641
|
+
*
|
|
3642
|
+
* Set to false to preserve complete responses in the saved file.
|
|
3643
|
+
*
|
|
3644
|
+
* @default true
|
|
3645
|
+
*/
|
|
3646
|
+
omitResponsesFromBaseline?: boolean;
|
|
3622
3647
|
/**
|
|
3623
3648
|
* If set, loads this file as the baseline and computes delta metrics vs the current run.
|
|
3624
3649
|
* Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
|
|
@@ -3676,13 +3701,29 @@ interface EvalCaseOptions {
|
|
|
3676
3701
|
declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
|
|
3677
3702
|
declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
|
|
3678
3703
|
|
|
3704
|
+
/**
|
|
3705
|
+
* Options for saveBaseline
|
|
3706
|
+
*/
|
|
3707
|
+
interface SaveBaselineOptions {
|
|
3708
|
+
/**
|
|
3709
|
+
* When true (default), strips the `response` field from each case result
|
|
3710
|
+
* before saving. Keeps baseline files small and git-friendly — the baseline
|
|
3711
|
+
* is a pass/fail record and the full response is not needed for comparison.
|
|
3712
|
+
*
|
|
3713
|
+
* Set to false to preserve the complete response in the saved file.
|
|
3714
|
+
*
|
|
3715
|
+
* @default true
|
|
3716
|
+
*/
|
|
3717
|
+
omitResponses?: boolean;
|
|
3718
|
+
}
|
|
3679
3719
|
/**
|
|
3680
3720
|
* Saves eval results to a JSON file for use as a baseline in future runs.
|
|
3681
3721
|
*
|
|
3682
3722
|
* @param result - The eval run result to save
|
|
3683
3723
|
* @param filePath - Path to write the JSON file (parent dirs created automatically)
|
|
3724
|
+
* @param options - Save options
|
|
3684
3725
|
*/
|
|
3685
|
-
declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
|
|
3726
|
+
declare function saveBaseline(result: EvalRunnerResult, filePath: string, options?: SaveBaselineOptions): Promise<void>;
|
|
3686
3727
|
/**
|
|
3687
3728
|
* Loads a previously saved baseline from a JSON file.
|
|
3688
3729
|
*
|
|
@@ -4025,4 +4066,4 @@ interface MCPEvalReporterConfig {
|
|
|
4025
4066
|
includeAutoTracking?: boolean;
|
|
4026
4067
|
}
|
|
4027
4068
|
|
|
4028
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type 
StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
4069
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
package/dist/index.js
CHANGED
|
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
|
|
|
4380
4380
|
|
|
4381
4381
|
// package.json
|
|
4382
4382
|
var package_default = {
|
|
4383
|
-
version: "1.0.0-beta.5"};
|
|
4383
|
+
version: "1.0.0-beta.7"};
|
|
4384
4384
|
|
|
4385
4385
|
// src/mcp/clientFactory.ts
|
|
4386
4386
|
function getRetryAfterDelayMs(err) {
|
|
@@ -6642,9 +6642,16 @@ function getMissingDependencyMessage(provider) {
|
|
|
6642
6642
|
const pkg = packageMap[provider];
|
|
6643
6643
|
return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
|
|
6644
6644
|
}
|
|
6645
|
-
async function saveBaseline(result, filePath) {
|
|
6645
|
+
async function saveBaseline(result, filePath, options = {}) {
|
|
6646
|
+
const { omitResponses = true } = options;
|
|
6647
|
+
const toSave = omitResponses ? {
|
|
6648
|
+
...result,
|
|
6649
|
+
caseResults: result.caseResults.map(
|
|
6650
|
+
({ response: _response, ...rest }) => rest
|
|
6651
|
+
)
|
|
6652
|
+
} : result;
|
|
6646
6653
|
await mkdir(dirname(filePath), { recursive: true });
|
|
6647
|
-
await writeFile(filePath, JSON.stringify(result, null, 2), "utf8");
|
|
6654
|
+
await writeFile(filePath, JSON.stringify(toSave, null, 2), "utf8");
|
|
6648
6655
|
}
|
|
6649
6656
|
async function loadBaseline(filePath) {
|
|
6650
6657
|
const raw = await readFile(filePath, "utf8");
|
|
@@ -6912,7 +6919,8 @@ function isInfrastructureError(err) {
|
|
|
6912
6919
|
} else {
|
|
6913
6920
|
return false;
|
|
6914
6921
|
}
|
|
6915
|
-
return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
6922
|
+
return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
|
|
6923
|
+
msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
6916
6924
|
}
|
|
6917
6925
|
async function runEvalCase(evalCase, context, options = {}) {
|
|
6918
6926
|
const iterations = evalCase.iterations ?? 1;
|
|
@@ -6930,7 +6938,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6930
6938
|
pass: result.pass,
|
|
6931
6939
|
durationMs: result.durationMs,
|
|
6932
6940
|
error: result.error,
|
|
6933
|
-
isInfrastructureError: infraError
|
|
6941
|
+
isInfrastructureError: infraError,
|
|
6942
|
+
mcpHostTrace: result.mcpHostTrace
|
|
6934
6943
|
});
|
|
6935
6944
|
} catch (err) {
|
|
6936
6945
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -7014,6 +7023,7 @@ async function runEvalDataset(options, context) {
|
|
|
7014
7023
|
onCaseComplete,
|
|
7015
7024
|
filterTags,
|
|
7016
7025
|
saveResultsTo,
|
|
7026
|
+
omitResponsesFromBaseline = true,
|
|
7017
7027
|
baselineResultsFrom,
|
|
7018
7028
|
mcpHostModel,
|
|
7019
7029
|
judgeModel
|
|
@@ -7128,7 +7138,9 @@ async function runEvalDataset(options, context) {
|
|
|
7128
7138
|
result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
|
|
7129
7139
|
}
|
|
7130
7140
|
if (saveResultsTo) {
|
|
7131
|
-
await saveBaseline(result, saveResultsTo);
|
|
7141
|
+
await saveBaseline(result, saveResultsTo, {
|
|
7142
|
+
omitResponses: omitResponsesFromBaseline
|
|
7143
|
+
});
|
|
7132
7144
|
}
|
|
7133
7145
|
if (context.testInfo) {
|
|
7134
7146
|
await context.testInfo.attach("mcp-test-results", {
|