npm - @gleanwork/mcp-server-tester - Versions diffs - 0.12.0 → 1.0.0-beta.1 - Mend

@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (24)

package/README.md +120 -337
package/dist/cli/index.js +468 -176
package/dist/fixtures/mcp.d.ts +121 -44
package/dist/fixtures/mcp.js +988 -248
package/dist/fixtures/mcp.js.map +1 -1
package/dist/fixtures/mcpAuth.js +6 -2
package/dist/fixtures/mcpAuth.js.map +1 -1
package/dist/index.cjs +5034 -1284
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +1697 -575
package/dist/index.d.ts +1697 -575
package/dist/index.js +5020 -1280
package/dist/index.js.map +1 -1
package/dist/reporters/mcpReporter.cjs +35 -16
package/dist/reporters/mcpReporter.cjs.map +1 -1
package/dist/reporters/mcpReporter.d.cts +8 -3
package/dist/reporters/mcpReporter.d.ts +8 -3
package/dist/reporters/mcpReporter.js +36 -17
package/dist/reporters/mcpReporter.js.map +1 -1
package/dist/reporters/ui-dist/app.js +5 -5
package/dist/reporters/ui-dist/styles.css +1 -1
package/package.json +64 -8
package/src/reporters/ui-dist/app.js +5 -5
package/src/reporters/ui-dist/styles.css +1 -1

package/dist/fixtures/mcp.d.ts (changed)

@@ -55,10 +55,33 @@ interface PatternValidatorOptions {
     /** Whether to perform case-sensitive matching (default: true) */
     caseSensitive?: boolean;
 }
+/**
+ * Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
+ * Pass these values in the sanitizers array to replace non-deterministic
+ * values with stable placeholders before snapshot comparison.
+ *
+ * @example
+ * expect(result).toMatchToolSnapshot('my-snapshot', [
+ *   SnapshotSanitizers.UUID,
+ *   SnapshotSanitizers.ISO_DATE,
+ * ]);
+ */
+declare const SnapshotSanitizers: {
+    /** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
+    readonly TIMESTAMP: "timestamp";
+    /** Replaces UUID v1-v5 strings with a stable placeholder */
+    readonly UUID: "uuid";
+    /** Replaces ISO 8601 date/datetime strings with a stable placeholder */
+    readonly ISO_DATE: "iso-date";
+    /** Replaces MongoDB ObjectId strings with a stable placeholder */
+    readonly OBJECT_ID: "objectId";
+    /** Replaces JWT tokens with a stable placeholder */
+    readonly JWT: "jwt";
+};
 /**
  * Built-in sanitizer names for common variable patterns
  */
-type BuiltInSanitizer = 'timestamp' | 'uuid' | 'iso-date' | 'objectId' | 'jwt';
+type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
 /**
  * Custom regex-based sanitizer
  */
@@ -171,48 +194,40 @@ declare function toBeToolError(this: {
 };
 /**
- * Supported LLM provider types
+ * Built-in judge rubrics matching Glean EvalV2's named judge types.
+ * Use these for consistent, standardized evaluations across teams.
+ *
+ * All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
  */
-type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
+type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
+/** A rubric specification: either a built-in named rubric or custom text. */
+type RubricSpec = BuiltInRubric | {
+    text: string;
+};
+/** Valid LLM judge provider kinds. */
+type ProviderKind = 'anthropic' | 'openai' | 'google';
 /**
- * Configuration for an LLM judge
+ * Tool call validators for llm_host simulation results.
+ *
+ * These validators extract the tool call trace from an LLMHostSimulationResult
+ * and apply assertions against expected call lists and counts.
  */
-interface JudgeConfig {
-    /**
-     * LLM provider to use
-     * @default 'claude'
-     */
-    provider?: ProviderKind;
-    /**
-     * Environment variable name containing the API key
-     * @default 'ANTHROPIC_API_KEY'
-     */
-    apiKeyEnvVar?: string;
-    /**
-     * Model to use for judging
-     * @default 'claude-sonnet-4-20250514'
-     */
-    model?: string;
-    /**
-     * Maximum tokens for response
-     * @default 1000
-     */
-    maxTokens?: number;
-    /**
-     * Temperature (0-1, lower is more deterministic)
-     * @default 0.0
-     */
-    temperature?: number;
-    /**
-     * Maximum budget in USD for the judge evaluation
-     * @default 0.10
-     */
-    maxBudgetUsd?: number;
-    /**
-     * Maximum size (in bytes) for tool output before failing the test
-     * When set, the judge will fail if the candidate response exceeds this size
-     */
-    maxToolOutputSize?: number;
+interface ToolCallExpectation {
+    calls: Array<{
+        name: string;
+        arguments?: Record<string, unknown>;
+        required?: boolean;
+    }>;
+    order?: 'strict' | 'any';
+    exclusive?: boolean;
+}
+interface ToolCallCountOptions {
+    min?: number;
+    max?: number;
+    exact?: number;
 }
 /**
@@ -229,8 +244,12 @@ interface JudgeMatcherOptions {
     reference?: unknown;
     /** Score threshold for passing (default: 0.7) */
     passingThreshold?: number;
-    /** Judge configuration override */
-    judgeConfig?: JudgeConfig;
+    /** Number of judge evaluations (scores averaged) */
+    reps?: number;
+    /** Override the judge provider */
+    provider?: ProviderKind;
+    /** Override the judge model */
+    model?: string;
 }
 /**
  * Declaration merging for Playwright matchers
@@ -335,7 +354,7 @@ declare global {
              * });
              * ```
              */
-            toPassToolJudge(rubric: string, options?: JudgeMatcherOptions): Promise<R>;
+            toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
             /**
              * Validates that a response meets size constraints
              *
@@ -380,6 +399,28 @@ declare global {
              * ```
              */
             toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
+            /**
+             * Validates which tools the LLM called during an llm_host simulation.
+             *
+             * @example
+             * ```typescript
+             * expect(simulationResult).toHaveToolCalls({
+             *   calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
+             *   order: 'any',
+             * });
+             * ```
+             */
+            toHaveToolCalls(expectation: ToolCallExpectation): R;
+            /**
+             * Validates the number of tool calls made during an llm_host simulation.
+             *
+             * @example
+             * ```typescript
+             * expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
+             * expect(simulationResult).toHaveToolCallCount({ exact: 2 });
+             * ```
+             */
+            toHaveToolCallCount(options: ToolCallCountOptions): R;
         }
     }
 }
@@ -401,6 +442,8 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
  * toPassToolJudge Matcher
  *
  * Validates that a response passes LLM-as-judge evaluation.
+ * Delegates evaluation logic to validateJudge() for consistency
+ * with the validator/matcher duality pattern.
  */
 /**
@@ -410,7 +453,7 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
  */
 declare function toPassToolJudge(this: {
     isNot: boolean;
-}, received: unknown, rubric: string, options?: JudgeMatcherOptions): Promise<{
+}, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
     pass: boolean;
     message: () => string;
 }>;
@@ -477,6 +520,38 @@ declare function toSatisfyToolPredicate(this: {
     message: () => string;
 }>;
+/**
+ * toHaveToolCalls Matcher
+ *
+ * Validates which tools the LLM called during an llm_host simulation.
+ */
+/**
+ * Creates the toHaveToolCalls matcher function
+ */
+declare function toHaveToolCalls(this: {
+    isNot: boolean;
+}, received: unknown, expectation: ToolCallExpectation): {
+    pass: boolean;
+    message: () => string;
+};
+/**
+ * toHaveToolCallCount Matcher
+ *
+ * Validates the number of tool calls made during an llm_host simulation.
+ */
+/**
+ * Creates the toHaveToolCallCount matcher function
+ */
+declare function toHaveToolCallCount(this: {
+    isNot: boolean;
+}, received: unknown, options: ToolCallCountOptions): {
+    pass: boolean;
+    message: () => string;
+};
 /**
  * Extended Playwright expect with MCP tool matchers
  *
@@ -503,6 +578,8 @@ declare const expect: playwright_test.Expect<{
     toPassToolJudge: typeof toPassToolJudge;
     toHaveToolResponseSize: typeof toHaveToolResponseSize;
     toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
+    toHaveToolCalls: typeof toHaveToolCalls;
+    toHaveToolCallCount: typeof toHaveToolCallCount;
 }>;
 /**