npm - @dotsetlabs/bellwether - Versions diffs - 1.0.2 → 2.0.0 - Mend

@dotsetlabs/bellwether 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)

package/CHANGELOG.md +97 -0
package/README.md +9 -2
package/dist/baseline/accessors.d.ts +1 -1
package/dist/baseline/accessors.js +1 -3
package/dist/baseline/baseline-format.d.ts +287 -0
package/dist/baseline/baseline-format.js +12 -0
package/dist/baseline/comparator.js +249 -11
package/dist/baseline/converter.d.ts +15 -15
package/dist/baseline/converter.js +46 -34
package/dist/baseline/diff.d.ts +1 -1
package/dist/baseline/diff.js +45 -28
package/dist/baseline/error-analyzer.d.ts +1 -1
package/dist/baseline/error-analyzer.js +90 -17
package/dist/baseline/incremental-checker.js +8 -5
package/dist/baseline/index.d.ts +2 -12
package/dist/baseline/index.js +3 -23
package/dist/baseline/performance-tracker.d.ts +0 -1
package/dist/baseline/performance-tracker.js +13 -20
package/dist/baseline/response-fingerprint.js +39 -2
package/dist/baseline/saver.js +41 -10
package/dist/baseline/schema-compare.d.ts +22 -0
package/dist/baseline/schema-compare.js +259 -16
package/dist/baseline/types.d.ts +10 -7
package/dist/cache/response-cache.d.ts +12 -2
package/dist/cache/response-cache.js +178 -30
package/dist/cli/commands/check.js +100 -54
package/dist/cli/commands/explore.js +34 -14
package/dist/cli/index.js +13 -3
package/dist/config/template.js +8 -7
package/dist/config/validator.d.ts +59 -59
package/dist/config/validator.js +245 -90
package/dist/constants/core.d.ts +4 -0
package/dist/constants/core.js +8 -19
package/dist/constants/registry.d.ts +17 -0
package/dist/constants/registry.js +18 -0
package/dist/constants/testing.d.ts +0 -369
package/dist/constants/testing.js +18 -456
package/dist/constants.d.ts +1 -1
package/dist/constants.js +1 -1
package/dist/docs/contract.js +131 -83
package/dist/docs/report.js +8 -5
package/dist/interview/insights.d.ts +17 -0
package/dist/interview/insights.js +52 -0
package/dist/interview/interviewer.js +119 -57
package/dist/interview/orchestrator.js +49 -22
package/dist/interview/prompt-test-generator.d.ts +12 -0
package/dist/interview/prompt-test-generator.js +77 -0
package/dist/interview/resource-test-generator.d.ts +12 -0
package/dist/interview/resource-test-generator.js +20 -0
package/dist/interview/schema-inferrer.js +26 -4
package/dist/interview/schema-test-generator.js +278 -31
package/dist/interview/stateful-test-runner.d.ts +3 -0
package/dist/interview/stateful-test-runner.js +80 -0
package/dist/interview/types.d.ts +12 -0
package/dist/llm/anthropic.js +49 -16
package/dist/llm/client.d.ts +2 -0
package/dist/llm/client.js +61 -0
package/dist/llm/ollama.js +9 -4
package/dist/llm/openai.js +34 -23
package/dist/transport/base-transport.d.ts +1 -1
package/dist/transport/http-transport.d.ts +2 -2
package/dist/transport/http-transport.js +26 -6
package/dist/transport/mcp-client.d.ts +18 -6
package/dist/transport/mcp-client.js +50 -20
package/dist/transport/sse-transport.d.ts +8 -4
package/dist/transport/sse-transport.js +161 -69
package/dist/transport/stdio-transport.d.ts +1 -1
package/dist/transport/stdio-transport.js +1 -1
package/dist/utils/timeout.d.ts +10 -2
package/dist/utils/timeout.js +9 -5
package/dist/version.js +1 -1
package/dist/workflow/executor.js +18 -13
package/dist/workflow/loader.js +4 -1
package/dist/workflow/state-tracker.js +22 -18
package/man/bellwether.1 +204 -0
package/man/bellwether.1.md +148 -0
package/package.json +6 -7
package/schemas/bellwether-check.schema.json +185 -0
package/schemas/bellwether-explore.schema.json +837 -0
package/scripts/completions/bellwether.bash +10 -4
package/scripts/completions/bellwether.zsh +55 -2

package/CHANGELOG.md CHANGED Viewed

@@ -7,6 +7,103 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [2.0.0] - 2026-02-04
+### Breaking Changes
+- **Removed cloud-related baseline modules**: The following modules, along with their exports, have been removed from the public API:
+  - `ai-compatibility-scorer.ts` - AI compatibility scoring
+  - `change-impact-analyzer.ts` - Change impact analysis (`analyzeToolChangeImpact`, `analyzeDiffImpact`, `isBreakingChange`, etc.)
+  - `deprecation-tracker.ts` - Deprecation tracking (`checkDeprecations`, `markAsDeprecated`, `getDeprecatedTools`, etc.)
+  - `health-scorer.ts` - Health scoring (`calculateHealthScore`, `formatHealthScore`, `HEALTH_SCORING`, etc.)
+  - `migration-generator.ts` - Migration guide generation (`generateMigrationGuide`, `formatMigrationGuideMarkdown`, etc.)
+  - `pr-comment-generator.ts` - PR comment generation (`generatePRComment`, `generateCompactPRComment`, etc.)
+  - `risk-scorer.ts` - Risk scoring (`calculateRiskScore`, `generateRiskScoreMarkdown`, etc.)
+  - `scenario-generator.ts` - Auto scenario generation (`generateToolScenarios`, `generateBaselineScenarios`, etc.)
+  - `schema-evolution.ts` - Schema evolution timeline (`buildServerTimeline`, `getSchemaChanges`, etc.)
+  - `test-pruner.ts` - Test pruning (`calculatePruningDecisions`, `prioritizeTools`, etc.)
+  - `cloud-types.ts` - Cloud type definitions
+  - `constants/cloud.ts` - Cloud constants
+- **Renamed baseline function**: `createCloudBaseline()` renamed to `createBaselineFromInterview()`
+- **Removed `PERFORMANCE` constant export** from `performance-tracker.ts`
+### Added
+- **Deterministic prompt testing**: New `prompt-test-generator.ts` for generating deterministic tests for MCP prompts without requiring LLM calls
+- **Deterministic resource testing**: New `resource-test-generator.ts` for generating deterministic tests for MCP resources
+- **Interview insights module**: New `insights.ts` module with `buildInterviewInsights()` for deriving semantic inferences, schema evolution, and error analysis
+- **Baseline format types**: New `baseline-format.ts` with enhanced types:
+  - `PersonaInterview` and `PersonaFinding` for structured interview results
+  - `ResourceCapability` and `PromptCapability` for resource/prompt discovery
+  - Enhanced `ToolCapability` with observed schema tracking and security fingerprints
+  - `ResponseSchemaEvolution` and `DocumentationScoreSummary` types
+- **Registry constants**: New `constants/registry.ts` for MCP Registry integration
+- **Man pages**: Added `man/bellwether.1` and `man/bellwether.1.md` for Unix manual pages
+- **Explore report schema**: New `schemas/bellwether-explore.schema.json` for JSON report validation
+- **JSON schema embedding**: JSON reports now include `$schema` pointer for IDE validation
+- **Expanded behavior aspects**: `BehaviorAspect` type now includes `prompt`, `resource`, `server`, `capability`
+### Changed
+- **Simplified baseline system**: Removed cloud-specific baseline logic in favor of a single, self-contained format
+- **Enhanced schema comparison**: Expanded `schema-compare.ts` with improved property-level diff detection
+- **Improved comparator**: Enhanced `comparator.ts` with better change detection and categorization
+- **SSE transport improvements**: Refactored `sse-transport.ts` for better reliability and error handling
+- **Response cache enhancements**: Improved `response-cache.ts` with better TTL management
+- **Interview system refinements**: Updated `interviewer.ts` and `schema-test-generator.ts` for deterministic test merging
+- **Stateful test runner**: Enhanced `stateful-test-runner.ts` with improved state management
+### Documentation
+- Updated all CLI documentation for consistency
+- Added JSON schema validation pointers to output format docs
+- Updated GitHub Action examples to v2.0.0
+- Improved baseline and CI/CD documentation
+- Enhanced configuration guide with new options
+### Internal
+- Removed ~13,600 lines of cloud-related code
+- Added ~2,600 lines of deterministic testing and baseline improvements
+- Consolidated test files, removing 12 test files for deleted modules
+- Added new tests for prompt/resource generators and enhanced schema comparison
+### Migration Guide
+If you were importing from the `@dotsetlabs/bellwether` library API:
+1. **Baseline functions**: Replace `createCloudBaseline()` with `createBaselineFromInterview()`
+2. **Removed exports**: The exports for the following features are no longer available. If you depended on them, you'll need to implement alternatives:
+   - Health scoring, deprecation tracking, migration generation
+   - PR comment generation, risk scoring, scenario generation
+   - AI compatibility scoring, test pruning, schema evolution timeline
+3. **CLI users**: No changes required. The CLI interface remains fully compatible.
+## [1.0.3] - 2026-02-02
+### Added
+- Added `version` input to GitHub Action for explicit npm version selection
+  - Action now derives version from ref (e.g., `v1.0.3`) or accepts explicit `inputs.version`
+  - Provides clear error message when version cannot be determined
+- Added `signal` option to LLM completion requests for request cancellation via AbortSignal
+- Added AbortController integration to timeout utilities for proper request cancellation
+- Added JSON extraction from mixed LLM responses (handles prose around JSON blocks)
+### Changed
+- Improved timeout handling with AbortController propagation across LLM and transport layers
+- Improved error handling and resource cleanup in interview, orchestrator, and transport modules
+- Refactored response cache, workflow executor, and state tracker for better reliability
+- Updated CI/CD and GitHub/GitLab integration documentation
+### Fixed
+- Fixed GitHub Action stderr handling in check command output capture
+- Fixed various code formatting and linting issues across LLM clients and transport modules
 ## [1.0.2] - 2026-01-30
 ### Added

package/README.md CHANGED Viewed

@@ -105,11 +105,16 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
 | `explore` | LLM behavioral testing |
 | `baseline save` | Save test results as baseline |
 | `baseline compare` | Compare against baseline |
+| `baseline show` | Display baseline contents |
 | `baseline accept` | Accept drift as intentional |
 | `baseline diff` | Compare two baselines |
 | `discover` | Show server capabilities |
 | `watch` | Continuous checking on file changes |
 | `registry` | Search MCP Registry |
+| `golden` | Golden output regression testing |
+| `contract` | Contract validation (generate/validate/show) |
+| `auth` | Manage LLM provider API keys |
+| `validate-config` | Validate bellwether.yaml without running tests |
 ## CI/CD Exit Codes
@@ -120,12 +125,14 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
 | `2` | Warning-level changes | Warn |
 | `3` | Breaking changes | Fail |
 | `4` | Runtime error | Fail |
+| `5` | Low confidence metrics | Warn or fail |
 ## GitHub Action
 ```yaml
-- uses: dotsetlabs/bellwether@v1
+- uses: dotsetlabs/bellwether@v2.0.0
   with:
+    version: '2.0.0'
     server-command: 'npx @mcp/your-server'
     baseline-path: './bellwether-baseline.json'
     fail-on-severity: 'warning'
@@ -167,7 +174,7 @@ bellwether init --preset local npx @mcp/server # Local Ollama (free)
 ```bash
 git clone https://github.com/dotsetlabs/bellwether
-cd bellwether/cli
+cd bellwether
 npm install
 npm run build
 npm test

package/dist/baseline/accessors.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { ToolCapability } from './cloud-types.js';
+import type { ToolCapability } from './baseline-format.js';
 import type { BehavioralBaseline, ToolFingerprint } from './types.js';
 export declare function getBaselineGeneratedAt(baseline: BehavioralBaseline): Date;
 export declare function getBaselineHash(baseline: BehavioralBaseline): string;

package/dist/baseline/accessors.js CHANGED Viewed

@@ -73,9 +73,7 @@ export function getToolFingerprints(baseline) {
         const limitations = profile?.limitations ?? [];
         const description = tool.description || profile?.description || '';
         const schemaHash = tool.schemaHash || profile?.schemaHash || '';
-        const lastTestedAt = tool.lastTestedAt
-            ? new Date(tool.lastTestedAt)
-            : undefined;
+        const lastTestedAt = tool.lastTestedAt ? new Date(tool.lastTestedAt) : undefined;
         return {
             name: tool.name,
             description,

package/dist/baseline/baseline-format.d.ts ADDED Viewed

@@ -0,0 +1,287 @@
+/**
+ * Baseline types for Bellwether.
+ *
+ * These types define the canonical baseline format used for:
+ * - Local baseline storage
+ * - Baseline comparison and drift detection
+ * - Tool capability tracking
+ *
+ * Originally part of a hosted integration, now standalone for open-source use.
+ */
+import type { WorkflowSignature } from './types.js';
+import type { ResponseFingerprint, InferredSchema, ErrorPattern } from './response-fingerprint.js';
+import type { SecurityFingerprint } from '../security/types.js';
+/**
+ * Assertion type for baseline assertions.
+ * Maps to: expects (positive), requires (critical), warns (negative), notes (informational)
+ */
+export type BaselineAssertionType = 'expects' | 'requires' | 'warns' | 'notes';
+/**
+ * Severity level for assertions.
+ */
+export type BaselineAssertionSeverity = 'info' | 'low' | 'medium' | 'high' | 'critical';
+/**
+ * Behavioral assertion in baseline format.
+ */
+export interface BaselineAssertion {
+    /** Type of assertion */
+    type: BaselineAssertionType;
+    /** The condition/assertion statement */
+    condition: string;
+    /** Tool this assertion relates to (optional) */
+    tool?: string;
+    /** Severity level (optional) */
+    severity?: BaselineAssertionSeverity;
+}
+/**
+ * Baseline mode indicating how the baseline was generated.
+ * - 'check': Deterministic structural testing (no LLM required)
+ * - 'explore': LLM-powered behavioral exploration
+ */
+export type BaselineMode = 'check' | 'explore';
+/**
+ * Metadata about how the baseline was generated.
+ */
+export interface BaselineMetadata {
+    /** Baseline mode: 'check' = deterministic, 'explore' = LLM-powered */
+    mode: BaselineMode;
+    /** ISO timestamp when generated */
+    generatedAt: string;
+    /** CLI version that generated this baseline */
+    cliVersion: string;
+    /** Command used to start the server */
+    serverCommand: string;
+    /** Server name from MCP initialization */
+    serverName?: string;
+    /** Interview duration in milliseconds */
+    durationMs: number;
+    /** Personas used during interview (empty for check mode) */
+    personas: string[];
+    /** LLM model used ('none' for check mode) */
+    model: string;
+}
+/**
+ * Server fingerprint in baseline format.
+ */
+export interface BaselineServerFingerprint {
+    /** Server name */
+    name: string;
+    /** Server version */
+    version: string;
+    /** MCP protocol version */
+    protocolVersion: string;
+    /** Available capabilities */
+    capabilities: string[];
+}
+/**
+ * Tool capability from discovery.
+ */
+export interface ToolCapability {
+    /** Tool name */
+    name: string;
+    /** Tool description */
+    description: string;
+    /** Input schema */
+    inputSchema: Record<string, unknown>;
+    /** Hash of the schema for change detection */
+    schemaHash: string;
+    /** Hash of observed arguments schema (from actual calls) */
+    observedArgsSchemaHash?: string;
+    /** Consistency of observed argument schemas (0-1) */
+    observedArgsSchemaConsistency?: number;
+    /** Number of observed schema variations */
+    observedArgsSchemaVariations?: number;
+    /** Fingerprint of the tool's response structure */
+    responseFingerprint?: ResponseFingerprint;
+    /** Inferred JSON schema of the tool's output */
+    inferredOutputSchema?: InferredSchema;
+    /** Normalized error patterns observed during testing */
+    errorPatterns?: ErrorPattern[];
+    /** Baseline p50 latency in milliseconds */
+    baselineP50Ms?: number;
+    /** Baseline p95 latency in milliseconds */
+    baselineP95Ms?: number;
+    /** Baseline p99 latency in milliseconds */
+    baselineP99Ms?: number;
+    /** Baseline success rate (0-1) */
+    baselineSuccessRate?: number;
+    /** Response schema evolution metadata */
+    responseSchemaEvolution?: ResponseSchemaEvolution;
+    /** ISO timestamp of last time this tool was tested */
+    lastTestedAt?: string;
+    /** Schema hash captured at the last test time */
+    inputSchemaHashAtTest?: string;
+    /** Statistical confidence for performance baselines */
+    performanceConfidence?: {
+        sampleCount: number;
+        successfulSamples: number;
+        validationSamples: number;
+        totalTests: number;
+        standardDeviation: number;
+        coefficientOfVariation: number;
+        confidenceLevel: 'low' | 'medium' | 'high';
+        recommendation?: string;
+    };
+    /** Security testing fingerprint with findings and risk score */
+    securityFingerprint?: SecurityFingerprint;
+}
+/**
+ * Resource capability from discovery.
+ */
+export interface ResourceCapability {
+    /** Resource URI template */
+    uri: string;
+    /** Resource name */
+    name: string;
+    /** Resource description */
+    description?: string;
+    /** MIME type */
+    mimeType?: string;
+}
+/**
+ * Prompt capability from discovery.
+ */
+export interface PromptCapability {
+    /** Prompt name */
+    name: string;
+    /** Prompt description */
+    description?: string;
+    /** Arguments the prompt accepts */
+    arguments?: Array<{
+        name: string;
+        description?: string;
+        required?: boolean;
+    }>;
+}
+/**
+ * Interview results for a single persona.
+ */
+export interface PersonaInterview {
+    /** Persona ID */
+    persona: string;
+    /** Number of tools interviewed */
+    toolsInterviewed: number;
+    /** Number of questions asked */
+    questionsAsked: number;
+    /** Findings from this persona */
+    findings: PersonaFinding[];
+}
+/**
+ * A finding from a persona interview.
+ */
+export interface PersonaFinding {
+    /** Tool this finding relates to */
+    tool: string;
+    /** Finding category */
+    category: 'behavior' | 'security' | 'reliability' | 'edge_case';
+    /** Severity level */
+    severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
+    /** Description of the finding */
+    description: string;
+    /** Evidence supporting the finding */
+    evidence?: string;
+}
+/**
+ * Tool behavioral profile in baseline format.
+ */
+export interface BaselineToolProfile {
+    /** Tool name */
+    name: string;
+    /** Tool description */
+    description: string;
+    /** Hash of input schema */
+    schemaHash: string;
+    /** Behavioral assertions */
+    assertions: BaselineAssertion[];
+    /** Security notes */
+    securityNotes: string[];
+    /** Known limitations */
+    limitations: string[];
+    /** Behavioral notes */
+    behavioralNotes: string[];
+}
+/**
+ * Snapshot of accepted drift for a baseline.
+ */
+export interface AcceptedDiff {
+    toolsAdded: string[];
+    toolsRemoved: string[];
+    toolsModified: string[];
+    severity: 'none' | 'info' | 'warning' | 'breaking';
+    breakingCount: number;
+    warningCount: number;
+    infoCount: number;
+}
+/**
+ * Drift acceptance metadata attached to a baseline.
+ */
+export interface DriftAcceptance {
+    acceptedAt: string | Date;
+    acceptedBy?: string;
+    reason?: string;
+    acceptedDiff: AcceptedDiff;
+}
+/**
+ * Serializable schema evolution data for baselines.
+ */
+export interface ResponseSchemaEvolution {
+    currentHash: string;
+    history: Array<{
+        hash: string;
+        schema: InferredSchema;
+        observedAt: string | Date;
+        sampleCount: number;
+    }>;
+    isStable: boolean;
+    stabilityConfidence: number;
+    inconsistentFields: string[];
+    sampleCount: number;
+}
+/**
+ * Serializable documentation score summary for baseline storage.
+ */
+export interface DocumentationScoreSummary {
+    overallScore: number;
+    grade: string;
+    issueCount: number;
+    toolCount: number;
+}
+/**
+ * Canonical baseline format.
+ *
+ * This is the single baseline schema used by Bellwether CLI.
+ *
+ * Versioning: Uses CLI package version for compatibility checking.
+ * Baselines with the same CLI major version are compatible.
+ */
+export interface BellwetherBaseline {
+    /** CLI version that generated this baseline (e.g., '1.0.0') */
+    version: string;
+    /** Generation metadata */
+    metadata: BaselineMetadata;
+    /** Server fingerprint */
+    server: BaselineServerFingerprint;
+    /** Discovered capabilities */
+    capabilities: {
+        tools: ToolCapability[];
+        resources?: ResourceCapability[];
+        prompts?: PromptCapability[];
+    };
+    /** Interview results by persona */
+    interviews: PersonaInterview[];
+    /** Tool behavioral profiles */
+    toolProfiles: BaselineToolProfile[];
+    /** Workflow results (if workflows were tested) */
+    workflows?: WorkflowSignature[];
+    /** Overall behavioral assertions */
+    assertions: BaselineAssertion[];
+    /** Summary of findings */
+    summary: string;
+    /** SHA-256 hash of content (first 16 chars) for integrity */
+    hash: string;
+    /** Drift acceptance metadata (optional) */
+    acceptance?: DriftAcceptance;
+    /** Optional documentation score summary */
+    documentationScore?: DocumentationScoreSummary;
+}
+//# sourceMappingURL=baseline-format.d.ts.map

package/dist/baseline/baseline-format.js ADDED Viewed

@@ -0,0 +1,12 @@
+/**
+ * Baseline types for Bellwether.
+ *
+ * These types define the canonical baseline format used for:
+ * - Local baseline storage
+ * - Baseline comparison and drift detection
+ * - Tool capability tracking
+ *
+ * Originally part of a hosted integration, now standalone for open-source use.
+ */
+export {};
+//# sourceMappingURL=baseline-format.js.map