@dotsetlabs/bellwether 1.0.3 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/CHANGELOG.md +118 -0
  2. package/README.md +17 -2
  3. package/dist/auth/credentials.js +2 -0
  4. package/dist/baseline/accessors.d.ts +1 -1
  5. package/dist/baseline/accessors.js +13 -3
  6. package/dist/baseline/baseline-format.d.ts +335 -0
  7. package/dist/baseline/baseline-format.js +12 -0
  8. package/dist/baseline/comparator.js +494 -13
  9. package/dist/baseline/converter.d.ts +15 -15
  10. package/dist/baseline/converter.js +97 -37
  11. package/dist/baseline/diff.d.ts +1 -1
  12. package/dist/baseline/diff.js +45 -28
  13. package/dist/baseline/error-analyzer.d.ts +1 -1
  14. package/dist/baseline/error-analyzer.js +90 -17
  15. package/dist/baseline/incremental-checker.js +8 -5
  16. package/dist/baseline/index.d.ts +2 -12
  17. package/dist/baseline/index.js +3 -23
  18. package/dist/baseline/performance-tracker.d.ts +0 -1
  19. package/dist/baseline/performance-tracker.js +13 -20
  20. package/dist/baseline/response-fingerprint.js +40 -3
  21. package/dist/baseline/saver.js +75 -10
  22. package/dist/baseline/schema-compare.d.ts +22 -0
  23. package/dist/baseline/schema-compare.js +259 -16
  24. package/dist/baseline/types.d.ts +30 -7
  25. package/dist/cache/response-cache.d.ts +8 -0
  26. package/dist/cache/response-cache.js +119 -2
  27. package/dist/cli/commands/baseline.js +70 -35
  28. package/dist/cli/commands/check.js +71 -15
  29. package/dist/cli/commands/explore.js +69 -16
  30. package/dist/cli/commands/init.js +10 -7
  31. package/dist/cli/commands/watch.js +5 -5
  32. package/dist/cli/index.js +8 -0
  33. package/dist/config/loader.js +2 -2
  34. package/dist/config/template.js +8 -7
  35. package/dist/config/validator.d.ts +59 -59
  36. package/dist/config/validator.js +245 -90
  37. package/dist/constants/core.d.ts +5 -1
  38. package/dist/constants/core.js +9 -20
  39. package/dist/constants/registry.d.ts +17 -0
  40. package/dist/constants/registry.js +18 -0
  41. package/dist/constants/testing.d.ts +0 -369
  42. package/dist/constants/testing.js +18 -456
  43. package/dist/constants.d.ts +1 -1
  44. package/dist/constants.js +1 -1
  45. package/dist/discovery/discovery.js +88 -14
  46. package/dist/discovery/types.d.ts +5 -1
  47. package/dist/docs/agents.js +138 -50
  48. package/dist/docs/contract.js +194 -84
  49. package/dist/docs/report.js +8 -5
  50. package/dist/errors/retry.js +11 -5
  51. package/dist/interview/insights.d.ts +17 -0
  52. package/dist/interview/insights.js +52 -0
  53. package/dist/interview/interviewer.js +52 -10
  54. package/dist/interview/prompt-test-generator.d.ts +12 -0
  55. package/dist/interview/prompt-test-generator.js +77 -0
  56. package/dist/interview/rate-limiter.js +7 -3
  57. package/dist/interview/resource-test-generator.d.ts +12 -0
  58. package/dist/interview/resource-test-generator.js +20 -0
  59. package/dist/interview/schema-inferrer.js +26 -4
  60. package/dist/interview/schema-test-generator.js +278 -31
  61. package/dist/interview/stateful-test-runner.d.ts +3 -0
  62. package/dist/interview/stateful-test-runner.js +80 -0
  63. package/dist/interview/types.d.ts +12 -0
  64. package/dist/llm/anthropic.js +14 -4
  65. package/dist/llm/fallback.d.ts +1 -0
  66. package/dist/llm/fallback.js +7 -1
  67. package/dist/llm/openai.js +15 -4
  68. package/dist/protocol/index.d.ts +2 -0
  69. package/dist/protocol/index.js +2 -0
  70. package/dist/protocol/version-registry.d.ts +66 -0
  71. package/dist/protocol/version-registry.js +159 -0
  72. package/dist/transport/http-transport.d.ts +11 -1
  73. package/dist/transport/http-transport.js +21 -2
  74. package/dist/transport/mcp-client.d.ts +29 -1
  75. package/dist/transport/mcp-client.js +93 -8
  76. package/dist/transport/sse-transport.d.ts +7 -3
  77. package/dist/transport/sse-transport.js +162 -71
  78. package/dist/transport/types.d.ts +134 -1
  79. package/dist/utils/concurrency.d.ts +2 -0
  80. package/dist/utils/concurrency.js +9 -2
  81. package/dist/utils/markdown.js +13 -18
  82. package/dist/utils/timeout.js +2 -1
  83. package/dist/version.js +1 -1
  84. package/man/bellwether.1 +1 -1
  85. package/man/bellwether.1.md +2 -2
  86. package/package.json +1 -1
  87. package/schemas/bellwether-check.schema.json +185 -0
  88. package/schemas/bellwether-explore.schema.json +837 -0
  89. package/scripts/completions/bellwether.bash +10 -4
  90. package/scripts/completions/bellwether.zsh +55 -2
package/CHANGELOG.md CHANGED
@@ -7,6 +7,124 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [2.0.1] - 2026-02-07
11
+
12
+ ### Added
13
+
14
+ - **MCP protocol version gating**: New `src/protocol/` module with version-to-feature-flag mapping
15
+ - Supports MCP protocol versions: `2024-11-05`, `2025-03-26`, `2025-06-18`, `2025-11-25`
16
+ - `MCPFeatureFlags` interface with 9 feature flags (`toolAnnotations`, `entityTitles`, `completions`, `resourceAnnotations`, `structuredOutput`, `serverInstructions`, `httpVersionHeader`, `tasks`, `icons`)
17
+ - `getSharedFeatureFlags(v1, v2)` computes AND-intersection for cross-version baseline comparison
18
+ - All version-specific fields in baselines are now gated by protocol version during conversion and comparison
19
+ - **Version-gated drift detection**: Comparator now detects changes in version-specific fields
20
+ - Tool annotations (readOnlyHint, destructiveHint, idempotentHint, openWorldHint)
21
+ - Entity titles (tool, prompt, resource, and resource template titles)
22
+ - Output schema and structured output changes
23
+ - Execution/task support changes
24
+ - Server instructions changes
25
+ - **MCPClient protocol version tracking**: Client stores negotiated protocol version after `initialize()`, exposes via getters
26
+ - **Mock server protocol version support**: Mock MCP server now supports `MOCK_PROTOCOL_VERSION` env var for testing
27
+
28
+ ### Fixed
29
+
30
+ - **20 production-blocking bugs across all layers** (`4717ca1`):
31
+ - Transport: HTTP transport URL construction, SSE error event handling, MCP client error propagation
32
+ - Discovery: ResourceTemplate type handling, discovery error handling
33
+ - Baseline: Converter version-gated field handling, saver hash calculation, comparator severity logic
34
+ - CLI: Check command exit code handling, explore command cleanup, baseline command error paths
35
+ - Config: Environment variable expansion edge cases
36
+ - Docs: Contract and agents generator error handling
37
+ - **Protocol version gating gaps causing false negatives and data loss** (`dce73ed`):
38
+ - Fixed tool title comparison using wrong feature flag (`toolAnnotations` instead of `entityTitles`)
39
+ - Fixed tool title comparison condition (AND → OR) to detect added/removed titles
40
+ - Added missing `execution` and `baselineP99Ms` fields to `ToolFingerprint` type
41
+ - Added missing fields (`title`, `outputSchema`, `outputSchemaHash`, `annotations`, `execution`, `baselineP99Ms`) to `toToolCapability()` accessor — prevents data loss during incremental check merges
42
+ - Added `execution` and `baselineP99Ms` mapping to `getToolFingerprints()` accessor
43
+ - Added prompt title comparison gated by `entityTitles` flag
44
+ - Added resource title comparison gated by `entityTitles` flag
45
+ - Added resource template title comparison gated by `entityTitles` flag
46
+ - Added execution/task support comparison gated by `tasks` flag
47
+ - Added server instructions comparison gated by `serverInstructions` flag
48
+ - Gated resource template `title` in converter by `entityTitles` flag
49
+ - **Clean JSON output from baseline commands** (`7aab450`):
50
+ - `baseline compare --format json` no longer appends summary text after JSON object
51
+ - `baseline diff --format json` no longer prepends header or appends summary text around JSON object
52
+ - JSON output is now machine-parseable without text contamination
53
+
54
+ ## [2.0.0] - 2026-02-04
55
+
56
+ ### Breaking Changes
57
+
58
+ - **Removed cloud-related baseline modules**: The following exports have been removed from the public API:
59
+ - `ai-compatibility-scorer.ts` - AI compatibility scoring
60
+ - `change-impact-analyzer.ts` - Change impact analysis (`analyzeToolChangeImpact`, `analyzeDiffImpact`, `isBreakingChange`, etc.)
61
+ - `deprecation-tracker.ts` - Deprecation tracking (`checkDeprecations`, `markAsDeprecated`, `getDeprecatedTools`, etc.)
62
+ - `health-scorer.ts` - Health scoring (`calculateHealthScore`, `formatHealthScore`, `HEALTH_SCORING`, etc.)
63
+ - `migration-generator.ts` - Migration guide generation (`generateMigrationGuide`, `formatMigrationGuideMarkdown`, etc.)
64
+ - `pr-comment-generator.ts` - PR comment generation (`generatePRComment`, `generateCompactPRComment`, etc.)
65
+ - `risk-scorer.ts` - Risk scoring (`calculateRiskScore`, `generateRiskScoreMarkdown`, etc.)
66
+ - `scenario-generator.ts` - Auto scenario generation (`generateToolScenarios`, `generateBaselineScenarios`, etc.)
67
+ - `schema-evolution.ts` - Schema evolution timeline (`buildServerTimeline`, `getSchemaChanges`, etc.)
68
+ - `test-pruner.ts` - Test pruning (`calculatePruningDecisions`, `prioritizeTools`, etc.)
69
+ - `cloud-types.ts` - Cloud type definitions
70
+ - `constants/cloud.ts` - Cloud constants
71
+ - **Renamed baseline function**: `createCloudBaseline()` renamed to `createBaselineFromInterview()`
72
+ - **Removed `PERFORMANCE` constant export** from `performance-tracker.ts`
73
+
74
+ ### Added
75
+
76
+ - **Deterministic prompt testing**: New `prompt-test-generator.ts` for generating deterministic tests for MCP prompts without requiring LLM calls
77
+ - **Deterministic resource testing**: New `resource-test-generator.ts` for generating deterministic tests for MCP resources
78
+ - **Interview insights module**: New `insights.ts` module with `buildInterviewInsights()` for deriving semantic inferences, schema evolution, and error analysis
79
+ - **Baseline format types**: New `baseline-format.ts` with enhanced types:
80
+ - `PersonaInterview` and `PersonaFinding` for structured interview results
81
+ - `ResourceCapability` and `PromptCapability` for resource/prompt discovery
82
+ - Enhanced `ToolCapability` with observed schema tracking and security fingerprints
83
+ - `ResponseSchemaEvolution` and `DocumentationScoreSummary` types
84
+ - **Registry constants**: New `constants/registry.ts` for MCP Registry integration
85
+ - **Man pages**: Added `man/bellwether.1` and `man/bellwether.1.md` for Unix manual pages
86
+ - **Explore report schema**: New `schemas/bellwether-explore.schema.json` for JSON report validation
87
+ - **JSON schema embedding**: JSON reports now include `$schema` pointer for IDE validation
88
+ - **Expanded behavior aspects**: `BehaviorAspect` type now includes `prompt`, `resource`, `server`, `capability`
89
+
90
+ ### Changed
91
+
92
+ - **Simplified baseline system**: Removed cloud-specific baseline logic in favor of a single, self-contained format
93
+ - **Enhanced schema comparison**: Expanded `schema-compare.ts` with improved property-level diff detection
94
+ - **Improved comparator**: Enhanced `comparator.ts` with better change detection and categorization
95
+ - **SSE transport improvements**: Refactored `sse-transport.ts` for better reliability and error handling
96
+ - **Response cache enhancements**: Improved `response-cache.ts` with better TTL management
97
+ - **Interview system refinements**: Updated `interviewer.ts` and `schema-test-generator.ts` for deterministic test merging
98
+ - **Stateful test runner**: Enhanced `stateful-test-runner.ts` with improved state management
99
+
100
+ ### Documentation
101
+
102
+ - Updated all CLI documentation for consistency
103
+ - Added JSON schema validation pointers to output format docs
104
+ - Updated GitHub Action examples to v2.0.0
105
+ - Improved baseline and CI/CD documentation
106
+ - Enhanced configuration guide with new options
107
+
108
+ ### Internal
109
+
110
+ - Removed ~13,600 lines of cloud-related code
111
+ - Added ~2,600 lines of deterministic testing and baseline improvements
112
+ - Consolidated test files, removing 12 test files for deleted modules
113
+ - Added new tests for prompt/resource generators and enhanced schema comparison
114
+
115
+ ### Migration Guide
116
+
117
+ If you were importing from the `@dotsetlabs/bellwether` library API:
118
+
119
+ 1. **Baseline functions**: Replace `createCloudBaseline()` with `createBaselineFromInterview()`
120
+
121
+ 2. **Removed exports**: The following modules are no longer available. If you depended on them, you'll need to implement alternatives:
122
+ - Health scoring, deprecation tracking, migration generation
123
+ - PR comment generation, risk scoring, scenario generation
124
+ - AI compatibility scoring, test pruning, schema evolution timeline
125
+
126
+ 3. **CLI users**: No changes required. The CLI interface remains fully compatible.
127
+
10
128
  ## [1.0.3] - 2026-02-02
11
129
 
12
130
  ### Added
package/README.md CHANGED
@@ -75,6 +75,15 @@ jobs:
75
75
  | Parameter renamed | `path` to `file_path` | Breaking |
76
76
  | Description changed | Tool help text updated | Warning |
77
77
  | Performance regression | Latency increased >10% | Warning |
78
+ | Tool annotations changed | `readOnlyHint` flipped to `false` | Warning |
79
+ | Output schema changed | Return type structure modified | Warning |
80
+ | Entity title changed | Tool/prompt/resource title updated | Info |
81
+ | Task support changed | Execution mode switched to `async` | Warning |
82
+ | Server instructions changed | Server-level instructions updated | Info |
83
+ | Prompt added/removed | Prompt template appears or disappears | Breaking |
84
+ | Resource changed | Resource URI or MIME type modified | Warning |
85
+
86
+ Comparisons are **protocol-version-aware** — version-specific fields (annotations, titles, output schemas, etc.) are only compared when both baselines support the relevant MCP protocol version.
78
87
 
79
88
  ## Commands
80
89
 
@@ -105,11 +114,16 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
105
114
  | `explore` | LLM behavioral testing |
106
115
  | `baseline save` | Save test results as baseline |
107
116
  | `baseline compare` | Compare against baseline |
117
+ | `baseline show` | Display baseline contents |
108
118
  | `baseline accept` | Accept drift as intentional |
109
119
  | `baseline diff` | Compare two baselines |
110
120
  | `discover` | Show server capabilities |
111
121
  | `watch` | Continuous checking on file changes |
112
122
  | `registry` | Search MCP Registry |
123
+ | `golden` | Golden output regression testing |
124
+ | `contract` | Contract validation (generate/validate/show) |
125
+ | `auth` | Manage LLM provider API keys |
126
+ | `validate-config` | Validate bellwether.yaml without running tests |
113
127
 
114
128
  ## CI/CD Exit Codes
115
129
 
@@ -120,13 +134,14 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
120
134
  | `2` | Warning-level changes | Warn |
121
135
  | `3` | Breaking changes | Fail |
122
136
  | `4` | Runtime error | Fail |
137
+ | `5` | Low confidence metrics | Warn or fail |
123
138
 
124
139
  ## GitHub Action
125
140
 
126
141
  ```yaml
127
- - uses: dotsetlabs/bellwether@v1.0.2
142
+ - uses: dotsetlabs/bellwether@v2.0.0
128
143
  with:
129
- version: '1.0.2'
144
+ version: '2.0.0'
130
145
  server-command: 'npx @mcp/your-server'
131
146
  baseline-path: './bellwether-baseline.json'
132
147
  fail-on-severity: 'warning'
@@ -58,6 +58,8 @@ function readEnvFile(filePath, envVar, options) {
58
58
  if (decrypted) {
59
59
  return decrypted;
60
60
  }
61
+ // Warn about decryption failure so users know their credential exists but can't be decrypted
62
+ console.warn(`[bellwether] Encrypted credential found for ${envVar} but decryption failed. Check your encryption key.`);
61
63
  return undefined;
62
64
  }
63
65
  if (value) {
@@ -1,4 +1,4 @@
1
- import type { ToolCapability } from './cloud-types.js';
1
+ import type { ToolCapability } from './baseline-format.js';
2
2
  import type { BehavioralBaseline, ToolFingerprint } from './types.js';
3
3
  export declare function getBaselineGeneratedAt(baseline: BehavioralBaseline): Date;
4
4
  export declare function getBaselineHash(baseline: BehavioralBaseline): string;
@@ -57,9 +57,15 @@ export function toToolCapability(tool) {
57
57
  errorPatterns: tool.errorPatterns,
58
58
  baselineP50Ms: tool.baselineP50Ms,
59
59
  baselineP95Ms: tool.baselineP95Ms,
60
+ baselineP99Ms: tool.baselineP99Ms,
60
61
  baselineSuccessRate: tool.baselineSuccessRate,
61
62
  performanceConfidence: tool.performanceConfidence,
62
63
  securityFingerprint: tool.securityFingerprint,
64
+ title: tool.title,
65
+ outputSchema: tool.outputSchema,
66
+ outputSchemaHash: tool.outputSchemaHash,
67
+ annotations: tool.annotations,
68
+ execution: tool.execution,
63
69
  };
64
70
  }
65
71
  export function getToolFingerprints(baseline) {
@@ -73,9 +79,7 @@ export function getToolFingerprints(baseline) {
73
79
  const limitations = profile?.limitations ?? [];
74
80
  const description = tool.description || profile?.description || '';
75
81
  const schemaHash = tool.schemaHash || profile?.schemaHash || '';
76
- const lastTestedAt = tool.lastTestedAt
77
- ? new Date(tool.lastTestedAt)
78
- : undefined;
82
+ const lastTestedAt = tool.lastTestedAt ? new Date(tool.lastTestedAt) : undefined;
79
83
  return {
80
84
  name: tool.name,
81
85
  description,
@@ -95,6 +99,12 @@ export function getToolFingerprints(baseline) {
95
99
  baselineSuccessRate: tool.baselineSuccessRate,
96
100
  performanceConfidence: tool.performanceConfidence,
97
101
  securityFingerprint: tool.securityFingerprint,
102
+ title: tool.title,
103
+ outputSchema: tool.outputSchema,
104
+ outputSchemaHash: tool.outputSchemaHash,
105
+ annotations: tool.annotations,
106
+ execution: tool.execution,
107
+ baselineP99Ms: tool.baselineP99Ms,
98
108
  };
99
109
  });
100
110
  if (fingerprints.length > 0) {
@@ -0,0 +1,335 @@
1
+ /**
2
+ * Baseline types for Bellwether.
3
+ *
4
+ * These types define the canonical baseline format used for:
5
+ * - Local baseline storage
6
+ * - Baseline comparison and drift detection
7
+ * - Tool capability tracking
8
+ *
9
+ * Originally part of a hosted integration, now standalone for open-source use.
10
+ */
11
+ import type { WorkflowSignature } from './types.js';
12
+ import type { ResponseFingerprint, InferredSchema, ErrorPattern } from './response-fingerprint.js';
13
+ import type { SecurityFingerprint } from '../security/types.js';
14
+ /**
15
+ * Assertion type for baseline assertions.
16
+ * Maps to: expects (positive), requires (critical), warns (negative), notes (informational)
17
+ */
18
+ export type BaselineAssertionType = 'expects' | 'requires' | 'warns' | 'notes';
19
+ /**
20
+ * Severity level for assertions.
21
+ */
22
+ export type BaselineAssertionSeverity = 'info' | 'low' | 'medium' | 'high' | 'critical';
23
+ /**
24
+ * Behavioral assertion in baseline format.
25
+ */
26
+ export interface BaselineAssertion {
27
+ /** Type of assertion */
28
+ type: BaselineAssertionType;
29
+ /** The condition/assertion statement */
30
+ condition: string;
31
+ /** Tool this assertion relates to (optional) */
32
+ tool?: string;
33
+ /** Severity level (optional) */
34
+ severity?: BaselineAssertionSeverity;
35
+ }
36
+ /**
37
+ * Baseline mode indicating how the baseline was generated.
38
+ * - 'check': Deterministic structural testing (no LLM required)
39
+ * - 'explore': LLM-powered behavioral exploration
40
+ */
41
+ export type BaselineMode = 'check' | 'explore';
42
+ /**
43
+ * Metadata about how the baseline was generated.
44
+ */
45
+ export interface BaselineMetadata {
46
+ /** Baseline mode: 'check' = deterministic, 'explore' = LLM-powered */
47
+ mode: BaselineMode;
48
+ /** ISO timestamp when generated */
49
+ generatedAt: string;
50
+ /** CLI version that generated this baseline */
51
+ cliVersion: string;
52
+ /** Command used to start the server */
53
+ serverCommand: string;
54
+ /** Server name from MCP initialization */
55
+ serverName?: string;
56
+ /** Interview duration in milliseconds */
57
+ durationMs: number;
58
+ /** Personas used during interview (empty for check mode) */
59
+ personas: string[];
60
+ /** LLM model used ('none' for check mode) */
61
+ model: string;
62
+ }
63
+ /**
64
+ * Server fingerprint in baseline format.
65
+ */
66
+ export interface BaselineServerFingerprint {
67
+ /** Server name */
68
+ name: string;
69
+ /** Server version */
70
+ version: string;
71
+ /** MCP protocol version */
72
+ protocolVersion: string;
73
+ /** Available capabilities */
74
+ capabilities: string[];
75
+ /** Server-provided instructions (MCP 2025-11-25) */
76
+ instructions?: string;
77
+ }
78
+ /**
79
+ * Tool capability from discovery.
80
+ */
81
+ export interface ToolCapability {
82
+ /** Tool name */
83
+ name: string;
84
+ /** Tool description */
85
+ description: string;
86
+ /** Input schema */
87
+ inputSchema: Record<string, unknown>;
88
+ /** Hash of the schema for change detection */
89
+ schemaHash: string;
90
+ /** Human-readable title for the tool (MCP 2025-11-25) */
91
+ title?: string;
92
+ /** JSON Schema for the tool's output (MCP 2025-11-25 structured content) */
93
+ outputSchema?: Record<string, unknown>;
94
+ /** Hash of the output schema for drift detection */
95
+ outputSchemaHash?: string;
96
+ /** Behavioral annotations/hints (MCP 2025-11-25) */
97
+ annotations?: {
98
+ title?: string;
99
+ readOnlyHint?: boolean;
100
+ destructiveHint?: boolean;
101
+ idempotentHint?: boolean;
102
+ openWorldHint?: boolean;
103
+ };
104
+ /** Task execution configuration (MCP 2025-11-25) */
105
+ execution?: {
106
+ taskSupport?: string;
107
+ };
108
+ /** Hash of observed arguments schema (from actual calls) */
109
+ observedArgsSchemaHash?: string;
110
+ /** Consistency of observed argument schemas (0-1) */
111
+ observedArgsSchemaConsistency?: number;
112
+ /** Number of observed schema variations */
113
+ observedArgsSchemaVariations?: number;
114
+ /** Fingerprint of the tool's response structure */
115
+ responseFingerprint?: ResponseFingerprint;
116
+ /** Inferred JSON schema of the tool's output */
117
+ inferredOutputSchema?: InferredSchema;
118
+ /** Normalized error patterns observed during testing */
119
+ errorPatterns?: ErrorPattern[];
120
+ /** Baseline p50 latency in milliseconds */
121
+ baselineP50Ms?: number;
122
+ /** Baseline p95 latency in milliseconds */
123
+ baselineP95Ms?: number;
124
+ /** Baseline p99 latency in milliseconds */
125
+ baselineP99Ms?: number;
126
+ /** Baseline success rate (0-1) */
127
+ baselineSuccessRate?: number;
128
+ /** Response schema evolution metadata */
129
+ responseSchemaEvolution?: ResponseSchemaEvolution;
130
+ /** ISO timestamp of last time this tool was tested */
131
+ lastTestedAt?: string;
132
+ /** Schema hash captured at the last test time */
133
+ inputSchemaHashAtTest?: string;
134
+ /** Statistical confidence for performance baselines */
135
+ performanceConfidence?: {
136
+ sampleCount: number;
137
+ successfulSamples: number;
138
+ validationSamples: number;
139
+ totalTests: number;
140
+ standardDeviation: number;
141
+ coefficientOfVariation: number;
142
+ confidenceLevel: 'low' | 'medium' | 'high';
143
+ recommendation?: string;
144
+ };
145
+ /** Security testing fingerprint with findings and risk score */
146
+ securityFingerprint?: SecurityFingerprint;
147
+ }
148
+ /**
149
+ * Resource capability from discovery.
150
+ */
151
+ export interface ResourceCapability {
152
+ /** Resource URI (a concrete URI; templates are described by ResourceTemplateCapability) */
153
+ uri: string;
154
+ /** Resource name */
155
+ name: string;
156
+ /** Resource description */
157
+ description?: string;
158
+ /** MIME type */
159
+ mimeType?: string;
160
+ /** Human-readable title (MCP 2025-11-25) */
161
+ title?: string;
162
+ /** Resource annotations (MCP 2025-11-25) */
163
+ annotations?: {
164
+ audience?: string[];
165
+ priority?: number;
166
+ lastModified?: string;
167
+ };
168
+ /** Resource size in bytes (MCP 2025-11-25) */
169
+ size?: number;
170
+ }
171
+ /**
172
+ * Resource template capability from discovery (MCP 2025-11-25).
173
+ */
174
+ export interface ResourceTemplateCapability {
175
+ /** URI template (RFC 6570) */
176
+ uriTemplate: string;
177
+ /** Template name */
178
+ name: string;
179
+ /** Human-readable title */
180
+ title?: string;
181
+ /** Template description */
182
+ description?: string;
183
+ /** Expected MIME type */
184
+ mimeType?: string;
185
+ }
186
+ /**
187
+ * Prompt capability from discovery.
188
+ */
189
+ export interface PromptCapability {
190
+ /** Prompt name */
191
+ name: string;
192
+ /** Prompt description */
193
+ description?: string;
194
+ /** Human-readable title (MCP 2025-11-25) */
195
+ title?: string;
196
+ /** Arguments the prompt accepts */
197
+ arguments?: Array<{
198
+ name: string;
199
+ description?: string;
200
+ required?: boolean;
201
+ }>;
202
+ }
203
+ /**
204
+ * Interview results for a single persona.
205
+ */
206
+ export interface PersonaInterview {
207
+ /** Persona ID */
208
+ persona: string;
209
+ /** Number of tools interviewed */
210
+ toolsInterviewed: number;
211
+ /** Number of questions asked */
212
+ questionsAsked: number;
213
+ /** Findings from this persona */
214
+ findings: PersonaFinding[];
215
+ }
216
+ /**
217
+ * A finding from a persona interview.
218
+ */
219
+ export interface PersonaFinding {
220
+ /** Tool this finding relates to */
221
+ tool: string;
222
+ /** Finding category */
223
+ category: 'behavior' | 'security' | 'reliability' | 'edge_case';
224
+ /** Severity level */
225
+ severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
226
+ /** Description of the finding */
227
+ description: string;
228
+ /** Evidence supporting the finding */
229
+ evidence?: string;
230
+ }
231
+ /**
232
+ * Tool behavioral profile in baseline format.
233
+ */
234
+ export interface BaselineToolProfile {
235
+ /** Tool name */
236
+ name: string;
237
+ /** Tool description */
238
+ description: string;
239
+ /** Hash of input schema */
240
+ schemaHash: string;
241
+ /** Behavioral assertions */
242
+ assertions: BaselineAssertion[];
243
+ /** Security notes */
244
+ securityNotes: string[];
245
+ /** Known limitations */
246
+ limitations: string[];
247
+ /** Behavioral notes */
248
+ behavioralNotes: string[];
249
+ }
250
+ /**
251
+ * Snapshot of accepted drift for a baseline.
252
+ */
253
+ export interface AcceptedDiff {
254
+ toolsAdded: string[];
255
+ toolsRemoved: string[];
256
+ toolsModified: string[];
257
+ severity: 'none' | 'info' | 'warning' | 'breaking';
258
+ breakingCount: number;
259
+ warningCount: number;
260
+ infoCount: number;
261
+ }
262
+ /**
263
+ * Drift acceptance metadata attached to a baseline.
264
+ */
265
+ export interface DriftAcceptance {
266
+ acceptedAt: string | Date;
267
+ acceptedBy?: string;
268
+ reason?: string;
269
+ acceptedDiff: AcceptedDiff;
270
+ }
271
+ /**
272
+ * Serializable schema evolution data for baselines.
273
+ */
274
+ export interface ResponseSchemaEvolution {
275
+ currentHash: string;
276
+ history: Array<{
277
+ hash: string;
278
+ schema: InferredSchema;
279
+ observedAt: string | Date;
280
+ sampleCount: number;
281
+ }>;
282
+ isStable: boolean;
283
+ stabilityConfidence: number;
284
+ inconsistentFields: string[];
285
+ sampleCount: number;
286
+ }
287
+ /**
288
+ * Serializable documentation score summary for baseline storage.
289
+ */
290
+ export interface DocumentationScoreSummary {
291
+ overallScore: number;
292
+ grade: string;
293
+ issueCount: number;
294
+ toolCount: number;
295
+ }
296
+ /**
297
+ * Canonical baseline format.
298
+ *
299
+ * This is the single baseline schema used by Bellwether CLI.
300
+ *
301
+ * Versioning: Uses CLI package version for compatibility checking.
302
+ * Baselines with the same CLI major version are compatible.
303
+ */
304
+ export interface BellwetherBaseline {
305
+ /** CLI version that generated this baseline (e.g., '1.0.0') */
306
+ version: string;
307
+ /** Generation metadata */
308
+ metadata: BaselineMetadata;
309
+ /** Server fingerprint */
310
+ server: BaselineServerFingerprint;
311
+ /** Discovered capabilities */
312
+ capabilities: {
313
+ tools: ToolCapability[];
314
+ resources?: ResourceCapability[];
315
+ resourceTemplates?: ResourceTemplateCapability[];
316
+ prompts?: PromptCapability[];
317
+ };
318
+ /** Interview results by persona */
319
+ interviews: PersonaInterview[];
320
+ /** Tool behavioral profiles */
321
+ toolProfiles: BaselineToolProfile[];
322
+ /** Workflow results (if workflows were tested) */
323
+ workflows?: WorkflowSignature[];
324
+ /** Overall behavioral assertions */
325
+ assertions: BaselineAssertion[];
326
+ /** Summary of findings */
327
+ summary: string;
328
+ /** SHA-256 hash of content (first 16 chars) for integrity */
329
+ hash: string;
330
+ /** Drift acceptance metadata (optional) */
331
+ acceptance?: DriftAcceptance;
332
+ /** Optional documentation score summary */
333
+ documentationScore?: DocumentationScoreSummary;
334
+ }
335
+ //# sourceMappingURL=baseline-format.d.ts.map
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Baseline types for Bellwether.
3
+ *
4
+ * These types define the canonical baseline format used for:
5
+ * - Local baseline storage
6
+ * - Baseline comparison and drift detection
7
+ * - Tool capability tracking
8
+ *
9
+ * Originally part of a hosted integration, now standalone for open-source use.
10
+ */
11
+ export {};
12
+ //# sourceMappingURL=baseline-format.js.map