@dotsetlabs/bellwether 1.0.3 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +118 -0
- package/README.md +17 -2
- package/dist/auth/credentials.js +2 -0
- package/dist/baseline/accessors.d.ts +1 -1
- package/dist/baseline/accessors.js +13 -3
- package/dist/baseline/baseline-format.d.ts +335 -0
- package/dist/baseline/baseline-format.js +12 -0
- package/dist/baseline/comparator.js +494 -13
- package/dist/baseline/converter.d.ts +15 -15
- package/dist/baseline/converter.js +97 -37
- package/dist/baseline/diff.d.ts +1 -1
- package/dist/baseline/diff.js +45 -28
- package/dist/baseline/error-analyzer.d.ts +1 -1
- package/dist/baseline/error-analyzer.js +90 -17
- package/dist/baseline/incremental-checker.js +8 -5
- package/dist/baseline/index.d.ts +2 -12
- package/dist/baseline/index.js +3 -23
- package/dist/baseline/performance-tracker.d.ts +0 -1
- package/dist/baseline/performance-tracker.js +13 -20
- package/dist/baseline/response-fingerprint.js +40 -3
- package/dist/baseline/saver.js +75 -10
- package/dist/baseline/schema-compare.d.ts +22 -0
- package/dist/baseline/schema-compare.js +259 -16
- package/dist/baseline/types.d.ts +30 -7
- package/dist/cache/response-cache.d.ts +8 -0
- package/dist/cache/response-cache.js +119 -2
- package/dist/cli/commands/baseline.js +70 -35
- package/dist/cli/commands/check.js +71 -15
- package/dist/cli/commands/explore.js +69 -16
- package/dist/cli/commands/init.js +10 -7
- package/dist/cli/commands/watch.js +5 -5
- package/dist/cli/index.js +8 -0
- package/dist/config/loader.js +2 -2
- package/dist/config/template.js +8 -7
- package/dist/config/validator.d.ts +59 -59
- package/dist/config/validator.js +245 -90
- package/dist/constants/core.d.ts +5 -1
- package/dist/constants/core.js +9 -20
- package/dist/constants/registry.d.ts +17 -0
- package/dist/constants/registry.js +18 -0
- package/dist/constants/testing.d.ts +0 -369
- package/dist/constants/testing.js +18 -456
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/discovery/discovery.js +88 -14
- package/dist/discovery/types.d.ts +5 -1
- package/dist/docs/agents.js +138 -50
- package/dist/docs/contract.js +194 -84
- package/dist/docs/report.js +8 -5
- package/dist/errors/retry.js +11 -5
- package/dist/interview/insights.d.ts +17 -0
- package/dist/interview/insights.js +52 -0
- package/dist/interview/interviewer.js +52 -10
- package/dist/interview/prompt-test-generator.d.ts +12 -0
- package/dist/interview/prompt-test-generator.js +77 -0
- package/dist/interview/rate-limiter.js +7 -3
- package/dist/interview/resource-test-generator.d.ts +12 -0
- package/dist/interview/resource-test-generator.js +20 -0
- package/dist/interview/schema-inferrer.js +26 -4
- package/dist/interview/schema-test-generator.js +278 -31
- package/dist/interview/stateful-test-runner.d.ts +3 -0
- package/dist/interview/stateful-test-runner.js +80 -0
- package/dist/interview/types.d.ts +12 -0
- package/dist/llm/anthropic.js +14 -4
- package/dist/llm/fallback.d.ts +1 -0
- package/dist/llm/fallback.js +7 -1
- package/dist/llm/openai.js +15 -4
- package/dist/protocol/index.d.ts +2 -0
- package/dist/protocol/index.js +2 -0
- package/dist/protocol/version-registry.d.ts +66 -0
- package/dist/protocol/version-registry.js +159 -0
- package/dist/transport/http-transport.d.ts +11 -1
- package/dist/transport/http-transport.js +21 -2
- package/dist/transport/mcp-client.d.ts +29 -1
- package/dist/transport/mcp-client.js +93 -8
- package/dist/transport/sse-transport.d.ts +7 -3
- package/dist/transport/sse-transport.js +162 -71
- package/dist/transport/types.d.ts +134 -1
- package/dist/utils/concurrency.d.ts +2 -0
- package/dist/utils/concurrency.js +9 -2
- package/dist/utils/markdown.js +13 -18
- package/dist/utils/timeout.js +2 -1
- package/dist/version.js +1 -1
- package/man/bellwether.1 +1 -1
- package/man/bellwether.1.md +2 -2
- package/package.json +1 -1
- package/schemas/bellwether-check.schema.json +185 -0
- package/schemas/bellwether-explore.schema.json +837 -0
- package/scripts/completions/bellwether.bash +10 -4
- package/scripts/completions/bellwether.zsh +55 -2
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,124 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [2.0.1] - 2026-02-07
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **MCP protocol version gating**: New `src/protocol/` module with version-to-feature-flag mapping
|
|
15
|
+
- Supports MCP protocol versions: `2024-11-05`, `2025-03-26`, `2025-06-18`, `2025-11-25`
|
|
16
|
+
- `MCPFeatureFlags` interface with 9 feature flags (`toolAnnotations`, `entityTitles`, `completions`, `resourceAnnotations`, `structuredOutput`, `serverInstructions`, `httpVersionHeader`, `tasks`, `icons`)
|
|
17
|
+
- `getSharedFeatureFlags(v1, v2)` computes AND-intersection for cross-version baseline comparison
|
|
18
|
+
- All version-specific fields in baselines are now gated by protocol version during conversion and comparison
|
|
19
|
+
- **Version-gated drift detection**: Comparator now detects changes in version-specific fields
|
|
20
|
+
- Tool annotations (readOnlyHint, destructiveHint, idempotentHint, openWorldHint)
|
|
21
|
+
- Entity titles (tool, prompt, resource, and resource template titles)
|
|
22
|
+
- Output schema and structured output changes
|
|
23
|
+
- Execution/task support changes
|
|
24
|
+
- Server instructions changes
|
|
25
|
+
- **MCPClient protocol version tracking**: Client stores negotiated protocol version after `initialize()`, exposes via getters
|
|
26
|
+
- **Mock server protocol version support**: Mock MCP server now supports `MOCK_PROTOCOL_VERSION` env var for testing
|
|
27
|
+
|
|
28
|
+
### Fixed
|
|
29
|
+
|
|
30
|
+
- **20 production-blocking bugs across all layers** (`4717ca1`):
|
|
31
|
+
- Transport: HTTP transport URL construction, SSE error event handling, MCP client error propagation
|
|
32
|
+
- Discovery: ResourceTemplate type handling, discovery error handling
|
|
33
|
+
- Baseline: Converter version-gated field handling, saver hash calculation, comparator severity logic
|
|
34
|
+
- CLI: Check command exit code handling, explore command cleanup, baseline command error paths
|
|
35
|
+
- Config: Environment variable expansion edge cases
|
|
36
|
+
- Docs: Contract and agents generator error handling
|
|
37
|
+
- **Protocol version gating gaps causing false negatives and data loss** (`dce73ed`):
|
|
38
|
+
- Fixed tool title comparison using wrong feature flag (`toolAnnotations` instead of `entityTitles`)
|
|
39
|
+
- Fixed tool title comparison condition (AND → OR) to detect added/removed titles
|
|
40
|
+
- Added missing `execution` and `baselineP99Ms` fields to `ToolFingerprint` type
|
|
41
|
+
- Added missing fields (`title`, `outputSchema`, `outputSchemaHash`, `annotations`, `execution`, `baselineP99Ms`) to `toToolCapability()` accessor — prevents data loss during incremental check merges
|
|
42
|
+
- Added `execution` and `baselineP99Ms` mapping to `getToolFingerprints()` accessor
|
|
43
|
+
- Added prompt title comparison gated by `entityTitles` flag
|
|
44
|
+
- Added resource title comparison gated by `entityTitles` flag
|
|
45
|
+
- Added resource template title comparison gated by `entityTitles` flag
|
|
46
|
+
- Added execution/task support comparison gated by `tasks` flag
|
|
47
|
+
- Added server instructions comparison gated by `serverInstructions` flag
|
|
48
|
+
- Gated resource template `title` in converter by `entityTitles` flag
|
|
49
|
+
- **Clean JSON output from baseline commands** (`7aab450`):
|
|
50
|
+
- `baseline compare --format json` no longer appends summary text after JSON object
|
|
51
|
+
- `baseline diff --format json` no longer prepends header or appends summary text around JSON object
|
|
52
|
+
- JSON output is now machine-parseable without text contamination
|
|
53
|
+
|
|
54
|
+
## [2.0.0] - 2026-02-04
|
|
55
|
+
|
|
56
|
+
### Breaking Changes
|
|
57
|
+
|
|
58
|
+
- **Removed cloud-related baseline modules**: The following exports have been removed from the public API:
|
|
59
|
+
- `ai-compatibility-scorer.ts` - AI compatibility scoring
|
|
60
|
+
- `change-impact-analyzer.ts` - Change impact analysis (`analyzeToolChangeImpact`, `analyzeDiffImpact`, `isBreakingChange`, etc.)
|
|
61
|
+
- `deprecation-tracker.ts` - Deprecation tracking (`checkDeprecations`, `markAsDeprecated`, `getDeprecatedTools`, etc.)
|
|
62
|
+
- `health-scorer.ts` - Health scoring (`calculateHealthScore`, `formatHealthScore`, `HEALTH_SCORING`, etc.)
|
|
63
|
+
- `migration-generator.ts` - Migration guide generation (`generateMigrationGuide`, `formatMigrationGuideMarkdown`, etc.)
|
|
64
|
+
- `pr-comment-generator.ts` - PR comment generation (`generatePRComment`, `generateCompactPRComment`, etc.)
|
|
65
|
+
- `risk-scorer.ts` - Risk scoring (`calculateRiskScore`, `generateRiskScoreMarkdown`, etc.)
|
|
66
|
+
- `scenario-generator.ts` - Auto scenario generation (`generateToolScenarios`, `generateBaselineScenarios`, etc.)
|
|
67
|
+
- `schema-evolution.ts` - Schema evolution timeline (`buildServerTimeline`, `getSchemaChanges`, etc.)
|
|
68
|
+
- `test-pruner.ts` - Test pruning (`calculatePruningDecisions`, `prioritizeTools`, etc.)
|
|
69
|
+
- `cloud-types.ts` - Cloud type definitions
|
|
70
|
+
- `constants/cloud.ts` - Cloud constants
|
|
71
|
+
- **Renamed baseline function**: `createCloudBaseline()` renamed to `createBaselineFromInterview()`
|
|
72
|
+
- **Removed `PERFORMANCE` constant export** from `performance-tracker.ts`
|
|
73
|
+
|
|
74
|
+
### Added
|
|
75
|
+
|
|
76
|
+
- **Deterministic prompt testing**: New `prompt-test-generator.ts` for generating deterministic tests for MCP prompts without requiring LLM calls
|
|
77
|
+
- **Deterministic resource testing**: New `resource-test-generator.ts` for generating deterministic tests for MCP resources
|
|
78
|
+
- **Interview insights module**: New `insights.ts` module with `buildInterviewInsights()` for deriving semantic inferences, schema evolution, and error analysis
|
|
79
|
+
- **Baseline format types**: New `baseline-format.ts` with enhanced types:
|
|
80
|
+
- `PersonaInterview` and `PersonaFinding` for structured interview results
|
|
81
|
+
- `ResourceCapability` and `PromptCapability` for resource/prompt discovery
|
|
82
|
+
- Enhanced `ToolCapability` with observed schema tracking and security fingerprints
|
|
83
|
+
- `ResponseSchemaEvolution` and `DocumentationScoreSummary` types
|
|
84
|
+
- **Registry constants**: New `constants/registry.ts` for MCP Registry integration
|
|
85
|
+
- **Man pages**: Added `man/bellwether.1` and `man/bellwether.1.md` for Unix manual pages
|
|
86
|
+
- **Explore report schema**: New `schemas/bellwether-explore.schema.json` for JSON report validation
|
|
87
|
+
- **JSON schema embedding**: JSON reports now include `$schema` pointer for IDE validation
|
|
88
|
+
- **Expanded behavior aspects**: `BehaviorAspect` type now includes `prompt`, `resource`, `server`, `capability`
|
|
89
|
+
|
|
90
|
+
### Changed
|
|
91
|
+
|
|
92
|
+
- **Simplified baseline system**: Removed cloud-specific baseline logic in favor of a single, self-contained format
|
|
93
|
+
- **Enhanced schema comparison**: Expanded `schema-compare.ts` with improved property-level diff detection
|
|
94
|
+
- **Improved comparator**: Enhanced `comparator.ts` with better change detection and categorization
|
|
95
|
+
- **SSE transport improvements**: Refactored `sse-transport.ts` for better reliability and error handling
|
|
96
|
+
- **Response cache enhancements**: Improved `response-cache.ts` with better TTL management
|
|
97
|
+
- **Interview system refinements**: Updated `interviewer.ts` and `schema-test-generator.ts` for deterministic test merging
|
|
98
|
+
- **Stateful test runner**: Enhanced `stateful-test-runner.ts` with improved state management
|
|
99
|
+
|
|
100
|
+
### Documentation
|
|
101
|
+
|
|
102
|
+
- Updated all CLI documentation for consistency
|
|
103
|
+
- Added JSON schema validation pointers to output format docs
|
|
104
|
+
- Updated GitHub Action examples to v2.0.0
|
|
105
|
+
- Improved baseline and CI/CD documentation
|
|
106
|
+
- Enhanced configuration guide with new options
|
|
107
|
+
|
|
108
|
+
### Internal
|
|
109
|
+
|
|
110
|
+
- Removed ~13,600 lines of cloud-related code
|
|
111
|
+
- Added ~2,600 lines of deterministic testing and baseline improvements
|
|
112
|
+
- Consolidated test files, removing 12 test files for deleted modules
|
|
113
|
+
- Added new tests for prompt/resource generators and enhanced schema comparison
|
|
114
|
+
|
|
115
|
+
### Migration Guide
|
|
116
|
+
|
|
117
|
+
If you were importing from the `@dotsetlabs/bellwether` library API:
|
|
118
|
+
|
|
119
|
+
1. **Baseline functions**: Replace `createCloudBaseline()` with `createBaselineFromInterview()`
|
|
120
|
+
|
|
121
|
+
2. **Removed exports**: The following modules are no longer available. If you depended on them, you'll need to implement alternatives:
|
|
122
|
+
- Health scoring, deprecation tracking, migration generation
|
|
123
|
+
- PR comment generation, risk scoring, scenario generation
|
|
124
|
+
- AI compatibility scoring, test pruning, schema evolution timeline
|
|
125
|
+
|
|
126
|
+
3. **CLI users**: No changes required. The CLI interface remains fully compatible.
|
|
127
|
+
|
|
10
128
|
## [1.0.3] - 2026-02-02
|
|
11
129
|
|
|
12
130
|
### Added
|
package/README.md
CHANGED
|
@@ -75,6 +75,15 @@ jobs:
|
|
|
75
75
|
| Parameter renamed | `path` to `file_path` | Breaking |
|
|
76
76
|
| Description changed | Tool help text updated | Warning |
|
|
77
77
|
| Performance regression | Latency increased >10% | Warning |
|
|
78
|
+
| Tool annotations changed | `readOnlyHint` flipped to `false` | Warning |
|
|
79
|
+
| Output schema changed | Return type structure modified | Warning |
|
|
80
|
+
| Entity title changed | Tool/prompt/resource title updated | Info |
|
|
81
|
+
| Task support changed | Execution mode switched to `async` | Warning |
|
|
82
|
+
| Server instructions changed | Server-level instructions updated | Info |
|
|
83
|
+
| Prompt added/removed | Prompt template appears or disappears | Breaking |
|
|
84
|
+
| Resource changed | Resource URI or MIME type modified | Warning |
|
|
85
|
+
|
|
86
|
+
Comparisons are **protocol-version-aware** — version-specific fields (annotations, titles, output schemas, etc.) are only compared when both baselines support the relevant MCP protocol version.
|
|
78
87
|
|
|
79
88
|
## Commands
|
|
80
89
|
|
|
@@ -105,11 +114,16 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
|
|
|
105
114
|
| `explore` | LLM behavioral testing |
|
|
106
115
|
| `baseline save` | Save test results as baseline |
|
|
107
116
|
| `baseline compare` | Compare against baseline |
|
|
117
|
+
| `baseline show` | Display baseline contents |
|
|
108
118
|
| `baseline accept` | Accept drift as intentional |
|
|
109
119
|
| `baseline diff` | Compare two baselines |
|
|
110
120
|
| `discover` | Show server capabilities |
|
|
111
121
|
| `watch` | Continuous checking on file changes |
|
|
112
122
|
| `registry` | Search MCP Registry |
|
|
123
|
+
| `golden` | Golden output regression testing |
|
|
124
|
+
| `contract` | Contract validation (generate/validate/show) |
|
|
125
|
+
| `auth` | Manage LLM provider API keys |
|
|
126
|
+
| `validate-config` | Validate bellwether.yaml without running tests |
|
|
113
127
|
|
|
114
128
|
## CI/CD Exit Codes
|
|
115
129
|
|
|
@@ -120,13 +134,14 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
|
|
|
120
134
|
| `2` | Warning-level changes | Warn |
|
|
121
135
|
| `3` | Breaking changes | Fail |
|
|
122
136
|
| `4` | Runtime error | Fail |
|
|
137
|
+
| `5` | Low confidence metrics | Warn or fail |
|
|
123
138
|
|
|
124
139
|
## GitHub Action
|
|
125
140
|
|
|
126
141
|
```yaml
|
|
127
|
-
- uses: dotsetlabs/bellwether@
|
|
142
|
+
- uses: dotsetlabs/bellwether@v2.0.0
|
|
128
143
|
with:
|
|
129
|
-
version: '
|
|
144
|
+
version: '2.0.0'
|
|
130
145
|
server-command: 'npx @mcp/your-server'
|
|
131
146
|
baseline-path: './bellwether-baseline.json'
|
|
132
147
|
fail-on-severity: 'warning'
|
package/dist/auth/credentials.js
CHANGED
|
@@ -58,6 +58,8 @@ function readEnvFile(filePath, envVar, options) {
|
|
|
58
58
|
if (decrypted) {
|
|
59
59
|
return decrypted;
|
|
60
60
|
}
|
|
61
|
+
// Warn about decryption failure so users know their credential exists but can't be decrypted
|
|
62
|
+
console.warn(`[bellwether] Encrypted credential found for ${envVar} but decryption failed. Check your encryption key.`);
|
|
61
63
|
return undefined;
|
|
62
64
|
}
|
|
63
65
|
if (value) {
|
package/dist/baseline/accessors.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ToolCapability } from './
|
|
1
|
+
import type { ToolCapability } from './baseline-format.js';
|
|
2
2
|
import type { BehavioralBaseline, ToolFingerprint } from './types.js';
|
|
3
3
|
export declare function getBaselineGeneratedAt(baseline: BehavioralBaseline): Date;
|
|
4
4
|
export declare function getBaselineHash(baseline: BehavioralBaseline): string;
|
package/dist/baseline/accessors.js
CHANGED
|
@@ -57,9 +57,15 @@ export function toToolCapability(tool) {
|
|
|
57
57
|
errorPatterns: tool.errorPatterns,
|
|
58
58
|
baselineP50Ms: tool.baselineP50Ms,
|
|
59
59
|
baselineP95Ms: tool.baselineP95Ms,
|
|
60
|
+
baselineP99Ms: tool.baselineP99Ms,
|
|
60
61
|
baselineSuccessRate: tool.baselineSuccessRate,
|
|
61
62
|
performanceConfidence: tool.performanceConfidence,
|
|
62
63
|
securityFingerprint: tool.securityFingerprint,
|
|
64
|
+
title: tool.title,
|
|
65
|
+
outputSchema: tool.outputSchema,
|
|
66
|
+
outputSchemaHash: tool.outputSchemaHash,
|
|
67
|
+
annotations: tool.annotations,
|
|
68
|
+
execution: tool.execution,
|
|
63
69
|
};
|
|
64
70
|
}
|
|
65
71
|
export function getToolFingerprints(baseline) {
|
|
@@ -73,9 +79,7 @@ export function getToolFingerprints(baseline) {
|
|
|
73
79
|
const limitations = profile?.limitations ?? [];
|
|
74
80
|
const description = tool.description || profile?.description || '';
|
|
75
81
|
const schemaHash = tool.schemaHash || profile?.schemaHash || '';
|
|
76
|
-
const lastTestedAt = tool.lastTestedAt
|
|
77
|
-
? new Date(tool.lastTestedAt)
|
|
78
|
-
: undefined;
|
|
82
|
+
const lastTestedAt = tool.lastTestedAt ? new Date(tool.lastTestedAt) : undefined;
|
|
79
83
|
return {
|
|
80
84
|
name: tool.name,
|
|
81
85
|
description,
|
|
@@ -95,6 +99,12 @@ export function getToolFingerprints(baseline) {
|
|
|
95
99
|
baselineSuccessRate: tool.baselineSuccessRate,
|
|
96
100
|
performanceConfidence: tool.performanceConfidence,
|
|
97
101
|
securityFingerprint: tool.securityFingerprint,
|
|
102
|
+
title: tool.title,
|
|
103
|
+
outputSchema: tool.outputSchema,
|
|
104
|
+
outputSchemaHash: tool.outputSchemaHash,
|
|
105
|
+
annotations: tool.annotations,
|
|
106
|
+
execution: tool.execution,
|
|
107
|
+
baselineP99Ms: tool.baselineP99Ms,
|
|
98
108
|
};
|
|
99
109
|
});
|
|
100
110
|
if (fingerprints.length > 0) {
|
package/dist/baseline/baseline-format.d.ts
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline types for Bellwether.
|
|
3
|
+
*
|
|
4
|
+
* These types define the canonical baseline format used for:
|
|
5
|
+
* - Local baseline storage
|
|
6
|
+
* - Baseline comparison and drift detection
|
|
7
|
+
* - Tool capability tracking
|
|
8
|
+
*
|
|
9
|
+
* Originally part of a hosted integration, now standalone for open-source use.
|
|
10
|
+
*/
|
|
11
|
+
import type { WorkflowSignature } from './types.js';
|
|
12
|
+
import type { ResponseFingerprint, InferredSchema, ErrorPattern } from './response-fingerprint.js';
|
|
13
|
+
import type { SecurityFingerprint } from '../security/types.js';
|
|
14
|
+
/**
|
|
15
|
+
* Assertion type for baseline assertions.
|
|
16
|
+
* Maps to: expects (positive), requires (critical), warns (negative), notes (informational)
|
|
17
|
+
*/
|
|
18
|
+
export type BaselineAssertionType = 'expects' | 'requires' | 'warns' | 'notes';
|
|
19
|
+
/**
|
|
20
|
+
* Severity level for assertions.
|
|
21
|
+
*/
|
|
22
|
+
export type BaselineAssertionSeverity = 'info' | 'low' | 'medium' | 'high' | 'critical';
|
|
23
|
+
/**
|
|
24
|
+
* Behavioral assertion in baseline format.
|
|
25
|
+
*/
|
|
26
|
+
export interface BaselineAssertion {
|
|
27
|
+
/** Type of assertion */
|
|
28
|
+
type: BaselineAssertionType;
|
|
29
|
+
/** The condition/assertion statement */
|
|
30
|
+
condition: string;
|
|
31
|
+
/** Tool this assertion relates to (optional) */
|
|
32
|
+
tool?: string;
|
|
33
|
+
/** Severity level (optional) */
|
|
34
|
+
severity?: BaselineAssertionSeverity;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* Baseline mode indicating how the baseline was generated.
|
|
38
|
+
* - 'check': Deterministic structural testing (no LLM required)
|
|
39
|
+
* - 'explore': LLM-powered behavioral exploration
|
|
40
|
+
*/
|
|
41
|
+
export type BaselineMode = 'check' | 'explore';
|
|
42
|
+
/**
|
|
43
|
+
* Metadata about how the baseline was generated.
|
|
44
|
+
*/
|
|
45
|
+
export interface BaselineMetadata {
|
|
46
|
+
/** Baseline mode: 'check' = deterministic, 'explore' = LLM-powered */
|
|
47
|
+
mode: BaselineMode;
|
|
48
|
+
/** ISO timestamp when generated */
|
|
49
|
+
generatedAt: string;
|
|
50
|
+
/** CLI version that generated this baseline */
|
|
51
|
+
cliVersion: string;
|
|
52
|
+
/** Command used to start the server */
|
|
53
|
+
serverCommand: string;
|
|
54
|
+
/** Server name from MCP initialization */
|
|
55
|
+
serverName?: string;
|
|
56
|
+
/** Interview duration in milliseconds */
|
|
57
|
+
durationMs: number;
|
|
58
|
+
/** Personas used during interview (empty for check mode) */
|
|
59
|
+
personas: string[];
|
|
60
|
+
/** LLM model used ('none' for check mode) */
|
|
61
|
+
model: string;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Server fingerprint in baseline format.
|
|
65
|
+
*/
|
|
66
|
+
export interface BaselineServerFingerprint {
|
|
67
|
+
/** Server name */
|
|
68
|
+
name: string;
|
|
69
|
+
/** Server version */
|
|
70
|
+
version: string;
|
|
71
|
+
/** MCP protocol version */
|
|
72
|
+
protocolVersion: string;
|
|
73
|
+
/** Available capabilities */
|
|
74
|
+
capabilities: string[];
|
|
75
|
+
/** Server-provided instructions (MCP 2025-11-25) */
|
|
76
|
+
instructions?: string;
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Tool capability from discovery.
|
|
80
|
+
*/
|
|
81
|
+
export interface ToolCapability {
|
|
82
|
+
/** Tool name */
|
|
83
|
+
name: string;
|
|
84
|
+
/** Tool description */
|
|
85
|
+
description: string;
|
|
86
|
+
/** Input schema */
|
|
87
|
+
inputSchema: Record<string, unknown>;
|
|
88
|
+
/** Hash of the schema for change detection */
|
|
89
|
+
schemaHash: string;
|
|
90
|
+
/** Human-readable title for the tool (MCP 2025-11-25) */
|
|
91
|
+
title?: string;
|
|
92
|
+
/** JSON Schema for the tool's output (MCP 2025-11-25 structured content) */
|
|
93
|
+
outputSchema?: Record<string, unknown>;
|
|
94
|
+
/** Hash of the output schema for drift detection */
|
|
95
|
+
outputSchemaHash?: string;
|
|
96
|
+
/** Behavioral annotations/hints (MCP 2025-11-25) */
|
|
97
|
+
annotations?: {
|
|
98
|
+
title?: string;
|
|
99
|
+
readOnlyHint?: boolean;
|
|
100
|
+
destructiveHint?: boolean;
|
|
101
|
+
idempotentHint?: boolean;
|
|
102
|
+
openWorldHint?: boolean;
|
|
103
|
+
};
|
|
104
|
+
/** Task execution configuration (MCP 2025-11-25) */
|
|
105
|
+
execution?: {
|
|
106
|
+
taskSupport?: string;
|
|
107
|
+
};
|
|
108
|
+
/** Hash of observed arguments schema (from actual calls) */
|
|
109
|
+
observedArgsSchemaHash?: string;
|
|
110
|
+
/** Consistency of observed argument schemas (0-1) */
|
|
111
|
+
observedArgsSchemaConsistency?: number;
|
|
112
|
+
/** Number of observed schema variations */
|
|
113
|
+
observedArgsSchemaVariations?: number;
|
|
114
|
+
/** Fingerprint of the tool's response structure */
|
|
115
|
+
responseFingerprint?: ResponseFingerprint;
|
|
116
|
+
/** Inferred JSON schema of the tool's output */
|
|
117
|
+
inferredOutputSchema?: InferredSchema;
|
|
118
|
+
/** Normalized error patterns observed during testing */
|
|
119
|
+
errorPatterns?: ErrorPattern[];
|
|
120
|
+
/** Baseline p50 latency in milliseconds */
|
|
121
|
+
baselineP50Ms?: number;
|
|
122
|
+
/** Baseline p95 latency in milliseconds */
|
|
123
|
+
baselineP95Ms?: number;
|
|
124
|
+
/** Baseline p99 latency in milliseconds */
|
|
125
|
+
baselineP99Ms?: number;
|
|
126
|
+
/** Baseline success rate (0-1) */
|
|
127
|
+
baselineSuccessRate?: number;
|
|
128
|
+
/** Response schema evolution metadata */
|
|
129
|
+
responseSchemaEvolution?: ResponseSchemaEvolution;
|
|
130
|
+
/** ISO timestamp of last time this tool was tested */
|
|
131
|
+
lastTestedAt?: string;
|
|
132
|
+
/** Schema hash captured at the last test time */
|
|
133
|
+
inputSchemaHashAtTest?: string;
|
|
134
|
+
/** Statistical confidence for performance baselines */
|
|
135
|
+
performanceConfidence?: {
|
|
136
|
+
sampleCount: number;
|
|
137
|
+
successfulSamples: number;
|
|
138
|
+
validationSamples: number;
|
|
139
|
+
totalTests: number;
|
|
140
|
+
standardDeviation: number;
|
|
141
|
+
coefficientOfVariation: number;
|
|
142
|
+
confidenceLevel: 'low' | 'medium' | 'high';
|
|
143
|
+
recommendation?: string;
|
|
144
|
+
};
|
|
145
|
+
/** Security testing fingerprint with findings and risk score */
|
|
146
|
+
securityFingerprint?: SecurityFingerprint;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Resource capability from discovery.
|
|
150
|
+
*/
|
|
151
|
+
export interface ResourceCapability {
|
|
152
|
+
/** Resource URI */
|
|
153
|
+
uri: string;
|
|
154
|
+
/** Resource name */
|
|
155
|
+
name: string;
|
|
156
|
+
/** Resource description */
|
|
157
|
+
description?: string;
|
|
158
|
+
/** MIME type */
|
|
159
|
+
mimeType?: string;
|
|
160
|
+
/** Human-readable title (MCP 2025-11-25) */
|
|
161
|
+
title?: string;
|
|
162
|
+
/** Resource annotations (MCP 2025-11-25) */
|
|
163
|
+
annotations?: {
|
|
164
|
+
audience?: string[];
|
|
165
|
+
priority?: number;
|
|
166
|
+
lastModified?: string;
|
|
167
|
+
};
|
|
168
|
+
/** Resource size in bytes (MCP 2025-11-25) */
|
|
169
|
+
size?: number;
|
|
170
|
+
}
|
|
171
|
+
/**
|
|
172
|
+
* Resource template capability from discovery (MCP 2025-11-25).
|
|
173
|
+
*/
|
|
174
|
+
export interface ResourceTemplateCapability {
|
|
175
|
+
/** URI template (RFC 6570) */
|
|
176
|
+
uriTemplate: string;
|
|
177
|
+
/** Template name */
|
|
178
|
+
name: string;
|
|
179
|
+
/** Human-readable title */
|
|
180
|
+
title?: string;
|
|
181
|
+
/** Template description */
|
|
182
|
+
description?: string;
|
|
183
|
+
/** Expected MIME type */
|
|
184
|
+
mimeType?: string;
|
|
185
|
+
}
|
|
186
|
+
/**
|
|
187
|
+
* Prompt capability from discovery.
|
|
188
|
+
*/
|
|
189
|
+
export interface PromptCapability {
|
|
190
|
+
/** Prompt name */
|
|
191
|
+
name: string;
|
|
192
|
+
/** Prompt description */
|
|
193
|
+
description?: string;
|
|
194
|
+
/** Human-readable title (MCP 2025-11-25) */
|
|
195
|
+
title?: string;
|
|
196
|
+
/** Arguments the prompt accepts */
|
|
197
|
+
arguments?: Array<{
|
|
198
|
+
name: string;
|
|
199
|
+
description?: string;
|
|
200
|
+
required?: boolean;
|
|
201
|
+
}>;
|
|
202
|
+
}
|
|
203
|
+
/**
|
|
204
|
+
* Interview results for a single persona.
|
|
205
|
+
*/
|
|
206
|
+
export interface PersonaInterview {
|
|
207
|
+
/** Persona ID */
|
|
208
|
+
persona: string;
|
|
209
|
+
/** Number of tools interviewed */
|
|
210
|
+
toolsInterviewed: number;
|
|
211
|
+
/** Number of questions asked */
|
|
212
|
+
questionsAsked: number;
|
|
213
|
+
/** Findings from this persona */
|
|
214
|
+
findings: PersonaFinding[];
|
|
215
|
+
}
|
|
216
|
+
/**
|
|
217
|
+
* A finding from a persona interview.
|
|
218
|
+
*/
|
|
219
|
+
export interface PersonaFinding {
|
|
220
|
+
/** Tool this finding relates to */
|
|
221
|
+
tool: string;
|
|
222
|
+
/** Finding category */
|
|
223
|
+
category: 'behavior' | 'security' | 'reliability' | 'edge_case';
|
|
224
|
+
/** Severity level */
|
|
225
|
+
severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
|
|
226
|
+
/** Description of the finding */
|
|
227
|
+
description: string;
|
|
228
|
+
/** Evidence supporting the finding */
|
|
229
|
+
evidence?: string;
|
|
230
|
+
}
|
|
231
|
+
/**
|
|
232
|
+
* Tool behavioral profile in baseline format.
|
|
233
|
+
*/
|
|
234
|
+
export interface BaselineToolProfile {
|
|
235
|
+
/** Tool name */
|
|
236
|
+
name: string;
|
|
237
|
+
/** Tool description */
|
|
238
|
+
description: string;
|
|
239
|
+
/** Hash of input schema */
|
|
240
|
+
schemaHash: string;
|
|
241
|
+
/** Behavioral assertions */
|
|
242
|
+
assertions: BaselineAssertion[];
|
|
243
|
+
/** Security notes */
|
|
244
|
+
securityNotes: string[];
|
|
245
|
+
/** Known limitations */
|
|
246
|
+
limitations: string[];
|
|
247
|
+
/** Behavioral notes */
|
|
248
|
+
behavioralNotes: string[];
|
|
249
|
+
}
|
|
250
|
+
/**
|
|
251
|
+
* Snapshot of accepted drift for a baseline.
|
|
252
|
+
*/
|
|
253
|
+
export interface AcceptedDiff {
|
|
254
|
+
toolsAdded: string[];
|
|
255
|
+
toolsRemoved: string[];
|
|
256
|
+
toolsModified: string[];
|
|
257
|
+
severity: 'none' | 'info' | 'warning' | 'breaking';
|
|
258
|
+
breakingCount: number;
|
|
259
|
+
warningCount: number;
|
|
260
|
+
infoCount: number;
|
|
261
|
+
}
|
|
262
|
+
/**
|
|
263
|
+
* Drift acceptance metadata attached to a baseline.
|
|
264
|
+
*/
|
|
265
|
+
export interface DriftAcceptance {
|
|
266
|
+
acceptedAt: string | Date;
|
|
267
|
+
acceptedBy?: string;
|
|
268
|
+
reason?: string;
|
|
269
|
+
acceptedDiff: AcceptedDiff;
|
|
270
|
+
}
|
|
271
|
+
/**
|
|
272
|
+
* Serializable schema evolution data for baselines.
|
|
273
|
+
*/
|
|
274
|
+
export interface ResponseSchemaEvolution {
|
|
275
|
+
currentHash: string;
|
|
276
|
+
history: Array<{
|
|
277
|
+
hash: string;
|
|
278
|
+
schema: InferredSchema;
|
|
279
|
+
observedAt: string | Date;
|
|
280
|
+
sampleCount: number;
|
|
281
|
+
}>;
|
|
282
|
+
isStable: boolean;
|
|
283
|
+
stabilityConfidence: number;
|
|
284
|
+
inconsistentFields: string[];
|
|
285
|
+
sampleCount: number;
|
|
286
|
+
}
|
|
287
|
+
/**
|
|
288
|
+
* Serializable documentation score summary for baseline storage.
|
|
289
|
+
*/
|
|
290
|
+
export interface DocumentationScoreSummary {
|
|
291
|
+
overallScore: number;
|
|
292
|
+
grade: string;
|
|
293
|
+
issueCount: number;
|
|
294
|
+
toolCount: number;
|
|
295
|
+
}
|
|
296
|
+
/**
|
|
297
|
+
* Canonical baseline format.
|
|
298
|
+
*
|
|
299
|
+
* This is the single baseline schema used by Bellwether CLI.
|
|
300
|
+
*
|
|
301
|
+
* Versioning: Uses CLI package version for compatibility checking.
|
|
302
|
+
* Baselines with the same CLI major version are compatible.
|
|
303
|
+
*/
|
|
304
|
+
export interface BellwetherBaseline {
|
|
305
|
+
/** CLI version that generated this baseline (e.g., '1.0.0') */
|
|
306
|
+
version: string;
|
|
307
|
+
/** Generation metadata */
|
|
308
|
+
metadata: BaselineMetadata;
|
|
309
|
+
/** Server fingerprint */
|
|
310
|
+
server: BaselineServerFingerprint;
|
|
311
|
+
/** Discovered capabilities */
|
|
312
|
+
capabilities: {
|
|
313
|
+
tools: ToolCapability[];
|
|
314
|
+
resources?: ResourceCapability[];
|
|
315
|
+
resourceTemplates?: ResourceTemplateCapability[];
|
|
316
|
+
prompts?: PromptCapability[];
|
|
317
|
+
};
|
|
318
|
+
/** Interview results by persona */
|
|
319
|
+
interviews: PersonaInterview[];
|
|
320
|
+
/** Tool behavioral profiles */
|
|
321
|
+
toolProfiles: BaselineToolProfile[];
|
|
322
|
+
/** Workflow results (if workflows were tested) */
|
|
323
|
+
workflows?: WorkflowSignature[];
|
|
324
|
+
/** Overall behavioral assertions */
|
|
325
|
+
assertions: BaselineAssertion[];
|
|
326
|
+
/** Summary of findings */
|
|
327
|
+
summary: string;
|
|
328
|
+
/** SHA-256 hash of content (first 16 chars) for integrity */
|
|
329
|
+
hash: string;
|
|
330
|
+
/** Drift acceptance metadata (optional) */
|
|
331
|
+
acceptance?: DriftAcceptance;
|
|
332
|
+
/** Optional documentation score summary */
|
|
333
|
+
documentationScore?: DocumentationScoreSummary;
|
|
334
|
+
}
|
|
335
|
+
//# sourceMappingURL=baseline-format.d.ts.map
|
package/dist/baseline/baseline-format.js
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline types for Bellwether.
|
|
3
|
+
*
|
|
4
|
+
* These types define the canonical baseline format used for:
|
|
5
|
+
* - Local baseline storage
|
|
6
|
+
* - Baseline comparison and drift detection
|
|
7
|
+
* - Tool capability tracking
|
|
8
|
+
*
|
|
9
|
+
* Originally part of a hosted integration, now standalone for open-source use.
|
|
10
|
+
*/
|
|
11
|
+
export {};
|
|
12
|
+
//# sourceMappingURL=baseline-format.js.map
|