@dotsetlabs/bellwether 1.0.2 → 2.0.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +97 -0
- package/README.md +9 -2
- package/dist/baseline/accessors.d.ts +1 -1
- package/dist/baseline/accessors.js +1 -3
- package/dist/baseline/baseline-format.d.ts +287 -0
- package/dist/baseline/baseline-format.js +12 -0
- package/dist/baseline/comparator.js +249 -11
- package/dist/baseline/converter.d.ts +15 -15
- package/dist/baseline/converter.js +46 -34
- package/dist/baseline/diff.d.ts +1 -1
- package/dist/baseline/diff.js +45 -28
- package/dist/baseline/error-analyzer.d.ts +1 -1
- package/dist/baseline/error-analyzer.js +90 -17
- package/dist/baseline/incremental-checker.js +8 -5
- package/dist/baseline/index.d.ts +2 -12
- package/dist/baseline/index.js +3 -23
- package/dist/baseline/performance-tracker.d.ts +0 -1
- package/dist/baseline/performance-tracker.js +13 -20
- package/dist/baseline/response-fingerprint.js +39 -2
- package/dist/baseline/saver.js +41 -10
- package/dist/baseline/schema-compare.d.ts +22 -0
- package/dist/baseline/schema-compare.js +259 -16
- package/dist/baseline/types.d.ts +10 -7
- package/dist/cache/response-cache.d.ts +12 -2
- package/dist/cache/response-cache.js +178 -30
- package/dist/cli/commands/check.js +100 -54
- package/dist/cli/commands/explore.js +34 -14
- package/dist/cli/index.js +13 -3
- package/dist/config/template.js +8 -7
- package/dist/config/validator.d.ts +59 -59
- package/dist/config/validator.js +245 -90
- package/dist/constants/core.d.ts +4 -0
- package/dist/constants/core.js +8 -19
- package/dist/constants/registry.d.ts +17 -0
- package/dist/constants/registry.js +18 -0
- package/dist/constants/testing.d.ts +0 -369
- package/dist/constants/testing.js +18 -456
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/docs/contract.js +131 -83
- package/dist/docs/report.js +8 -5
- package/dist/interview/insights.d.ts +17 -0
- package/dist/interview/insights.js +52 -0
- package/dist/interview/interviewer.js +119 -57
- package/dist/interview/orchestrator.js +49 -22
- package/dist/interview/prompt-test-generator.d.ts +12 -0
- package/dist/interview/prompt-test-generator.js +77 -0
- package/dist/interview/resource-test-generator.d.ts +12 -0
- package/dist/interview/resource-test-generator.js +20 -0
- package/dist/interview/schema-inferrer.js +26 -4
- package/dist/interview/schema-test-generator.js +278 -31
- package/dist/interview/stateful-test-runner.d.ts +3 -0
- package/dist/interview/stateful-test-runner.js +80 -0
- package/dist/interview/types.d.ts +12 -0
- package/dist/llm/anthropic.js +49 -16
- package/dist/llm/client.d.ts +2 -0
- package/dist/llm/client.js +61 -0
- package/dist/llm/ollama.js +9 -4
- package/dist/llm/openai.js +34 -23
- package/dist/transport/base-transport.d.ts +1 -1
- package/dist/transport/http-transport.d.ts +2 -2
- package/dist/transport/http-transport.js +26 -6
- package/dist/transport/mcp-client.d.ts +18 -6
- package/dist/transport/mcp-client.js +50 -20
- package/dist/transport/sse-transport.d.ts +8 -4
- package/dist/transport/sse-transport.js +161 -69
- package/dist/transport/stdio-transport.d.ts +1 -1
- package/dist/transport/stdio-transport.js +1 -1
- package/dist/utils/timeout.d.ts +10 -2
- package/dist/utils/timeout.js +9 -5
- package/dist/version.js +1 -1
- package/dist/workflow/executor.js +18 -13
- package/dist/workflow/loader.js +4 -1
- package/dist/workflow/state-tracker.js +22 -18
- package/man/bellwether.1 +204 -0
- package/man/bellwether.1.md +148 -0
- package/package.json +6 -7
- package/schemas/bellwether-check.schema.json +185 -0
- package/schemas/bellwether-explore.schema.json +837 -0
- package/scripts/completions/bellwether.bash +10 -4
- package/scripts/completions/bellwether.zsh +55 -2
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,103 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [2.0.0] - 2026-02-04
|
|
11
|
+
|
|
12
|
+
### Breaking Changes
|
|
13
|
+
|
|
14
|
+
- **Removed cloud-related baseline modules**: The following exports have been removed from the public API:
|
|
15
|
+
- `ai-compatibility-scorer.ts` - AI compatibility scoring
|
|
16
|
+
- `change-impact-analyzer.ts` - Change impact analysis (`analyzeToolChangeImpact`, `analyzeDiffImpact`, `isBreakingChange`, etc.)
|
|
17
|
+
- `deprecation-tracker.ts` - Deprecation tracking (`checkDeprecations`, `markAsDeprecated`, `getDeprecatedTools`, etc.)
|
|
18
|
+
- `health-scorer.ts` - Health scoring (`calculateHealthScore`, `formatHealthScore`, `HEALTH_SCORING`, etc.)
|
|
19
|
+
- `migration-generator.ts` - Migration guide generation (`generateMigrationGuide`, `formatMigrationGuideMarkdown`, etc.)
|
|
20
|
+
- `pr-comment-generator.ts` - PR comment generation (`generatePRComment`, `generateCompactPRComment`, etc.)
|
|
21
|
+
- `risk-scorer.ts` - Risk scoring (`calculateRiskScore`, `generateRiskScoreMarkdown`, etc.)
|
|
22
|
+
- `scenario-generator.ts` - Auto scenario generation (`generateToolScenarios`, `generateBaselineScenarios`, etc.)
|
|
23
|
+
- `schema-evolution.ts` - Schema evolution timeline (`buildServerTimeline`, `getSchemaChanges`, etc.)
|
|
24
|
+
- `test-pruner.ts` - Test pruning (`calculatePruningDecisions`, `prioritizeTools`, etc.)
|
|
25
|
+
- `cloud-types.ts` - Cloud type definitions
|
|
26
|
+
- `constants/cloud.ts` - Cloud constants
|
|
27
|
+
- **Renamed baseline function**: `createCloudBaseline()` renamed to `createBaselineFromInterview()`
|
|
28
|
+
- **Removed `PERFORMANCE` constant export** from `performance-tracker.ts`
|
|
29
|
+
|
|
30
|
+
### Added
|
|
31
|
+
|
|
32
|
+
- **Deterministic prompt testing**: New `prompt-test-generator.ts` for generating deterministic tests for MCP prompts without requiring LLM calls
|
|
33
|
+
- **Deterministic resource testing**: New `resource-test-generator.ts` for generating deterministic tests for MCP resources
|
|
34
|
+
- **Interview insights module**: New `insights.ts` module with `buildInterviewInsights()` for deriving semantic inferences, schema evolution, and error analysis
|
|
35
|
+
- **Baseline format types**: New `baseline-format.ts` with enhanced types:
|
|
36
|
+
- `PersonaInterview` and `PersonaFinding` for structured interview results
|
|
37
|
+
- `ResourceCapability` and `PromptCapability` for resource/prompt discovery
|
|
38
|
+
- Enhanced `ToolCapability` with observed schema tracking and security fingerprints
|
|
39
|
+
- `ResponseSchemaEvolution` and `DocumentationScoreSummary` types
|
|
40
|
+
- **Registry constants**: New `constants/registry.ts` for MCP Registry integration
|
|
41
|
+
- **Man pages**: Added `man/bellwether.1` and `man/bellwether.1.md` for Unix manual pages
|
|
42
|
+
- **Explore report schema**: New `schemas/bellwether-explore.schema.json` for JSON report validation
|
|
43
|
+
- **JSON schema embedding**: JSON reports now include a `$schema` pointer for IDE validation
|
|
44
|
+
- **Expanded behavior aspects**: `BehaviorAspect` type now includes `prompt`, `resource`, `server`, `capability`
|
|
45
|
+
|
|
46
|
+
### Changed
|
|
47
|
+
|
|
48
|
+
- **Simplified baseline system**: Removed cloud-specific baseline logic in favor of a single, self-contained format
|
|
49
|
+
- **Enhanced schema comparison**: Expanded `schema-compare.ts` with improved property-level diff detection
|
|
50
|
+
- **Improved comparator**: Enhanced `comparator.ts` with better change detection and categorization
|
|
51
|
+
- **SSE transport improvements**: Refactored `sse-transport.ts` for better reliability and error handling
|
|
52
|
+
- **Response cache enhancements**: Improved `response-cache.ts` with better TTL management
|
|
53
|
+
- **Interview system refinements**: Updated `interviewer.ts` and `schema-test-generator.ts` for deterministic test merging
|
|
54
|
+
- **Stateful test runner**: Enhanced `stateful-test-runner.ts` with improved state management
|
|
55
|
+
|
|
56
|
+
### Documentation
|
|
57
|
+
|
|
58
|
+
- Updated all CLI documentation for consistency
|
|
59
|
+
- Added JSON schema validation pointers to output format docs
|
|
60
|
+
- Updated GitHub Action examples to v2.0.0
|
|
61
|
+
- Improved baseline and CI/CD documentation
|
|
62
|
+
- Enhanced configuration guide with new options
|
|
63
|
+
|
|
64
|
+
### Internal
|
|
65
|
+
|
|
66
|
+
- Removed ~13,600 lines of cloud-related code
|
|
67
|
+
- Added ~2,600 lines of deterministic testing and baseline improvements
|
|
68
|
+
- Consolidated test files, removing 12 test files for deleted modules
|
|
69
|
+
- Added new tests for prompt/resource generators and enhanced schema comparison
|
|
70
|
+
|
|
71
|
+
### Migration Guide
|
|
72
|
+
|
|
73
|
+
If you were importing from the `@dotsetlabs/bellwether` library API:
|
|
74
|
+
|
|
75
|
+
1. **Baseline functions**: Replace `createCloudBaseline()` with `createBaselineFromInterview()`
|
|
76
|
+
|
|
77
|
+
2. **Removed exports**: The following modules are no longer available. If you depended on them, you'll need to implement alternatives:
|
|
78
|
+
- Health scoring, deprecation tracking, migration generation
|
|
79
|
+
- PR comment generation, risk scoring, scenario generation
|
|
80
|
+
- AI compatibility scoring, test pruning, schema evolution timeline
|
|
81
|
+
|
|
82
|
+
3. **CLI users**: No changes required. The CLI interface remains fully compatible.
|
|
83
|
+
|
|
84
|
+
## [1.0.3] - 2026-02-02
|
|
85
|
+
|
|
86
|
+
### Added
|
|
87
|
+
|
|
88
|
+
- Added `version` input to GitHub Action for explicit npm version selection
|
|
89
|
+
- Action now derives version from ref (e.g., `v1.0.3`) or accepts explicit `inputs.version`
|
|
90
|
+
- Provides clear error message when version cannot be determined
|
|
91
|
+
- Added `signal` option to LLM completion requests for request cancellation via AbortSignal
|
|
92
|
+
- Added AbortController integration to timeout utilities for proper request cancellation
|
|
93
|
+
- Added JSON extraction from mixed LLM responses (handles prose around JSON blocks)
|
|
94
|
+
|
|
95
|
+
### Changed
|
|
96
|
+
|
|
97
|
+
- Improved timeout handling with AbortController propagation across LLM and transport layers
|
|
98
|
+
- Improved error handling and resource cleanup in interview, orchestrator, and transport modules
|
|
99
|
+
- Refactored response cache, workflow executor, and state tracker for better reliability
|
|
100
|
+
- Updated CI/CD and GitHub/GitLab integration documentation
|
|
101
|
+
|
|
102
|
+
### Fixed
|
|
103
|
+
|
|
104
|
+
- Fixed GitHub Action stderr handling in check command output capture
|
|
105
|
+
- Fixed various code formatting and linting issues across LLM clients and transport modules
|
|
106
|
+
|
|
10
107
|
## [1.0.2] - 2026-01-30
|
|
11
108
|
|
|
12
109
|
### Added
|
package/README.md
CHANGED
|
@@ -105,11 +105,16 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
|
|
|
105
105
|
| `explore` | LLM behavioral testing |
|
|
106
106
|
| `baseline save` | Save test results as baseline |
|
|
107
107
|
| `baseline compare` | Compare against baseline |
|
|
108
|
+
| `baseline show` | Display baseline contents |
|
|
108
109
|
| `baseline accept` | Accept drift as intentional |
|
|
109
110
|
| `baseline diff` | Compare two baselines |
|
|
110
111
|
| `discover` | Show server capabilities |
|
|
111
112
|
| `watch` | Continuous checking on file changes |
|
|
112
113
|
| `registry` | Search MCP Registry |
|
|
114
|
+
| `golden` | Golden output regression testing |
|
|
115
|
+
| `contract` | Contract validation (generate/validate/show) |
|
|
116
|
+
| `auth` | Manage LLM provider API keys |
|
|
117
|
+
| `validate-config` | Validate bellwether.yaml without running tests |
|
|
113
118
|
|
|
114
119
|
## CI/CD Exit Codes
|
|
115
120
|
|
|
@@ -120,12 +125,14 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
|
|
|
120
125
|
| `2` | Warning-level changes | Warn |
|
|
121
126
|
| `3` | Breaking changes | Fail |
|
|
122
127
|
| `4` | Runtime error | Fail |
|
|
128
|
+
| `5` | Low confidence metrics | Warn or fail |
|
|
123
129
|
|
|
124
130
|
## GitHub Action
|
|
125
131
|
|
|
126
132
|
```yaml
|
|
127
|
-
- uses: dotsetlabs/bellwether@
|
|
133
|
+
- uses: dotsetlabs/bellwether@v2.0.0
|
|
128
134
|
with:
|
|
135
|
+
version: '2.0.0'
|
|
129
136
|
server-command: 'npx @mcp/your-server'
|
|
130
137
|
baseline-path: './bellwether-baseline.json'
|
|
131
138
|
fail-on-severity: 'warning'
|
|
@@ -167,7 +174,7 @@ bellwether init --preset local npx @mcp/server # Local Ollama (free)
|
|
|
167
174
|
|
|
168
175
|
```bash
|
|
169
176
|
git clone https://github.com/dotsetlabs/bellwether
|
|
170
|
-
cd bellwether
|
|
177
|
+
cd bellwether
|
|
171
178
|
npm install
|
|
172
179
|
npm run build
|
|
173
180
|
npm test
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ToolCapability } from './
|
|
1
|
+
import type { ToolCapability } from './baseline-format.js';
|
|
2
2
|
import type { BehavioralBaseline, ToolFingerprint } from './types.js';
|
|
3
3
|
export declare function getBaselineGeneratedAt(baseline: BehavioralBaseline): Date;
|
|
4
4
|
export declare function getBaselineHash(baseline: BehavioralBaseline): string;
|
|
@@ -73,9 +73,7 @@ export function getToolFingerprints(baseline) {
|
|
|
73
73
|
const limitations = profile?.limitations ?? [];
|
|
74
74
|
const description = tool.description || profile?.description || '';
|
|
75
75
|
const schemaHash = tool.schemaHash || profile?.schemaHash || '';
|
|
76
|
-
const lastTestedAt = tool.lastTestedAt
|
|
77
|
-
? new Date(tool.lastTestedAt)
|
|
78
|
-
: undefined;
|
|
76
|
+
const lastTestedAt = tool.lastTestedAt ? new Date(tool.lastTestedAt) : undefined;
|
|
79
77
|
return {
|
|
80
78
|
name: tool.name,
|
|
81
79
|
description,
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline types for Bellwether.
|
|
3
|
+
*
|
|
4
|
+
* These types define the canonical baseline format used for:
|
|
5
|
+
* - Local baseline storage
|
|
6
|
+
* - Baseline comparison and drift detection
|
|
7
|
+
* - Tool capability tracking
|
|
8
|
+
*
|
|
9
|
+
* Originally part of a hosted integration, now standalone for open-source use.
|
|
10
|
+
*/
|
|
11
|
+
import type { WorkflowSignature } from './types.js';
|
|
12
|
+
import type { ResponseFingerprint, InferredSchema, ErrorPattern } from './response-fingerprint.js';
|
|
13
|
+
import type { SecurityFingerprint } from '../security/types.js';
|
|
14
|
+
/**
|
|
15
|
+
* Assertion type for baseline assertions.
|
|
16
|
+
* Maps to: expects (positive), requires (critical), warns (negative), notes (informational)
|
|
17
|
+
*/
|
|
18
|
+
export type BaselineAssertionType = 'expects' | 'requires' | 'warns' | 'notes';
|
|
19
|
+
/**
|
|
20
|
+
* Severity level for assertions.
|
|
21
|
+
*/
|
|
22
|
+
export type BaselineAssertionSeverity = 'info' | 'low' | 'medium' | 'high' | 'critical';
|
|
23
|
+
/**
|
|
24
|
+
* Behavioral assertion in baseline format.
|
|
25
|
+
*/
|
|
26
|
+
export interface BaselineAssertion {
|
|
27
|
+
/** Type of assertion */
|
|
28
|
+
type: BaselineAssertionType;
|
|
29
|
+
/** The condition/assertion statement */
|
|
30
|
+
condition: string;
|
|
31
|
+
/** Tool this assertion relates to (optional) */
|
|
32
|
+
tool?: string;
|
|
33
|
+
/** Severity level (optional) */
|
|
34
|
+
severity?: BaselineAssertionSeverity;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* Baseline mode indicating how the baseline was generated.
|
|
38
|
+
* - 'check': Deterministic structural testing (no LLM required)
|
|
39
|
+
* - 'explore': LLM-powered behavioral exploration
|
|
40
|
+
*/
|
|
41
|
+
export type BaselineMode = 'check' | 'explore';
|
|
42
|
+
/**
|
|
43
|
+
* Metadata about how the baseline was generated.
|
|
44
|
+
*/
|
|
45
|
+
export interface BaselineMetadata {
|
|
46
|
+
/** Baseline mode: 'check' = deterministic, 'explore' = LLM-powered */
|
|
47
|
+
mode: BaselineMode;
|
|
48
|
+
/** ISO timestamp when generated */
|
|
49
|
+
generatedAt: string;
|
|
50
|
+
/** CLI version that generated this baseline */
|
|
51
|
+
cliVersion: string;
|
|
52
|
+
/** Command used to start the server */
|
|
53
|
+
serverCommand: string;
|
|
54
|
+
/** Server name from MCP initialization */
|
|
55
|
+
serverName?: string;
|
|
56
|
+
/** Interview duration in milliseconds */
|
|
57
|
+
durationMs: number;
|
|
58
|
+
/** Personas used during interview (empty for check mode) */
|
|
59
|
+
personas: string[];
|
|
60
|
+
/** LLM model used ('none' for check mode) */
|
|
61
|
+
model: string;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Server fingerprint in baseline format.
|
|
65
|
+
*/
|
|
66
|
+
export interface BaselineServerFingerprint {
|
|
67
|
+
/** Server name */
|
|
68
|
+
name: string;
|
|
69
|
+
/** Server version */
|
|
70
|
+
version: string;
|
|
71
|
+
/** MCP protocol version */
|
|
72
|
+
protocolVersion: string;
|
|
73
|
+
/** Available capabilities */
|
|
74
|
+
capabilities: string[];
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Tool capability from discovery.
|
|
78
|
+
*/
|
|
79
|
+
export interface ToolCapability {
|
|
80
|
+
/** Tool name */
|
|
81
|
+
name: string;
|
|
82
|
+
/** Tool description */
|
|
83
|
+
description: string;
|
|
84
|
+
/** Input schema */
|
|
85
|
+
inputSchema: Record<string, unknown>;
|
|
86
|
+
/** Hash of the schema for change detection */
|
|
87
|
+
schemaHash: string;
|
|
88
|
+
/** Hash of observed arguments schema (from actual calls) */
|
|
89
|
+
observedArgsSchemaHash?: string;
|
|
90
|
+
/** Consistency of observed argument schemas (0-1) */
|
|
91
|
+
observedArgsSchemaConsistency?: number;
|
|
92
|
+
/** Number of observed schema variations */
|
|
93
|
+
observedArgsSchemaVariations?: number;
|
|
94
|
+
/** Fingerprint of the tool's response structure */
|
|
95
|
+
responseFingerprint?: ResponseFingerprint;
|
|
96
|
+
/** Inferred JSON schema of the tool's output */
|
|
97
|
+
inferredOutputSchema?: InferredSchema;
|
|
98
|
+
/** Normalized error patterns observed during testing */
|
|
99
|
+
errorPatterns?: ErrorPattern[];
|
|
100
|
+
/** Baseline p50 latency in milliseconds */
|
|
101
|
+
baselineP50Ms?: number;
|
|
102
|
+
/** Baseline p95 latency in milliseconds */
|
|
103
|
+
baselineP95Ms?: number;
|
|
104
|
+
/** Baseline p99 latency in milliseconds */
|
|
105
|
+
baselineP99Ms?: number;
|
|
106
|
+
/** Baseline success rate (0-1) */
|
|
107
|
+
baselineSuccessRate?: number;
|
|
108
|
+
/** Response schema evolution metadata */
|
|
109
|
+
responseSchemaEvolution?: ResponseSchemaEvolution;
|
|
110
|
+
/** ISO timestamp of last time this tool was tested */
|
|
111
|
+
lastTestedAt?: string;
|
|
112
|
+
/** Schema hash captured at the last test time */
|
|
113
|
+
inputSchemaHashAtTest?: string;
|
|
114
|
+
/** Statistical confidence for performance baselines */
|
|
115
|
+
performanceConfidence?: {
|
|
116
|
+
sampleCount: number;
|
|
117
|
+
successfulSamples: number;
|
|
118
|
+
validationSamples: number;
|
|
119
|
+
totalTests: number;
|
|
120
|
+
standardDeviation: number;
|
|
121
|
+
coefficientOfVariation: number;
|
|
122
|
+
confidenceLevel: 'low' | 'medium' | 'high';
|
|
123
|
+
recommendation?: string;
|
|
124
|
+
};
|
|
125
|
+
/** Security testing fingerprint with findings and risk score */
|
|
126
|
+
securityFingerprint?: SecurityFingerprint;
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Resource capability from discovery.
|
|
130
|
+
*/
|
|
131
|
+
export interface ResourceCapability {
|
|
132
|
+
/** Resource URI template */
|
|
133
|
+
uri: string;
|
|
134
|
+
/** Resource name */
|
|
135
|
+
name: string;
|
|
136
|
+
/** Resource description */
|
|
137
|
+
description?: string;
|
|
138
|
+
/** MIME type */
|
|
139
|
+
mimeType?: string;
|
|
140
|
+
}
|
|
141
|
+
/**
|
|
142
|
+
* Prompt capability from discovery.
|
|
143
|
+
*/
|
|
144
|
+
export interface PromptCapability {
|
|
145
|
+
/** Prompt name */
|
|
146
|
+
name: string;
|
|
147
|
+
/** Prompt description */
|
|
148
|
+
description?: string;
|
|
149
|
+
/** Arguments the prompt accepts */
|
|
150
|
+
arguments?: Array<{
|
|
151
|
+
name: string;
|
|
152
|
+
description?: string;
|
|
153
|
+
required?: boolean;
|
|
154
|
+
}>;
|
|
155
|
+
}
|
|
156
|
+
/**
|
|
157
|
+
* Interview results for a single persona.
|
|
158
|
+
*/
|
|
159
|
+
export interface PersonaInterview {
|
|
160
|
+
/** Persona ID */
|
|
161
|
+
persona: string;
|
|
162
|
+
/** Number of tools interviewed */
|
|
163
|
+
toolsInterviewed: number;
|
|
164
|
+
/** Number of questions asked */
|
|
165
|
+
questionsAsked: number;
|
|
166
|
+
/** Findings from this persona */
|
|
167
|
+
findings: PersonaFinding[];
|
|
168
|
+
}
|
|
169
|
+
/**
|
|
170
|
+
* A finding from a persona interview.
|
|
171
|
+
*/
|
|
172
|
+
export interface PersonaFinding {
|
|
173
|
+
/** Tool this finding relates to */
|
|
174
|
+
tool: string;
|
|
175
|
+
/** Finding category */
|
|
176
|
+
category: 'behavior' | 'security' | 'reliability' | 'edge_case';
|
|
177
|
+
/** Severity level */
|
|
178
|
+
severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
|
|
179
|
+
/** Description of the finding */
|
|
180
|
+
description: string;
|
|
181
|
+
/** Evidence supporting the finding */
|
|
182
|
+
evidence?: string;
|
|
183
|
+
}
|
|
184
|
+
/**
|
|
185
|
+
* Tool behavioral profile in baseline format.
|
|
186
|
+
*/
|
|
187
|
+
export interface BaselineToolProfile {
|
|
188
|
+
/** Tool name */
|
|
189
|
+
name: string;
|
|
190
|
+
/** Tool description */
|
|
191
|
+
description: string;
|
|
192
|
+
/** Hash of input schema */
|
|
193
|
+
schemaHash: string;
|
|
194
|
+
/** Behavioral assertions */
|
|
195
|
+
assertions: BaselineAssertion[];
|
|
196
|
+
/** Security notes */
|
|
197
|
+
securityNotes: string[];
|
|
198
|
+
/** Known limitations */
|
|
199
|
+
limitations: string[];
|
|
200
|
+
/** Behavioral notes */
|
|
201
|
+
behavioralNotes: string[];
|
|
202
|
+
}
|
|
203
|
+
/**
|
|
204
|
+
* Snapshot of accepted drift for a baseline.
|
|
205
|
+
*/
|
|
206
|
+
export interface AcceptedDiff {
|
|
207
|
+
toolsAdded: string[];
|
|
208
|
+
toolsRemoved: string[];
|
|
209
|
+
toolsModified: string[];
|
|
210
|
+
severity: 'none' | 'info' | 'warning' | 'breaking';
|
|
211
|
+
breakingCount: number;
|
|
212
|
+
warningCount: number;
|
|
213
|
+
infoCount: number;
|
|
214
|
+
}
|
|
215
|
+
/**
|
|
216
|
+
* Drift acceptance metadata attached to a baseline.
|
|
217
|
+
*/
|
|
218
|
+
export interface DriftAcceptance {
|
|
219
|
+
acceptedAt: string | Date;
|
|
220
|
+
acceptedBy?: string;
|
|
221
|
+
reason?: string;
|
|
222
|
+
acceptedDiff: AcceptedDiff;
|
|
223
|
+
}
|
|
224
|
+
/**
|
|
225
|
+
* Serializable schema evolution data for baselines.
|
|
226
|
+
*/
|
|
227
|
+
export interface ResponseSchemaEvolution {
|
|
228
|
+
currentHash: string;
|
|
229
|
+
history: Array<{
|
|
230
|
+
hash: string;
|
|
231
|
+
schema: InferredSchema;
|
|
232
|
+
observedAt: string | Date;
|
|
233
|
+
sampleCount: number;
|
|
234
|
+
}>;
|
|
235
|
+
isStable: boolean;
|
|
236
|
+
stabilityConfidence: number;
|
|
237
|
+
inconsistentFields: string[];
|
|
238
|
+
sampleCount: number;
|
|
239
|
+
}
|
|
240
|
+
/**
|
|
241
|
+
* Serializable documentation score summary for baseline storage.
|
|
242
|
+
*/
|
|
243
|
+
export interface DocumentationScoreSummary {
|
|
244
|
+
overallScore: number;
|
|
245
|
+
grade: string;
|
|
246
|
+
issueCount: number;
|
|
247
|
+
toolCount: number;
|
|
248
|
+
}
|
|
249
|
+
/**
|
|
250
|
+
* Canonical baseline format.
|
|
251
|
+
*
|
|
252
|
+
* This is the single baseline schema used by Bellwether CLI.
|
|
253
|
+
*
|
|
254
|
+
* Versioning: Uses CLI package version for compatibility checking.
|
|
255
|
+
* Baselines with the same CLI major version are compatible.
|
|
256
|
+
*/
|
|
257
|
+
export interface BellwetherBaseline {
|
|
258
|
+
/** CLI version that generated this baseline (e.g., '1.0.0') */
|
|
259
|
+
version: string;
|
|
260
|
+
/** Generation metadata */
|
|
261
|
+
metadata: BaselineMetadata;
|
|
262
|
+
/** Server fingerprint */
|
|
263
|
+
server: BaselineServerFingerprint;
|
|
264
|
+
/** Discovered capabilities */
|
|
265
|
+
capabilities: {
|
|
266
|
+
tools: ToolCapability[];
|
|
267
|
+
resources?: ResourceCapability[];
|
|
268
|
+
prompts?: PromptCapability[];
|
|
269
|
+
};
|
|
270
|
+
/** Interview results by persona */
|
|
271
|
+
interviews: PersonaInterview[];
|
|
272
|
+
/** Tool behavioral profiles */
|
|
273
|
+
toolProfiles: BaselineToolProfile[];
|
|
274
|
+
/** Workflow results (if workflows were tested) */
|
|
275
|
+
workflows?: WorkflowSignature[];
|
|
276
|
+
/** Overall behavioral assertions */
|
|
277
|
+
assertions: BaselineAssertion[];
|
|
278
|
+
/** Summary of findings */
|
|
279
|
+
summary: string;
|
|
280
|
+
/** SHA-256 hash of content (first 16 chars) for integrity */
|
|
281
|
+
hash: string;
|
|
282
|
+
/** Drift acceptance metadata (optional) */
|
|
283
|
+
acceptance?: DriftAcceptance;
|
|
284
|
+
/** Optional documentation score summary */
|
|
285
|
+
documentationScore?: DocumentationScoreSummary;
|
|
286
|
+
}
|
|
287
|
+
//# sourceMappingURL=baseline-format.d.ts.map
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline types for Bellwether.
|
|
3
|
+
*
|
|
4
|
+
* These types define the canonical baseline format used for:
|
|
5
|
+
* - Local baseline storage
|
|
6
|
+
* - Baseline comparison and drift detection
|
|
7
|
+
* - Tool capability tracking
|
|
8
|
+
*
|
|
9
|
+
* Originally part of a hosted integration, now standalone for open-source use.
|
|
10
|
+
*/
|
|
11
|
+
export {};
|
|
12
|
+
//# sourceMappingURL=baseline-format.js.map
|