@dotsetlabs/bellwether 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/CHANGELOG.md +50 -0
  2. package/README.md +3 -2
  3. package/dist/cache/response-cache.d.ts +4 -2
  4. package/dist/cache/response-cache.js +68 -30
  5. package/dist/cli/commands/baseline.js +23 -4
  6. package/dist/cli/commands/check.js +212 -36
  7. package/dist/cli/commands/registry.js +143 -5
  8. package/dist/cli/index.js +6 -4
  9. package/dist/cli/output.d.ts +1 -0
  10. package/dist/cli/output.js +32 -8
  11. package/dist/config/template.d.ts +2 -0
  12. package/dist/config/template.js +8 -2
  13. package/dist/interview/interviewer.js +70 -50
  14. package/dist/interview/orchestrator.js +49 -22
  15. package/dist/llm/anthropic.js +49 -16
  16. package/dist/llm/client.d.ts +2 -0
  17. package/dist/llm/client.js +61 -0
  18. package/dist/llm/ollama.js +9 -4
  19. package/dist/llm/openai.js +34 -23
  20. package/dist/transport/base-transport.d.ts +1 -1
  21. package/dist/transport/http-transport.d.ts +2 -2
  22. package/dist/transport/http-transport.js +26 -6
  23. package/dist/transport/mcp-client.d.ts +18 -6
  24. package/dist/transport/mcp-client.js +49 -19
  25. package/dist/transport/sse-transport.d.ts +1 -1
  26. package/dist/transport/sse-transport.js +4 -2
  27. package/dist/transport/stdio-transport.d.ts +1 -1
  28. package/dist/transport/stdio-transport.js +1 -1
  29. package/dist/utils/timeout.d.ts +10 -2
  30. package/dist/utils/timeout.js +9 -5
  31. package/dist/version.js +1 -1
  32. package/dist/workflow/executor.js +18 -13
  33. package/dist/workflow/loader.js +4 -1
  34. package/dist/workflow/state-tracker.js +22 -18
  35. package/man/bellwether.1 +204 -0
  36. package/man/bellwether.1.md +148 -0
  37. package/package.json +8 -9
package/CHANGELOG.md CHANGED
@@ -7,6 +7,56 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [1.0.3] - 2026-02-02
11
+
12
+ ### Added
13
+
14
+ - Added `version` input to GitHub Action for explicit npm version selection
15
+ - Action now derives version from ref (e.g., `v1.0.3`) or accepts explicit `inputs.version`
16
+ - Provides clear error message when version cannot be determined
17
+ - Added `signal` option to LLM completion requests for request cancellation via AbortSignal
18
+ - Added AbortController integration to timeout utilities for proper request cancellation
19
+ - Added JSON extraction from mixed LLM responses (handles prose around JSON blocks)
20
+
21
+ ### Changed
22
+
23
+ - Improved timeout handling with AbortController propagation across LLM and transport layers
24
+ - Improved error handling and resource cleanup in interview, orchestrator, and transport modules
25
+ - Refactored response cache, workflow executor, and state tracker for better reliability
26
+ - Updated CI/CD and GitHub/GitLab integration documentation
27
+
28
+ ### Fixed
29
+
30
+ - Fixed GitHub Action stderr handling in check command output capture
31
+ - Fixed various code formatting and linting issues across LLM clients and transport modules
32
+
33
+ ## [1.0.2] - 2026-01-30
34
+
35
+ ### Added
36
+
37
+ - Added SARIF and JUnit output format support for `bellwether check` without baseline comparison
38
+ - Use `--format sarif` for GitHub Code Scanning integration
39
+ - Use `--format junit` for CI/CD test reporting
40
+ - Added registry validation indicators showing environment variable requirements
41
+ - Servers requiring setup now display ⚙ indicator
42
+ - Environment variables show ✓/✗ status based on whether they're set
43
+ - Automatic detection of common service patterns (postgres→DATABASE_URL, etc.)
44
+ - Setup hints displayed for unconfigured servers
45
+
46
+ ### Changed
47
+
48
+ - Security and thorough presets now enable security testing by default (`check.security.enabled: true`)
49
+
50
+ ### Fixed
51
+
52
+ - Fixed baseline path resolution in `baseline compare` to be consistent with `baseline show`
53
+ - Now checks both output directory and current working directory before failing
54
+ - Fixed `bellwether auth status` requiring a config file
55
+ - Auth commands now work without bellwether.yaml present
56
+ - Fixed ANSI escape codes appearing in non-TTY output (e.g., when piping to files)
57
+ - StreamingDisplay now checks for TTY before applying ANSI styling
58
+ - Automatically respects `NO_COLOR` and `FORCE_COLOR=0` environment variables
59
+
10
60
  ## [1.0.1] - 2026-01-29
11
61
 
12
62
  ### Added
package/README.md CHANGED
@@ -124,8 +124,9 @@ Requires LLM (Ollama for free local, or OpenAI/Anthropic). Generates `AGENTS.md`
124
124
  ## GitHub Action
125
125
 
126
126
  ```yaml
127
- - uses: dotsetlabs/bellwether@v1
127
+ - uses: dotsetlabs/bellwether@v1.0.2
128
128
  with:
129
+ version: '1.0.2'
129
130
  server-command: 'npx @mcp/your-server'
130
131
  baseline-path: './bellwether-baseline.json'
131
132
  fail-on-severity: 'warning'
@@ -167,7 +168,7 @@ bellwether init --preset local npx @mcp/server # Local Ollama (free)
167
168
 
168
169
  ```bash
169
170
  git clone https://github.com/dotsetlabs/bellwether
170
- cd bellwether/cli
171
+ cd bellwether
171
172
  npm install
172
173
  npm run build
173
174
  npm test
@@ -10,6 +10,8 @@ export interface CacheEntry<T> {
10
10
  value: T;
11
11
  /** When the entry was created */
12
12
  createdAt: Date;
13
+ /** When the entry was last accessed */
14
+ lastAccessedAt: Date;
13
15
  /** When the entry expires */
14
16
  expiresAt: Date;
15
17
  /** Cache key (hash) */
@@ -99,9 +101,9 @@ export declare class ResponseCache {
99
101
  */
100
102
  private evictIfNeeded;
101
103
  /**
102
- * Evict the oldest entry (LRU based on creation time).
104
+ * Evict the least recently used entry (LRU based on last access time).
103
105
  */
104
- private evictOldest;
106
+ private evictLeastRecentlyUsed;
105
107
  /**
106
108
  * Estimate the size of a value in bytes.
107
109
  */
@@ -30,21 +30,9 @@ export class ResponseCache {
30
30
  * Generate a cache key from input data.
31
31
  */
32
32
  generateKey(...parts) {
33
- const serialized = parts.map((p) => {
34
- if (typeof p === 'string')
35
- return p;
36
- if (typeof p === 'undefined')
37
- return 'undefined';
38
- if (p === null)
39
- return 'null';
40
- try {
41
- return JSON.stringify(p, Object.keys(p).sort());
42
- }
43
- catch {
44
- return String(p);
45
- }
46
- }).join('|');
47
- return createHash('sha256').update(serialized).digest('hex').slice(0, 16);
33
+ const serialized = parts.map((p) => stableStringify(p)).join('|');
34
+ // Use 128-bit hash (32 hex chars) to reduce collision risk.
35
+ return createHash('sha256').update(serialized).digest('hex').slice(0, 32);
48
36
  }
49
37
  /**
50
38
  * Get an entry from cache.
@@ -66,6 +54,7 @@ export class ResponseCache {
66
54
  return undefined;
67
55
  }
68
56
  entry.hitCount++;
57
+ entry.lastAccessedAt = new Date();
69
58
  this.stats.hits++;
70
59
  logger.debug({ key, hitCount: entry.hitCount }, 'Cache hit');
71
60
  return entry.value;
@@ -86,6 +75,7 @@ export class ResponseCache {
86
75
  const entry = {
87
76
  value,
88
77
  createdAt: now,
78
+ lastAccessedAt: now,
89
79
  expiresAt: new Date(now.getTime() + ttl),
90
80
  key,
91
81
  description: options?.description,
@@ -168,31 +158,30 @@ export class ResponseCache {
168
158
  evictIfNeeded(newEntrySize) {
169
159
  // Check entry count
170
160
  while (this.cache.size >= this.config.maxEntries) {
171
- this.evictOldest();
161
+ this.evictLeastRecentlyUsed();
172
162
  }
173
163
  // Check size
174
- while (this.totalSizeBytes + newEntrySize > this.config.maxSizeBytes &&
175
- this.cache.size > 0) {
176
- this.evictOldest();
164
+ while (this.totalSizeBytes + newEntrySize > this.config.maxSizeBytes && this.cache.size > 0) {
165
+ this.evictLeastRecentlyUsed();
177
166
  }
178
167
  }
179
168
  /**
180
- * Evict the oldest entry (LRU based on creation time).
169
+ * Evict the least recently used entry (LRU based on last access time).
181
170
  */
182
- evictOldest() {
183
- let oldestKey;
184
- let oldestTime = Infinity;
171
+ evictLeastRecentlyUsed() {
172
+ let lruKey;
173
+ let oldestAccessTime = Infinity;
185
174
  for (const [key, entry] of this.cache) {
186
- const time = entry.createdAt.getTime();
187
- if (time < oldestTime) {
188
- oldestTime = time;
189
- oldestKey = key;
175
+ const time = entry.lastAccessedAt.getTime();
176
+ if (time < oldestAccessTime) {
177
+ oldestAccessTime = time;
178
+ lruKey = key;
190
179
  }
191
180
  }
192
- if (oldestKey) {
193
- this.delete(oldestKey);
181
+ if (lruKey) {
182
+ this.delete(lruKey);
194
183
  this.stats.evictions++;
195
- logger.debug({ key: oldestKey }, 'Evicted cache entry');
184
+ logger.debug({ key: lruKey }, 'Evicted cache entry');
196
185
  }
197
186
  }
198
187
  /**
@@ -207,6 +196,55 @@ export class ResponseCache {
207
196
  }
208
197
  }
209
198
  }
/**
 * Stable, deterministic JSON stringify with deep key sorting.
 *
 * Plain objects are serialized with their own enumerable keys in sorted
 * order, so structurally equal values always produce the same string
 * regardless of key insertion order.
 *
 * Value handling:
 * - strings, numbers and booleans pass through to JSON.stringify unchanged
 * - bigint, symbol and function values are converted to strings
 * - valid Dates become ISO-8601 strings; invalid Dates become "Invalid Date"
 * - a container that contains itself is replaced by the marker "[Circular]"
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Deterministic JSON text, or the literal string
 *   'undefined' when the value has no JSON representation.
 */
function stableStringify(value) {
    // Holds only the containers on the *current* traversal path. Entries are
    // removed on the way back up so that a value referenced twice (but not
    // cyclically) is serialized in full both times.
    const ancestors = new WeakSet();
    const normalize = (input) => {
        if (input === null || input === undefined) {
            return input;
        }
        switch (typeof input) {
            case 'string':
            case 'number':
            case 'boolean':
                return input;
            case 'bigint':
                return input.toString();
            case 'symbol':
            case 'function':
                return String(input);
            default:
                break;
        }
        if (input instanceof Date) {
            // toISOString() throws a RangeError for an invalid Date.
            return Number.isNaN(input.getTime()) ? String(input) : input.toISOString();
        }
        // Arrays and objects are both tracked so that cyclic arrays terminate.
        if (ancestors.has(input)) {
            return '[Circular]';
        }
        ancestors.add(input);
        try {
            if (Array.isArray(input)) {
                return input.map((item) => normalize(item));
            }
            // Null-prototype target: assigning an own "__proto__" key creates a
            // regular property instead of silently replacing the prototype.
            const normalized = Object.create(null);
            for (const key of Object.keys(input).sort()) {
                normalized[key] = normalize(input[key]);
            }
            return normalized;
        }
        finally {
            ancestors.delete(input);
        }
    };
    const json = JSON.stringify(normalize(value));
    return json === undefined ? 'undefined' : json;
}
210
248
  /**
211
249
  * Specialized cache for tool responses.
212
250
  */
@@ -146,12 +146,31 @@ baselineCommand
146
146
  output.error('No baseline path provided. Set baseline.path or baseline.comparePath in config, or pass a path argument.');
147
147
  process.exit(EXIT_CODES.ERROR);
148
148
  }
149
- const baselineBaseDir = baselinePath ? process.cwd() : outputDir;
150
- const fullBaselinePath = resolvedBaselinePath.startsWith('/')
151
- ? resolvedBaselinePath
152
- : join(baselineBaseDir, resolvedBaselinePath);
149
+ // Resolve baseline path consistently with 'show' command:
150
+ // 1. If absolute path, use as-is
151
+ // 2. First try relative to outputDir (e.g., .bellwether/)
152
+ // 3. Fall back to relative to cwd
153
+ let fullBaselinePath;
154
+ if (resolvedBaselinePath.startsWith('/')) {
155
+ fullBaselinePath = resolvedBaselinePath;
156
+ }
157
+ else {
158
+ const outputDirPath = join(outputDir, resolvedBaselinePath);
159
+ const cwdPath = join(process.cwd(), resolvedBaselinePath);
160
+ if (existsSync(outputDirPath)) {
161
+ fullBaselinePath = outputDirPath;
162
+ }
163
+ else if (existsSync(cwdPath)) {
164
+ fullBaselinePath = cwdPath;
165
+ }
166
+ else {
167
+ // Default to outputDir path for error message consistency
168
+ fullBaselinePath = outputDirPath;
169
+ }
170
+ }
153
171
  if (!existsSync(fullBaselinePath)) {
154
172
  output.error(`Baseline not found: ${fullBaselinePath}`);
173
+ output.error('\nRun `bellwether baseline save` to create a baseline.');
155
174
  process.exit(EXIT_CODES.ERROR);
156
175
  }
157
176
  let previousBaseline;
@@ -13,7 +13,7 @@ import { MCPClient } from '../../transport/mcp-client.js';
13
13
  import { discover } from '../../discovery/discovery.js';
14
14
  import { Interviewer } from '../../interview/interviewer.js';
15
15
  import { generateContractMd, generateJsonReport } from '../../docs/generator.js';
16
- import { loadConfig, ConfigNotFoundError, parseCommandString } from '../../config/loader.js';
16
+ import { loadConfig, ConfigNotFoundError, parseCommandString, } from '../../config/loader.js';
17
17
  import { validateConfigForCheck, getConfigWarnings } from '../../config/validator.js';
18
18
  import { createBaseline, loadBaseline, saveBaseline, getToolFingerprints, toToolCapability, compareBaselines, acceptDrift, formatDiffText, formatDiffJson, formatDiffCompact, formatDiffGitHubActions, formatDiffMarkdown, formatDiffJUnit, formatDiffSarif, applySeverityConfig, shouldFailOnDiff, analyzeForIncremental, formatIncrementalSummary, runSecurityTests, parseSecurityCategories, getAllSecurityCategories, } from '../../baseline/index.js';
19
19
  import { convertAssertions } from '../../baseline/converter.js';
@@ -21,7 +21,7 @@ import { getMetricsCollector, resetMetricsCollector } from '../../metrics/collec
21
21
  import { getGlobalCache, resetGlobalCache } from '../../cache/response-cache.js';
22
22
  import { InterviewProgressBar, formatCheckBanner } from '../utils/progress.js';
23
23
  import { buildCheckSummary, colorizeConfidence, formatConfidenceLevel, formatToolResultLine, } from '../output/terminal-reporter.js';
24
- import { loadScenariosFromFile, tryLoadDefaultScenarios, DEFAULT_SCENARIOS_FILE } from '../../scenarios/index.js';
24
+ import { loadScenariosFromFile, tryLoadDefaultScenarios, DEFAULT_SCENARIOS_FILE, } from '../../scenarios/index.js';
25
25
  import { loadWorkflowsFromFile, tryLoadDefaultWorkflows, DEFAULT_WORKFLOWS_FILE, WorkflowExecutor, generateWorkflowsFromTools, generateWorkflowYamlContent, } from '../../workflow/index.js';
26
26
  import * as output from '../output.js';
27
27
  import { extractServerContextFromArgs } from '../utils/server-context.js';
@@ -73,14 +73,6 @@ export const checkCommand = new Command('check')
73
73
  output.error(error instanceof Error ? error.message : String(error));
74
74
  process.exit(EXIT_CODES.ERROR);
75
75
  }
76
- const warnings = getConfigWarnings(config);
77
- if (warnings.length > 0) {
78
- output.warn('Configuration warnings:');
79
- for (const warning of warnings) {
80
- output.warn(` - ${warning}`);
81
- }
82
- output.newline();
83
- }
84
76
  // Extract settings from config
85
77
  const timeout = config.server.timeout;
86
78
  const outputDir = config.output.dir;
@@ -105,7 +97,8 @@ export const checkCommand = new Command('check')
105
97
  minimumSeverity: options.minSeverity ?? config.baseline.severity.minimumSeverity,
106
98
  failOnSeverity: options.failOnSeverity ?? config.baseline.severity.failOnSeverity,
107
99
  suppressWarnings: config.baseline.severity.suppressWarnings,
108
- aspectOverrides: config.baseline.severity.aspectOverrides,
100
+ aspectOverrides: config.baseline.severity
101
+ .aspectOverrides,
109
102
  };
110
103
  // Resolve check options from config (no CLI overrides for these)
111
104
  const incrementalEnabled = config.check.incremental;
@@ -114,9 +107,26 @@ export const checkCommand = new Command('check')
114
107
  const parallelWorkers = config.check.parallelWorkers;
115
108
  const performanceThreshold = config.check.performanceThreshold / PERCENTAGE_CONVERSION.DIVISOR;
116
109
  const diffFormat = options.format ?? config.check.diffFormat;
110
+ const machineReadableFormats = new Set(['json', 'junit', 'sarif']);
111
+ const machineReadable = machineReadableFormats.has(String(diffFormat).toLowerCase());
112
+ if (machineReadable) {
113
+ // Suppress standard CLI output to keep stdout clean for machine-readable formats.
114
+ output.configureOutput({ quiet: true });
115
+ }
116
+ const warnings = getConfigWarnings(config);
117
+ if (warnings.length > 0) {
118
+ output.warn('Configuration warnings:');
119
+ for (const warning of warnings) {
120
+ output.warn(` - ${warning}`);
121
+ }
122
+ if (!machineReadable) {
123
+ output.newline();
124
+ }
125
+ }
117
126
  // Resolve security options from config
118
127
  const securityEnabled = config.check.security.enabled;
119
- let securityCategories = config.check.security.categories;
128
+ let securityCategories = config.check.security
129
+ .categories;
120
130
  // Validate security categories
121
131
  try {
122
132
  securityCategories = parseSecurityCategories(securityCategories.join(','));
@@ -141,13 +151,15 @@ export const checkCommand = new Command('check')
141
151
  ? `${serverCommand} ${args.join(' ')}`.trim()
142
152
  : (remoteUrl ?? 'unknown');
143
153
  // Display startup banner
144
- const banner = formatCheckBanner({
145
- serverCommand: serverIdentifier,
146
- });
147
- output.info(banner);
148
- output.newline();
149
- output.info('Check: Schema validation and drift detection (free, deterministic)');
150
- output.newline();
154
+ if (!machineReadable) {
155
+ const banner = formatCheckBanner({
156
+ serverCommand: serverIdentifier,
157
+ });
158
+ output.info(banner);
159
+ output.newline();
160
+ output.info('Check: Schema validation and drift detection (free, deterministic)');
161
+ output.newline();
162
+ }
151
163
  // Initialize metrics collector
152
164
  resetMetricsCollector();
153
165
  const metricsCollector = getMetricsCollector();
@@ -182,9 +194,12 @@ export const checkCommand = new Command('check')
182
194
  }
183
195
  // Discovery phase
184
196
  output.info('Discovering capabilities...');
185
- const discovery = await discover(mcpClient, transport === 'stdio' ? serverCommand : remoteUrl ?? serverCommand, transport === 'stdio' ? args : []);
197
+ const discovery = await discover(mcpClient, transport === 'stdio' ? serverCommand : (remoteUrl ?? serverCommand), transport === 'stdio' ? args : []);
186
198
  const resourceCount = discovery.resources?.length ?? 0;
187
- const discoveryParts = [`${discovery.tools.length} tools`, `${discovery.prompts.length} prompts`];
199
+ const discoveryParts = [
200
+ `${discovery.tools.length} tools`,
201
+ `${discovery.prompts.length} prompts`,
202
+ ];
188
203
  if (resourceCount > 0) {
189
204
  discoveryParts.push(`${resourceCount} resources`);
190
205
  }
@@ -228,7 +243,9 @@ export const checkCommand = new Command('check')
228
243
  }
229
244
  else {
230
245
  incrementalBaseline = loadBaseline(baselinePath);
231
- const result = analyzeForIncremental(discovery.tools, incrementalBaseline, { maxCacheAgeHours: incrementalCacheHours });
246
+ const result = analyzeForIncremental(discovery.tools, incrementalBaseline, {
247
+ maxCacheAgeHours: incrementalCacheHours,
248
+ });
232
249
  incrementalResult = result;
233
250
  const summary = formatIncrementalSummary(result.changeSummary);
234
251
  output.info(`Incremental analysis: ${summary}`);
@@ -240,7 +257,7 @@ export const checkCommand = new Command('check')
240
257
  else {
241
258
  output.info(`Testing ${result.toolsToTest.length} tools (${result.toolsToSkip.length} cached)\n`);
242
259
  // Filter discovery to only include tools that need testing
243
- discovery.tools = discovery.tools.filter(t => result.toolsToTest.includes(t.name));
260
+ discovery.tools = discovery.tools.filter((t) => result.toolsToTest.includes(t.name));
244
261
  }
245
262
  }
246
263
  }
@@ -323,7 +340,7 @@ export const checkCommand = new Command('check')
323
340
  interviewer.setServerContext(serverContext);
324
341
  }
325
342
  // Set up progress display
326
- const progressBar = new InterviewProgressBar({ enabled: !verbose });
343
+ const progressBar = new InterviewProgressBar({ enabled: !verbose && !machineReadable });
327
344
  const reportedTools = new Set();
328
345
  const progressCallback = (progress) => {
329
346
  if (verbose) {
@@ -462,7 +479,7 @@ export const checkCommand = new Command('check')
462
479
  try {
463
480
  const response = await mcpClient.callTool(tool.name, args);
464
481
  const content = response.content
465
- .map((c) => c.type === 'text' ? c.text : '')
482
+ .map((c) => (c.type === 'text' ? c.text : ''))
466
483
  .join('\n');
467
484
  return {
468
485
  isError: response.isError ?? false,
@@ -580,7 +597,7 @@ export const checkCommand = new Command('check')
580
597
  const workflowResult = await workflowExecutor.execute(workflow);
581
598
  workflowResults.push(workflowResult);
582
599
  const statusIcon = workflowResult.success ? '\u2713' : '\u2717';
583
- const stepsInfo = `${workflowResult.steps.filter(s => s.success).length}/${workflow.steps.length} steps`;
600
+ const stepsInfo = `${workflowResult.steps.filter((s) => s.success).length}/${workflow.steps.length} steps`;
584
601
  if (workflowResult.success) {
585
602
  output.success(` ${statusIcon} ${workflow.name} (${stepsInfo}) - ${workflowResult.durationMs}ms`);
586
603
  }
@@ -599,7 +616,7 @@ export const checkCommand = new Command('check')
599
616
  }
600
617
  }
601
618
  // Workflow summary
602
- const passed = workflowResults.filter(r => r.success).length;
619
+ const passed = workflowResults.filter((r) => r.success).length;
603
620
  const failed = workflowResults.length - passed;
604
621
  output.newline();
605
622
  if (failed === 0) {
@@ -631,9 +648,7 @@ export const checkCommand = new Command('check')
631
648
  }
632
649
  if (writeJson) {
633
650
  // Add workflow results to the result object for the JSON report
634
- const resultWithWorkflows = workflowResults.length > 0
635
- ? { ...result, workflowResults }
636
- : result;
651
+ const resultWithWorkflows = workflowResults.length > 0 ? { ...result, workflowResults } : result;
637
652
  let jsonReport;
638
653
  try {
639
654
  jsonReport = generateJsonReport(resultWithWorkflows, {
@@ -671,10 +686,7 @@ export const checkCommand = new Command('check')
671
686
  if (incrementalResult && incrementalResult.cachedFingerprints.length > 0) {
672
687
  // Merge new fingerprints with cached ones
673
688
  const cachedTools = incrementalResult.cachedFingerprints.map(toToolCapability);
674
- const mergedTools = [
675
- ...currentBaseline.capabilities.tools,
676
- ...cachedTools,
677
- ].sort((a, b) => a.name.localeCompare(b.name));
689
+ const mergedTools = [...currentBaseline.capabilities.tools, ...cachedTools].sort((a, b) => a.name.localeCompare(b.name));
678
690
  currentBaseline = {
679
691
  ...currentBaseline,
680
692
  capabilities: {
@@ -768,6 +780,18 @@ export const checkCommand = new Command('check')
768
780
  saveBaseline(currentBaseline, saveBaselinePath);
769
781
  output.info(`\nBaseline saved: ${saveBaselinePath}`);
770
782
  }
783
+ // Output formatted results for sarif/junit when no baseline comparison
784
+ // This allows CI systems to consume check results even without drift detection
785
+ if (!baselinePath) {
786
+ const formattedCheckResults = formatCheckResults(currentBaseline, diffFormat);
787
+ if (formattedCheckResults) {
788
+ if (!machineReadable) {
789
+ output.info('\n--- Check Results ---');
790
+ }
791
+ // Output directly to stdout for machine-readable formats
792
+ console.log(formattedCheckResults);
793
+ }
794
+ }
771
795
  // Handle baseline comparison
772
796
  if (baselinePath) {
773
797
  if (!existsSync(baselinePath)) {
@@ -780,10 +804,17 @@ export const checkCommand = new Command('check')
780
804
  });
781
805
  // Apply severity configuration (filtering, overrides)
782
806
  const diff = applySeverityConfig(rawDiff, severityConfig);
783
- output.info('\n--- Drift Report ---');
807
+ if (!machineReadable) {
808
+ output.info('\n--- Drift Report ---');
809
+ }
784
810
  // Select formatter based on --format option
785
811
  const formattedDiff = formatDiff(diff, diffFormat, baselinePath);
786
- output.info(formattedDiff);
812
+ if (machineReadable) {
813
+ console.log(formattedDiff);
814
+ }
815
+ else {
816
+ output.info(formattedDiff);
817
+ }
787
818
  // Report performance regressions if detected
788
819
  if (diff.performanceReport?.hasRegressions) {
789
820
  output.warn('\n--- Performance Regressions ---');
@@ -919,4 +950,149 @@ function formatDiff(diff, format, baselinePath) {
919
950
  return formatDiffText(diff);
920
951
  }
921
952
  }
/**
 * Format check results as JUnit XML (for CI systems that expect test results).
 * This is used when --format junit is specified but no baseline comparison occurs.
 *
 * Every tool fingerprint becomes one <testcase>. A tool that has at least one
 * critical or high risk security finding additionally gets a failing
 * "<tool>-security" testcase listing those findings. The suite's `tests`
 * attribute counts both kinds of testcase so it matches the emitted elements.
 *
 * @param {object} baseline - Baseline passed to getToolFingerprints().
 * @returns {string} The JUnit XML document.
 */
function formatCheckResultsJUnit(baseline) {
    const XML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&apos;' };
    // Tool names and finding text come from the server under test, so they
    // must be escaped before being placed in attributes or element content.
    const escapeXml = (value) => String(value).replace(/[&<>"']/g, (ch) => XML_ENTITIES[ch]);
    const isSevere = (finding) => finding.riskLevel === 'critical' || finding.riskLevel === 'high';
    const tools = getToolFingerprints(baseline);
    // Compute the critical/high findings once per tool and reuse them below.
    const severeByTool = [];
    for (const tool of tools) {
        const severe = (tool.securityFingerprint?.findings ?? []).filter(isSevere);
        if (severe.length > 0) {
            severeByTool.push({ tool, severe });
        }
    }
    const testCount = tools.length + severeByTool.length;
    const lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<testsuites>',
        `  <testsuite name="bellwether-check" tests="${testCount}" failures="${severeByTool.length}" errors="0">`,
    ];
    for (const tool of tools) {
        const successRate = tool.baselineSuccessRate ?? 1;
        lines.push(`    <testcase name="${escapeXml(tool.name)}" classname="mcp-tools" time="0">`);
        lines.push(`      <system-out>Success rate: ${(successRate * 100).toFixed(0)}%</system-out>`);
        if (successRate < 0.9) {
            lines.push('      <system-err>Tool has success rate below 90%</system-err>');
        }
        lines.push('    </testcase>');
    }
    // Add security findings as failing test cases if present
    if (severeByTool.length > 0) {
        lines.push('    <!-- Security findings -->');
        for (const { tool, severe } of severeByTool) {
            lines.push(`    <testcase name="${escapeXml(tool.name)}-security" classname="security">`);
            lines.push(`      <failure message="${severe.length} critical/high security findings">`);
            for (const finding of severe) {
                // The CWE id is optional; omit the parenthesis rather than print "(undefined)".
                const cweSuffix = finding.cweId ? ` (${escapeXml(finding.cweId)})` : '';
                lines.push(`        ${finding.riskLevel.toUpperCase()}: ${escapeXml(finding.title)}${cweSuffix}`);
            }
            lines.push('      </failure>');
            lines.push('    </testcase>');
        }
    }
    lines.push('  </testsuite>');
    lines.push('</testsuites>');
    return lines.join('\n');
}
/**
 * Format check results as SARIF (for GitHub Code Scanning and other tools).
 * This is used when --format sarif is specified but no baseline comparison occurs.
 *
 * Emits one result per security finding (critical/high -> "error",
 * medium -> "warning", anything else -> "note") and one "warning" result per
 * tool whose baseline success rate is below 90%. Every rule id referenced by
 * a result is declared in the driver's rule list.
 *
 * @param {object} baseline - Baseline passed to getToolFingerprints(); its
 *   `metadata.serverCommand` or `server.name` is used as the artifact URI,
 *   falling back to 'mcp-server'.
 * @returns {string} Pretty-printed SARIF 2.1.0 JSON.
 */
function formatCheckResultsSarif(baseline) {
    const tools = getToolFingerprints(baseline);
    // Optional chaining on `server` so a baseline without it falls through to the default.
    const serverUri = baseline.metadata?.serverCommand || baseline.server?.name || 'mcp-server';
    // All results point at the server as a whole; there is no per-line location.
    const buildLocations = () => [
        {
            physicalLocation: {
                artifactLocation: { uri: serverUri },
                region: { startLine: 1 },
            },
        },
    ];
    const rules = [
        {
            id: 'BWH-SEC',
            name: 'SecurityFinding',
            shortDescription: { text: 'Security vulnerability detected' },
            defaultConfiguration: { level: 'warning' },
        },
        {
            id: 'BWH-REL',
            name: 'LowReliability',
            shortDescription: { text: 'Tool reliability below threshold' },
            defaultConfiguration: { level: 'warning' },
        },
    ];
    const declaredRuleIds = new Set(rules.map((rule) => rule.id));
    const results = [];
    // Add results for tools with security findings
    for (const tool of tools) {
        for (const finding of tool.securityFingerprint?.findings ?? []) {
            let level = 'note';
            if (finding.riskLevel === 'critical' || finding.riskLevel === 'high') {
                level = 'error';
            }
            else if (finding.riskLevel === 'medium') {
                level = 'warning';
            }
            const ruleId = finding.cweId || 'BWH-SEC';
            // Findings that carry a CWE id use it as the rule id; declare each
            // such id once so results never reference an undeclared rule.
            if (!declaredRuleIds.has(ruleId)) {
                declaredRuleIds.add(ruleId);
                rules.push({
                    id: ruleId,
                    name: 'SecurityFinding',
                    shortDescription: { text: `Security vulnerability detected (${ruleId})` },
                    defaultConfiguration: { level: 'warning' },
                });
            }
            // The description is appended only when present to avoid a literal "undefined".
            const detail = finding.description ? `: ${finding.description}` : '';
            results.push({
                ruleId,
                level,
                message: { text: `[${tool.name}] ${finding.title}${detail}` },
                locations: buildLocations(),
            });
        }
    }
    // Add results for tools with low success rate
    for (const tool of tools) {
        const successRate = tool.baselineSuccessRate ?? 1;
        if (successRate < 0.9) {
            results.push({
                ruleId: 'BWH-REL',
                level: 'warning',
                message: {
                    text: `Tool "${tool.name}" has ${(successRate * 100).toFixed(0)}% success rate`,
                },
                locations: buildLocations(),
            });
        }
    }
    const sarif = {
        $schema: 'https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json',
        version: '2.1.0',
        runs: [
            {
                tool: {
                    driver: {
                        name: 'bellwether',
                        // NOTE(review): fixed literal; confirm whether this should track the package version.
                        version: '1.0.0',
                        informationUri: 'https://github.com/dotsetlabs/bellwether',
                        rules,
                    },
                },
                results,
            },
        ],
    };
    return JSON.stringify(sarif, null, 2);
}
/**
 * Format check results using the specified output format.
 * Used when no baseline comparison occurs.
 *
 * The format name is matched case-insensitively. A missing or non-string
 * format is treated as "no special format" instead of throwing.
 *
 * @param {object} baseline - Baseline handed to the selected formatter.
 * @param {unknown} format - Requested output format name.
 * @returns {string | null} JUnit XML for 'junit', 'junit-xml' or 'xml';
 *   SARIF JSON for 'sarif'; null for every other format.
 */
function formatCheckResults(baseline, format) {
    // Normalize defensively: the value originates from CLI options or config
    // and may be undefined.
    const normalizedFormat = String(format ?? '').toLowerCase();
    switch (normalizedFormat) {
        case 'junit':
        case 'junit-xml':
        case 'xml':
            return formatCheckResultsJUnit(baseline);
        case 'sarif':
            return formatCheckResultsSarif(baseline);
        default:
            return null; // No special formatting needed for other formats
    }
}
922
1098
  //# sourceMappingURL=check.js.map