@dotsetlabs/bellwether 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +74 -0
  2. package/README.md +8 -2
  3. package/dist/baseline/accessors.d.ts +1 -1
  4. package/dist/baseline/accessors.js +1 -3
  5. package/dist/baseline/baseline-format.d.ts +287 -0
  6. package/dist/baseline/baseline-format.js +12 -0
  7. package/dist/baseline/comparator.js +249 -11
  8. package/dist/baseline/converter.d.ts +15 -15
  9. package/dist/baseline/converter.js +46 -34
  10. package/dist/baseline/diff.d.ts +1 -1
  11. package/dist/baseline/diff.js +45 -28
  12. package/dist/baseline/error-analyzer.d.ts +1 -1
  13. package/dist/baseline/error-analyzer.js +90 -17
  14. package/dist/baseline/incremental-checker.js +8 -5
  15. package/dist/baseline/index.d.ts +2 -12
  16. package/dist/baseline/index.js +3 -23
  17. package/dist/baseline/performance-tracker.d.ts +0 -1
  18. package/dist/baseline/performance-tracker.js +13 -20
  19. package/dist/baseline/response-fingerprint.js +39 -2
  20. package/dist/baseline/saver.js +41 -10
  21. package/dist/baseline/schema-compare.d.ts +22 -0
  22. package/dist/baseline/schema-compare.js +259 -16
  23. package/dist/baseline/types.d.ts +10 -7
  24. package/dist/cache/response-cache.d.ts +8 -0
  25. package/dist/cache/response-cache.js +110 -0
  26. package/dist/cli/commands/check.js +23 -6
  27. package/dist/cli/commands/explore.js +34 -14
  28. package/dist/cli/index.js +8 -0
  29. package/dist/config/template.js +8 -7
  30. package/dist/config/validator.d.ts +59 -59
  31. package/dist/config/validator.js +245 -90
  32. package/dist/constants/core.d.ts +4 -0
  33. package/dist/constants/core.js +8 -19
  34. package/dist/constants/registry.d.ts +17 -0
  35. package/dist/constants/registry.js +18 -0
  36. package/dist/constants/testing.d.ts +0 -369
  37. package/dist/constants/testing.js +18 -456
  38. package/dist/constants.d.ts +1 -1
  39. package/dist/constants.js +1 -1
  40. package/dist/docs/contract.js +131 -83
  41. package/dist/docs/report.js +8 -5
  42. package/dist/interview/insights.d.ts +17 -0
  43. package/dist/interview/insights.js +52 -0
  44. package/dist/interview/interviewer.js +52 -10
  45. package/dist/interview/prompt-test-generator.d.ts +12 -0
  46. package/dist/interview/prompt-test-generator.js +77 -0
  47. package/dist/interview/resource-test-generator.d.ts +12 -0
  48. package/dist/interview/resource-test-generator.js +20 -0
  49. package/dist/interview/schema-inferrer.js +26 -4
  50. package/dist/interview/schema-test-generator.js +278 -31
  51. package/dist/interview/stateful-test-runner.d.ts +3 -0
  52. package/dist/interview/stateful-test-runner.js +80 -0
  53. package/dist/interview/types.d.ts +12 -0
  54. package/dist/transport/mcp-client.js +1 -1
  55. package/dist/transport/sse-transport.d.ts +7 -3
  56. package/dist/transport/sse-transport.js +157 -67
  57. package/dist/version.js +1 -1
  58. package/man/bellwether.1 +1 -1
  59. package/man/bellwether.1.md +2 -2
  60. package/package.json +1 -1
  61. package/schemas/bellwether-check.schema.json +185 -0
  62. package/schemas/bellwether-explore.schema.json +837 -0
  63. package/scripts/completions/bellwether.bash +10 -4
  64. package/scripts/completions/bellwether.zsh +55 -2
@@ -73,16 +73,15 @@ export function formatDiffText(diff, useColors = true) {
73
73
  lines.push(red('─── Performance Regressions ───'));
74
74
  for (const regression of diff.performanceReport.regressions) {
75
75
  const percentStr = (regression.regressionPercent * 100).toFixed(1);
76
- const confidenceNote = regression.isReliable
77
- ? ''
78
- : ` ${yellow('(low confidence)')}`;
76
+ const confidenceNote = regression.isReliable ? '' : ` ${yellow('(low confidence)')}`;
79
77
  lines.push(` ${red('!')} ${regression.toolName}: ` +
80
78
  `${regression.previousP50Ms.toFixed(0)}ms → ` +
81
79
  `${regression.currentP50Ms.toFixed(0)}ms (+${percentStr}%)${confidenceNote}`);
82
80
  }
83
81
  lines.push('');
84
82
  // Show low confidence tools warning
85
- if (diff.performanceReport.lowConfidenceTools && diff.performanceReport.lowConfidenceTools.length > 0) {
83
+ if (diff.performanceReport.lowConfidenceTools &&
84
+ diff.performanceReport.lowConfidenceTools.length > 0) {
86
85
  lines.push(yellow(' Note: Some tools have low confidence metrics.'));
87
86
  lines.push(yellow(` Run with more samples for reliable baselines: ${diff.performanceReport.lowConfidenceTools.join(', ')}`));
88
87
  lines.push('');
@@ -94,7 +93,8 @@ export function formatDiffText(diff, useColors = true) {
94
93
  lines.push('');
95
94
  }
96
95
  // Performance confidence changes
97
- if (diff.performanceReport?.confidenceChanges && diff.performanceReport.confidenceChanges.length > 0) {
96
+ if (diff.performanceReport?.confidenceChanges &&
97
+ diff.performanceReport.confidenceChanges.length > 0) {
98
98
  lines.push(cyan('─── Confidence Changes ───'));
99
99
  for (const change of diff.performanceReport.confidenceChanges) {
100
100
  const icon = change.improved ? green('↑') : change.degraded ? yellow('↓') : '→';
@@ -212,7 +212,8 @@ export function formatDiffText(diff, useColors = true) {
212
212
  lines.push(` Info: ${diff.infoCount}`);
213
213
  if (diff.performanceReport) {
214
214
  lines.push(` Performance regressions: ${diff.performanceReport.regressionCount}`);
215
- if (diff.performanceReport.lowConfidenceTools && diff.performanceReport.lowConfidenceTools.length > 0) {
215
+ if (diff.performanceReport.lowConfidenceTools &&
216
+ diff.performanceReport.lowConfidenceTools.length > 0) {
216
217
  lines.push(` Low confidence tools: ${diff.performanceReport.lowConfidenceTools.length}`);
217
218
  }
218
219
  }
@@ -276,7 +277,8 @@ export function formatDiffCompact(diff) {
276
277
  if (diff.performanceReport?.regressionCount ?? 0 > 0) {
277
278
  parts.push(`perf_regressions=${diff.performanceReport?.regressionCount}`);
278
279
  }
279
- if (diff.performanceReport?.lowConfidenceTools && diff.performanceReport.lowConfidenceTools.length > 0) {
280
+ if (diff.performanceReport?.lowConfidenceTools &&
281
+ diff.performanceReport.lowConfidenceTools.length > 0) {
280
282
  parts.push(`low_confidence_tools=${diff.performanceReport.lowConfidenceTools.length}`);
281
283
  }
282
284
  if (diff.securityReport) {
@@ -336,8 +338,11 @@ export function formatDiffGitHubActions(diff) {
336
338
  lines.push(`::notice::Minor changes: ${diff.summary}`);
337
339
  }
338
340
  for (const change of diff.behaviorChanges) {
339
- const level = change.severity === 'breaking' ? 'error' :
340
- change.severity === 'warning' ? 'warning' : 'notice';
341
+ const level = change.severity === 'breaking'
342
+ ? 'error'
343
+ : change.severity === 'warning'
344
+ ? 'warning'
345
+ : 'notice';
341
346
  lines.push(`::${level}::${change.tool} - ${change.description}`);
342
347
  }
343
348
  for (const tool of diff.toolsRemoved) {
@@ -355,7 +360,8 @@ export function formatDiffGitHubActions(diff) {
355
360
  }
356
361
  }
357
362
  // Low confidence warning
358
- if (diff.performanceReport?.lowConfidenceTools && diff.performanceReport.lowConfidenceTools.length > 0) {
363
+ if (diff.performanceReport?.lowConfidenceTools &&
364
+ diff.performanceReport.lowConfidenceTools.length > 0) {
359
365
  lines.push(`::notice::Low confidence metrics for ${diff.performanceReport.lowConfidenceTools.length} tool(s): ${diff.performanceReport.lowConfidenceTools.join(', ')}`);
360
366
  }
361
367
  // Security findings
@@ -440,7 +446,9 @@ export function formatDiffMarkdown(diff) {
440
446
  toolDiff.schemaChanged ? 'Schema changed' : '',
441
447
  toolDiff.descriptionChanged ? 'Description changed' : '',
442
448
  `${toolDiff.changes.length} change(s)`,
443
- ].filter(Boolean).join(', ');
449
+ ]
450
+ .filter(Boolean)
451
+ .join(', ');
444
452
  lines.push(`| ${toolDiff.tool} | ⚠️ Modified | ${details} |`);
445
453
  }
446
454
  lines.push('');
@@ -451,8 +459,7 @@ export function formatDiffMarkdown(diff) {
451
459
  lines.push('| Tool | Aspect | Severity | Description |');
452
460
  lines.push('|------|--------|----------|-------------|');
453
461
  for (const change of diff.behaviorChanges) {
454
- const sevEmoji = change.severity === 'breaking' ? '🔴' :
455
- change.severity === 'warning' ? '🟡' : '🟢';
462
+ const sevEmoji = change.severity === 'breaking' ? '🔴' : change.severity === 'warning' ? '🟡' : '🟢';
456
463
  lines.push(`| ${change.tool} | ${change.aspect} | ${sevEmoji} ${change.severity} | ${change.description} |`);
457
464
  }
458
465
  lines.push('');
@@ -503,7 +510,11 @@ export function formatDiffMarkdown(diff) {
503
510
  lines.push('|------|--------|---------|');
504
511
  for (const issue of schemaReport.toolsWithIssues) {
505
512
  const statusIcon = issue.isBreaking ? '🔴' : issue.becameUnstable ? '🟡' : '🔵';
506
- const status = issue.isBreaking ? 'Breaking' : issue.becameUnstable ? 'Unstable' : 'Changed';
513
+ const status = issue.isBreaking
514
+ ? 'Breaking'
515
+ : issue.becameUnstable
516
+ ? 'Unstable'
517
+ : 'Changed';
507
518
  lines.push(`| ${issue.toolName} | ${statusIcon} ${status} | ${issue.summary} |`);
508
519
  }
509
520
  lines.push('');
@@ -525,11 +536,13 @@ export function formatDiffMarkdown(diff) {
525
536
  lines.push(`⚠️ **Error behavior changed**: ${et.summary}`);
526
537
  lines.push('');
527
538
  }
528
- if (et.newCategories.length > 0 || et.resolvedCategories.length > 0 ||
529
- et.increasingCategories.length > 0 || et.decreasingCategories.length > 0) {
539
+ if (et.newCategories.length > 0 ||
540
+ et.resolvedCategories.length > 0 ||
541
+ et.increasingCategories.length > 0 ||
542
+ et.decreasingCategories.length > 0) {
530
543
  lines.push('| Category | Trend | Previous | Current | Change |');
531
544
  lines.push('|----------|-------|----------|---------|--------|');
532
- for (const trend of et.trends.filter(t => t.trend !== 'stable')) {
545
+ for (const trend of et.trends.filter((t) => t.trend !== 'stable')) {
533
546
  const trendEmoji = getTrendEmoji(trend.trend);
534
547
  const changeStr = trend.changePercent !== 0
535
548
  ? `${trend.changePercent > 0 ? '+' : ''}${trend.changePercent}%`
@@ -543,7 +556,8 @@ export function formatDiffMarkdown(diff) {
543
556
  // Performance section
544
557
  if (diff.performanceReport) {
545
558
  const perfReport = diff.performanceReport;
546
- if (perfReport.hasRegressions || perfReport.improvementCount > 0 ||
559
+ if (perfReport.hasRegressions ||
560
+ perfReport.improvementCount > 0 ||
547
561
  (perfReport.lowConfidenceTools && perfReport.lowConfidenceTools.length > 0)) {
548
562
  lines.push('### Performance');
549
563
  lines.push('');
@@ -605,7 +619,8 @@ export function formatDiffMarkdown(diff) {
605
619
  lines.push(`- Info: **${diff.infoCount}**`);
606
620
  if (diff.performanceReport) {
607
621
  lines.push(`- Performance regressions: **${diff.performanceReport.regressionCount}**`);
608
- if (diff.performanceReport.lowConfidenceTools && diff.performanceReport.lowConfidenceTools.length > 0) {
622
+ if (diff.performanceReport.lowConfidenceTools &&
623
+ diff.performanceReport.lowConfidenceTools.length > 0) {
609
624
  lines.push(`- Low confidence tools: **${diff.performanceReport.lowConfidenceTools.length}**`);
610
625
  }
611
626
  }
@@ -674,9 +689,7 @@ function getTrendEmoji(trend) {
674
689
  */
675
690
  export function formatDiffJUnit(diff, suiteName = 'bellwether') {
676
691
  const timestamp = new Date().toISOString();
677
- const totalTests = diff.toolsAdded.length +
678
- diff.toolsRemoved.length +
679
- diff.behaviorChanges.length;
692
+ const totalTests = diff.toolsAdded.length + diff.toolsRemoved.length + diff.behaviorChanges.length;
680
693
  const failures = diff.breakingCount;
681
694
  const errors = 0;
682
695
  const skipped = 0;
@@ -743,7 +756,8 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
743
756
  lines.push(' </testcase>');
744
757
  }
745
758
  // Low confidence tools
746
- if (diff.performanceReport.lowConfidenceTools && diff.performanceReport.lowConfidenceTools.length > 0) {
759
+ if (diff.performanceReport.lowConfidenceTools &&
760
+ diff.performanceReport.lowConfidenceTools.length > 0) {
747
761
  for (const tool of diff.performanceReport.lowConfidenceTools) {
748
762
  const name = escapeXml(`confidence-${tool}`);
749
763
  lines.push(` <testcase name="${name}" classname="drift.confidence">`);
@@ -805,7 +819,8 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
805
819
  lines.push(' </testcase>');
806
820
  }
807
821
  // Show stable schemas as passing tests
808
- if (diff.schemaEvolutionReport.stableCount > 0 && diff.schemaEvolutionReport.toolsWithIssues.length === 0) {
822
+ if (diff.schemaEvolutionReport.stableCount > 0 &&
823
+ diff.schemaEvolutionReport.toolsWithIssues.length === 0) {
809
824
  lines.push(` <testcase name="schema-stability-check" classname="drift.schema">`);
810
825
  lines.push(` <system-out>${diff.schemaEvolutionReport.stableCount} tool(s) have stable response schemas</system-out>`);
811
826
  lines.push(' </testcase>');
@@ -830,7 +845,7 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
830
845
  }
831
846
  // Increasing error types (warnings)
832
847
  for (const category of et.increasingCategories) {
833
- const trend = et.trends.find(t => t.category === category);
848
+ const trend = et.trends.find((t) => t.category === category);
834
849
  const name = escapeXml(`error-trend-increasing-${category}`);
835
850
  lines.push(` <testcase name="${name}" classname="drift.errors">`);
836
851
  lines.push(` <system-err>[WARNING] Error frequency increasing: ${escapeXml(category)}${trend ? ` (+${trend.changePercent}%)` : ''}</system-err>`);
@@ -878,7 +893,7 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
878
893
  * Format diff as SARIF (Static Analysis Results Interchange Format) for GitHub Code Scanning.
879
894
  *
880
895
  * SARIF is the standard format for GitHub's code scanning feature and can be
881
- * uploaded to show drift detection results in pull request reviews.
896
+ * used to show drift detection results in pull request reviews.
882
897
  *
883
898
  * @see https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html
884
899
  *
@@ -1313,11 +1328,13 @@ export function formatDiffSarif(diff, baselinePath = 'bellwether-baseline.json')
1313
1328
  }
1314
1329
  // Increasing error types
1315
1330
  for (const category of et.increasingCategories) {
1316
- const trend = et.trends.find(t => t.category === category);
1331
+ const trend = et.trends.find((t) => t.category === category);
1317
1332
  results.push({
1318
1333
  ruleId: 'BWH011',
1319
1334
  level: 'warning',
1320
- message: { text: `Error frequency increasing: ${category}${trend ? ` (+${trend.changePercent}%)` : ''}` },
1335
+ message: {
1336
+ text: `Error frequency increasing: ${category}${trend ? ` (+${trend.changePercent}%)` : ''}`,
1337
+ },
1321
1338
  locations: [
1322
1339
  {
1323
1340
  physicalLocation: {
@@ -69,7 +69,7 @@ export interface ErrorAnalysisSummary {
69
69
  /** Unique remediations suggested */
70
70
  remediations: string[];
71
71
  /** Counts by error category */
72
- categoryCounts: Map<string, number>;
72
+ categoryCounts: Map<string, number> | Record<string, number>;
73
73
  /** Top root causes (most common) */
74
74
  topRootCauses: string[];
75
75
  /** Top remediations (most actionable) */
@@ -22,7 +22,8 @@ export function analyzeError(errorMessage, context) {
22
22
  let statusCategory = categorizeHttpStatus(httpStatus);
23
23
  const wasExpected = context?.wasExpected ?? context?.expectedOutcome === 'error';
24
24
  // If the error was expected (validation test), recategorize it
25
- if (wasExpected && (statusCategory === 'client_error_validation' || statusCategory === 'unknown')) {
25
+ if (wasExpected &&
26
+ (statusCategory === 'client_error_validation' || statusCategory === 'unknown')) {
26
27
  statusCategory = 'validation_expected';
27
28
  }
28
29
  const rootCause = wasExpected
@@ -96,7 +97,9 @@ export function generateErrorSummary(toolName, patterns) {
96
97
  }
97
98
  }
98
99
  // Count transient and actionable errors
99
- const transientErrors = analyses.filter((a) => a.transient).reduce((sum, a) => sum + a.pattern.count, 0);
100
+ const transientErrors = analyses
101
+ .filter((a) => a.transient)
102
+ .reduce((sum, a) => sum + a.pattern.count, 0);
100
103
  const actionableCount = analyses.filter((a) => a.remediation && !a.remediation.includes('Review')).length;
101
104
  // Collect unique remediations with frequency
102
105
  const remediationCounts = new Map();
@@ -324,7 +327,9 @@ export function inferRootCause(message, category) {
324
327
  if (lower.includes('invalid') || lower.includes('malformed')) {
325
328
  return 'Invalid input format or value';
326
329
  }
327
- if (lower.includes('not found') || lower.includes('does not exist') || lower.includes("doesn't exist")) {
330
+ if (lower.includes('not found') ||
331
+ lower.includes('does not exist') ||
332
+ lower.includes("doesn't exist")) {
328
333
  return 'Referenced resource does not exist';
329
334
  }
330
335
  if (lower.includes('already exists') || lower.includes('duplicate')) {
@@ -333,7 +338,9 @@ export function inferRootCause(message, category) {
333
338
  if (lower.includes('unauthorized') || lower.includes('authentication')) {
334
339
  return 'Authentication credentials missing or invalid';
335
340
  }
336
- if (lower.includes('forbidden') || lower.includes('permission') || lower.includes('access denied')) {
341
+ if (lower.includes('forbidden') ||
342
+ lower.includes('permission') ||
343
+ lower.includes('access denied')) {
337
344
  return 'Insufficient permissions for this operation';
338
345
  }
339
346
  if (lower.includes('rate') || lower.includes('throttl') || lower.includes('too many')) {
@@ -480,15 +487,69 @@ export function extractRelatedParameters(message) {
480
487
  */
481
488
  function isCommonWord(word) {
482
489
  const commonWords = new Set([
483
- 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
484
- 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
485
- 'could', 'should', 'may', 'might', 'must', 'shall', 'can',
486
- 'need', 'not', 'and', 'but', 'or', 'if', 'then', 'else',
487
- 'for', 'with', 'from', 'this', 'that', 'these', 'those',
488
- 'error', 'message', 'failed', 'invalid', 'missing', 'required',
489
- 'found', 'exist', 'exists', 'value', 'input', 'output', 'type',
490
- 'string', 'number', 'boolean', 'object', 'array', 'null', 'undefined',
491
- 'field', 'parameter', 'property', 'argument', 'key',
490
+ 'the',
491
+ 'is',
492
+ 'are',
493
+ 'was',
494
+ 'were',
495
+ 'be',
496
+ 'been',
497
+ 'being',
498
+ 'have',
499
+ 'has',
500
+ 'had',
501
+ 'do',
502
+ 'does',
503
+ 'did',
504
+ 'will',
505
+ 'would',
506
+ 'could',
507
+ 'should',
508
+ 'may',
509
+ 'might',
510
+ 'must',
511
+ 'shall',
512
+ 'can',
513
+ 'need',
514
+ 'not',
515
+ 'and',
516
+ 'but',
517
+ 'or',
518
+ 'if',
519
+ 'then',
520
+ 'else',
521
+ 'for',
522
+ 'with',
523
+ 'from',
524
+ 'this',
525
+ 'that',
526
+ 'these',
527
+ 'those',
528
+ 'error',
529
+ 'message',
530
+ 'failed',
531
+ 'invalid',
532
+ 'missing',
533
+ 'required',
534
+ 'found',
535
+ 'exist',
536
+ 'exists',
537
+ 'value',
538
+ 'input',
539
+ 'output',
540
+ 'type',
541
+ 'string',
542
+ 'number',
543
+ 'boolean',
544
+ 'object',
545
+ 'array',
546
+ 'null',
547
+ 'undefined',
548
+ 'field',
549
+ 'parameter',
550
+ 'property',
551
+ 'argument',
552
+ 'key',
492
553
  ]);
493
554
  return commonWords.has(word.toLowerCase());
494
555
  }
@@ -510,9 +571,19 @@ export function isTransientError(category, message) {
510
571
  return true;
511
572
  // Check for transient keywords
512
573
  const transientKeywords = [
513
- 'timeout', 'timed out', 'temporarily', 'retry', 'unavailable',
514
- 'connection', 'network', 'service unavailable', 'too many requests',
515
- 'try again', 'overloaded', 'busy', 'maintenance',
574
+ 'timeout',
575
+ 'timed out',
576
+ 'temporarily',
577
+ 'retry',
578
+ 'unavailable',
579
+ 'connection',
580
+ 'network',
581
+ 'service unavailable',
582
+ 'too many requests',
583
+ 'try again',
584
+ 'overloaded',
585
+ 'busy',
586
+ 'maintenance',
516
587
  ];
517
588
  return transientKeywords.some((keyword) => lower.includes(keyword));
518
589
  }
@@ -661,7 +732,9 @@ export function formatErrorTrendReport(report, useColors = false) {
661
732
  lines.push(' Trend details:');
662
733
  for (const trend of report.trends.filter((t) => t.trend !== 'stable')) {
663
734
  const arrow = getTrendArrow(trend.trend);
664
- const changeText = trend.changePercent !== 0 ? ` (${trend.changePercent > 0 ? '+' : ''}${trend.changePercent}%)` : '';
735
+ const changeText = trend.changePercent !== 0
736
+ ? ` (${trend.changePercent > 0 ? '+' : ''}${trend.changePercent}%)`
737
+ : '';
665
738
  lines.push(` ${arrow} ${trend.category}: ${trend.previousCount} → ${trend.currentCount}${changeText}`);
666
739
  }
667
740
  return lines.join('\n');
@@ -26,7 +26,7 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
26
26
  // If no baseline or force retest, test everything
27
27
  if (!baseline || forceRetest) {
28
28
  return {
29
- toolsToTest: currentTools.map(t => t.name),
29
+ toolsToTest: currentTools.map((t) => t.name),
30
30
  toolsToSkip: [],
31
31
  cachedFingerprints: [],
32
32
  changeSummary: {
@@ -34,7 +34,7 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
34
34
  changedTools: 0,
35
35
  unchangedTools: 0,
36
36
  removedTools: 0,
37
- newToolNames: baseline ? [] : currentTools.map(t => t.name),
37
+ newToolNames: baseline ? [] : currentTools.map((t) => t.name),
38
38
  changedToolNames: [],
39
39
  removedToolNames: [],
40
40
  },
@@ -47,8 +47,8 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
47
47
  const changedToolNames = [];
48
48
  const removedToolNames = [];
49
49
  // Build maps for comparison
50
- const baselineToolMap = new Map(getToolFingerprints(baseline).map(t => [t.name, t]));
51
- const currentToolSet = new Set(currentTools.map(t => t.name));
50
+ const baselineToolMap = new Map(getToolFingerprints(baseline).map((t) => [t.name, t]));
51
+ const currentToolSet = new Set(currentTools.map((t) => t.name));
52
52
  // Check current tools against baseline
53
53
  for (const tool of currentTools) {
54
54
  const baselineTool = baselineToolMap.get(tool.name);
@@ -65,7 +65,10 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
65
65
  }
66
66
  // Check if schema changed
67
67
  const currentSchemaHash = computeSchemaHash(tool.inputSchema);
68
- const baselineSchemaHash = baselineTool.schemaHash;
68
+ const baselineSchemaHash = baselineTool.inputSchemaHashAtTest ??
69
+ (baselineTool.inputSchema
70
+ ? computeSchemaHash(baselineTool.inputSchema)
71
+ : baselineTool.schemaHash);
69
72
  if (currentSchemaHash !== baselineSchemaHash) {
70
73
  // Schema changed - needs retesting
71
74
  toolsToTest.push(tool.name);
@@ -6,26 +6,16 @@ export { getBaselineGeneratedAt, getBaselineHash, getBaselineServerCommand, getB
6
6
  export { createBaseline, saveBaseline, loadBaseline, verifyBaselineHash, baselineExists, recalculateBaselineHash, acceptDrift, hasAcceptance, clearAcceptance, type LoadBaselineOptions, type AcceptDriftOptions, } from './saver.js';
7
7
  export { compareWithBaseline, compareBaselines, hasBreakingChanges, hasSecurityChanges, filterByMinimumSeverity, checkBaselineVersionCompatibility, compareSeverity, severityMeetsThreshold, applyAspectOverride, applySeverityConfig, shouldFailOnDiff, } from './comparator.js';
8
8
  export { formatDiffText, formatDiffJson, formatDiffCompact, formatDiffGitHubActions, formatDiffMarkdown, formatDiffJUnit, formatDiffSarif, formatSecurityReport, } from './diff.js';
9
- export { createCloudBaseline, } from './converter.js';
9
+ export { createBaselineFromInterview } from './converter.js';
10
10
  export { computeSchemaHash, compareSchemas, computeConsensusSchemaHash, type SchemaChangeType, type SchemaChange, type SchemaComparisonResult, } from './schema-compare.js';
11
11
  export { getBaselineVersion, parseVersion, areVersionsCompatible, compareVersions, getCompatibilityWarning, checkVersionCompatibility, assertVersionCompatibility, formatVersion, isCurrentVersion, isOlderVersion, isNewerVersion, requiresMigration, BaselineVersionError, type FormatVersion, type VersionCompatibility, } from './version.js';
12
12
  export { analyzeForIncremental, mergeFingerprints, formatIncrementalSummary, isIncrementalWorthwhile, addIncrementalMetadata, type IncrementalCheckResult, type IncrementalChangeSummary, type IncrementalCheckOptions, } from './incremental-checker.js';
13
13
  export { analyzeResponses, inferSchemaFromValue, compareFingerprints, compareErrorPatterns, computeInferredSchemaHash, type ResponseFingerprint, type ResponseContentType, type ResponseSize, type InferredSchema, type ErrorPattern, type ResponseAnalysis, type FingerprintDiff, type FingerprintChange, type ErrorPatternDiff, } from './response-fingerprint.js';
14
14
  export { compareInferredSchemas, buildSchemaEvolution, compareSchemaEvolution, formatSchemaEvolution, formatSchemaEvolutionDiff, hasSchemaEvolutionIssues, getSchemaStabilityGrade, type ResponseSchemaEvolution, type SchemaVersion as SchemaEvolutionVersion, type SchemaEvolutionDiff, type SchemaTypeChange, } from './response-schema-tracker.js';
15
- export { analyzeToolChangeImpact, analyzeDiffImpact, analyzeSchemaChanges, isBreakingChange, getBreakingChangeSummary, CHANGE_IMPACT, type SchemaChangeType as ImpactSchemaChangeType, type SchemaChangeDetail, type MigrationComplexity, type ChangeImpact, type DiffImpactAnalysis, type ActionItem, } from './change-impact-analyzer.js';
16
- export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence, PERFORMANCE, type LatencyTrend, type ToolPerformanceMetrics, type PerformanceBaseline, type PerformanceComparison, type PerformanceReport, type LatencySample, } from './performance-tracker.js';
17
- export { checkDeprecations, checkToolDeprecation, markAsDeprecated, clearDeprecation, getDeprecatedTools, getExpiredTools, getUpcomingRemovals, formatDeprecationWarning, formatDeprecationReport, shouldFailOnDeprecation, DEPRECATION, DEPRECATION_DEFAULTS, DEPRECATION_THRESHOLDS, type DeprecationStatus, type DeprecationWarning, type DeprecationReport, type DeprecationConfig, } from './deprecation-tracker.js';
18
- export { calculateHealthScore, formatHealthScore, meetsHealthThreshold, getHealthBadgeColor, createHealthHistoryEntry, HEALTH_SCORING, HEALTH_WEIGHTS, GRADE_THRESHOLDS, SEVERITY_THRESHOLDS, HEALTH_PENALTIES, type HealthTrend, type ActionPriority, type HealthActionItem, type HealthComponents, type HealthScore, type HealthHistory, type HealthInput, } from './health-scorer.js';
19
- export { buildServerTimeline, buildToolTimeline, formatTimeline, formatServerTimelineSummary, generateVisualTimeline, serializeTimeline, deserializeTimeline, serializeServerTimeline, deserializeServerTimeline, getMostActiveTools, getMostBreakingTools, getBreakingChanges, getVersionAtTime, getChangesBetween, hadBreakingChanges, type SchemaEventType, type SchemaVersion, type SchemaTimeline, type ServerTimeline, type DeprecationEvent, type TimelineStats, type TimelineBuildOptions, } from './schema-evolution.js';
20
- export { generateMigrationGuide, formatMigrationGuideMarkdown, formatMigrationGuideText, hasBreakingMigrationChanges, getBreakingTools, type MigrationEffort, type MigrationStepType, type CodeExample, type BreakingChange, type MigrationStep, type MigrationGuide, type MigrationStats, } from './migration-generator.js';
21
- export { generateToolScenarios, generateBaselineScenarios, formatScenariosAsYaml, formatScenariosReport, getScenariosByPriority, getScenariosByCategory, getCriticalScenarios, getSecurityScenarios, type ScenarioCategory, type ScenarioPriority, type TestScenario, type AutoGeneratedScenarios, type ScenarioGenerationSummary, type ScenarioGenerationResult, type ScenarioGenerationConfig, } from './scenario-generator.js';
22
- export { generatePRComment, generateCompactPRComment, generateCIStatusSummary, generateDiffTable, generateBadgeUrl, generateBadgeMarkdown, getBadgeColor, shouldBlockMerge, getSeverityEmoji, type BadgeColor, type CommentSection, type AffectedWorkflow, type PRComment, type PRCommentConfig, } from './pr-comment-generator.js';
15
+ export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence, type LatencyTrend, type ToolPerformanceMetrics, type PerformanceBaseline, type PerformanceComparison, type PerformanceReport, type LatencySample, } from './performance-tracker.js';
23
16
  export type { SecurityCategory, RiskLevel, SecurityPayload, SecurityTestResult, SecurityFinding, SecurityFingerprint, SecurityDiff, SecurityTestOptions, SecurityTestContext, SecurityToolCallResult, SecurityReport, } from '../security/types.js';
24
17
  export { runSecurityTests, compareSecurityFingerprints, getRiskLevelFromScore, parseSecurityCategories, getPayloadsForCategory, getAllSecurityPayloads, getAllSecurityCategories, } from '../security/index.js';
25
18
  export type { HttpStatusCategory, ErrorSeverity, EnhancedErrorAnalysis, ErrorAnalysisSummary, ErrorTrend, ErrorTrendReport, } from './error-analyzer.js';
26
19
  export { analyzeError, analyzeErrorPatterns, generateErrorSummary, analyzeErrorTrends, extractHttpStatus, categorizeHttpStatus, inferRootCause, generateRemediation, extractRelatedParameters, isTransientError, assessErrorSeverity, mapStatusToErrorCategory, formatEnhancedError, formatErrorTrendReport, formatCategoryName, } from './error-analyzer.js';
27
20
  export { scoreDocumentation, scoreToolDocumentation, calculateDescriptionCoverage, calculateDescriptionQuality, calculateParameterDocumentation, calculateExampleCoverage, hasExamples, scoreToGrade, generateSuggestions, compareDocumentationScores, formatDocumentationScore, formatDocumentationScoreCompact, formatDocumentationScoreChange, toDocumentationScoreSummary, getGradeIndicator, getGradeBadgeColor, meetsDocumentationThreshold, meetsDocumentationGrade, type DocumentationIssueSeverity, type DocumentationIssueType, } from './documentation-scorer.js';
28
- export { calculateAICompatibilityScore, generateAICompatibilityMarkdown, type AICompatibilityScore, type ScoreComponent, type AICompatibilityRecommendation, type ToolAIScore, type AICompatibilityInput, } from './ai-compatibility-scorer.js';
29
- export { calculateRiskScore, generateRiskScoreMarkdown, type RegressionRiskScore, type RiskFactor, } from './risk-scorer.js';
30
- export { calculatePruningDecisions, calculateToolPruning, prioritizeTools, generatePruningSummary, generatePruningMarkdown, type TestCategory, type TestCategoryDecision, type ToolPruningDecision, type ToolCharacteristics, type PruningInput, type PruningSummary, } from './test-pruner.js';
31
21
  //# sourceMappingURL=index.d.ts.map
@@ -5,39 +5,19 @@ export { getBaselineGeneratedAt, getBaselineHash, getBaselineServerCommand, getB
5
5
  export { createBaseline, saveBaseline, loadBaseline, verifyBaselineHash, baselineExists, recalculateBaselineHash, acceptDrift, hasAcceptance, clearAcceptance, } from './saver.js';
6
6
  export { compareWithBaseline, compareBaselines, hasBreakingChanges, hasSecurityChanges, filterByMinimumSeverity, checkBaselineVersionCompatibility, compareSeverity, severityMeetsThreshold, applyAspectOverride, applySeverityConfig, shouldFailOnDiff, } from './comparator.js';
7
7
  export { formatDiffText, formatDiffJson, formatDiffCompact, formatDiffGitHubActions, formatDiffMarkdown, formatDiffJUnit, formatDiffSarif, formatSecurityReport, } from './diff.js';
8
- export { createCloudBaseline, } from './converter.js';
8
+ export { createBaselineFromInterview } from './converter.js';
9
9
  export { computeSchemaHash, compareSchemas, computeConsensusSchemaHash, } from './schema-compare.js';
10
10
  export { getBaselineVersion, parseVersion, areVersionsCompatible, compareVersions, getCompatibilityWarning, checkVersionCompatibility, assertVersionCompatibility, formatVersion, isCurrentVersion, isOlderVersion, isNewerVersion, requiresMigration, BaselineVersionError, } from './version.js';
11
- // Legacy baseline migrations removed; cloud baseline is canonical.
11
+ // Legacy baseline migrations removed; current baseline format is canonical.
12
12
  // Incremental checking
13
13
  export { analyzeForIncremental, mergeFingerprints, formatIncrementalSummary, isIncrementalWorthwhile, addIncrementalMetadata, } from './incremental-checker.js';
14
14
  export { analyzeResponses, inferSchemaFromValue, compareFingerprints, compareErrorPatterns, computeInferredSchemaHash, } from './response-fingerprint.js';
15
15
  // Response schema evolution tracking
16
16
  export { compareInferredSchemas, buildSchemaEvolution, compareSchemaEvolution, formatSchemaEvolution, formatSchemaEvolutionDiff, hasSchemaEvolutionIssues, getSchemaStabilityGrade, } from './response-schema-tracker.js';
17
- // Change impact analysis
18
- export { analyzeToolChangeImpact, analyzeDiffImpact, analyzeSchemaChanges, isBreakingChange, getBreakingChangeSummary, CHANGE_IMPACT, } from './change-impact-analyzer.js';
19
17
  // Performance tracking
20
- export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence, PERFORMANCE, } from './performance-tracker.js';
21
- // Deprecation tracking
22
- export { checkDeprecations, checkToolDeprecation, markAsDeprecated, clearDeprecation, getDeprecatedTools, getExpiredTools, getUpcomingRemovals, formatDeprecationWarning, formatDeprecationReport, shouldFailOnDeprecation, DEPRECATION, DEPRECATION_DEFAULTS, DEPRECATION_THRESHOLDS, } from './deprecation-tracker.js';
23
- // Health scoring
24
- export { calculateHealthScore, formatHealthScore, meetsHealthThreshold, getHealthBadgeColor, createHealthHistoryEntry, HEALTH_SCORING, HEALTH_WEIGHTS, GRADE_THRESHOLDS, SEVERITY_THRESHOLDS, HEALTH_PENALTIES, } from './health-scorer.js';
25
- // Schema evolution timeline
26
- export { buildServerTimeline, buildToolTimeline, formatTimeline, formatServerTimelineSummary, generateVisualTimeline, serializeTimeline, deserializeTimeline, serializeServerTimeline, deserializeServerTimeline, getMostActiveTools, getMostBreakingTools, getBreakingChanges, getVersionAtTime, getChangesBetween, hadBreakingChanges, } from './schema-evolution.js';
27
- // Migration guide generation
28
- export { generateMigrationGuide, formatMigrationGuideMarkdown, formatMigrationGuideText, hasBreakingMigrationChanges, getBreakingTools, } from './migration-generator.js';
29
- // Auto-generated test scenarios
30
- export { generateToolScenarios, generateBaselineScenarios, formatScenariosAsYaml, formatScenariosReport, getScenariosByPriority, getScenariosByCategory, getCriticalScenarios, getSecurityScenarios, } from './scenario-generator.js';
31
- // Enhanced PR comments
32
- export { generatePRComment, generateCompactPRComment, generateCIStatusSummary, generateDiffTable, generateBadgeUrl, generateBadgeMarkdown, getBadgeColor, shouldBlockMerge, getSeverityEmoji, } from './pr-comment-generator.js';
18
+ export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence, } from './performance-tracker.js';
33
19
  export { runSecurityTests, compareSecurityFingerprints, getRiskLevelFromScore, parseSecurityCategories, getPayloadsForCategory, getAllSecurityPayloads, getAllSecurityCategories, } from '../security/index.js';
34
20
  export { analyzeError, analyzeErrorPatterns, generateErrorSummary, analyzeErrorTrends, extractHttpStatus, categorizeHttpStatus, inferRootCause, generateRemediation, extractRelatedParameters, isTransientError, assessErrorSeverity, mapStatusToErrorCategory, formatEnhancedError, formatErrorTrendReport, formatCategoryName, } from './error-analyzer.js';
35
21
  // Documentation quality scoring
36
22
  export { scoreDocumentation, scoreToolDocumentation, calculateDescriptionCoverage, calculateDescriptionQuality, calculateParameterDocumentation, calculateExampleCoverage, hasExamples, scoreToGrade, generateSuggestions, compareDocumentationScores, formatDocumentationScore, formatDocumentationScoreCompact, formatDocumentationScoreChange, toDocumentationScoreSummary, getGradeIndicator, getGradeBadgeColor, meetsDocumentationThreshold, meetsDocumentationGrade, } from './documentation-scorer.js';
37
- // AI Agent Compatibility Scoring
38
- export { calculateAICompatibilityScore, generateAICompatibilityMarkdown, } from './ai-compatibility-scorer.js';
39
- // Regression Risk Scoring
40
- export { calculateRiskScore, generateRiskScoreMarkdown, } from './risk-scorer.js';
41
- // Intelligent Test Pruning
42
- export { calculatePruningDecisions, calculateToolPruning, prioritizeTools, generatePruningSummary, generatePruningMarkdown, } from './test-pruner.js';
43
23
  //# sourceMappingURL=index.js.map
@@ -132,7 +132,6 @@ export interface LatencySample {
132
132
  */
133
133
  outcomeCorrect?: boolean;
134
134
  }
135
- export { PERFORMANCE_TRACKING as PERFORMANCE } from '../constants.js';
136
135
  /**
137
136
  * Calculate statistical confidence for performance metrics.
138
137
  *
@@ -6,8 +6,6 @@
6
6
  */
7
7
  import { getBaselineGeneratedAt, getToolFingerprints } from './accessors.js';
8
8
  import { PERFORMANCE_TRACKING, PERFORMANCE_CONFIDENCE } from '../constants.js';
9
- // Re-export centralized constant for backwards compatibility
10
- export { PERFORMANCE_TRACKING as PERFORMANCE } from '../constants.js';
11
9
  /**
12
10
  * Calculate statistical confidence for performance metrics.
13
11
  *
@@ -45,14 +43,14 @@ export function calculatePerformanceConfidence(samples, options = {}) {
45
43
  }
46
44
  // Categorize samples by expected outcome
47
45
  // Happy path tests: expectedOutcome === 'success' or undefined (backward compat)
48
- const happyPathSamples = samples.filter(s => s.expectedOutcome === 'success' || s.expectedOutcome === undefined);
46
+ const happyPathSamples = samples.filter((s) => s.expectedOutcome === 'success' || s.expectedOutcome === undefined);
49
47
  // Validation tests: expectedOutcome === 'error'
50
- const validationTestSamples = samples.filter(s => s.expectedOutcome === 'error');
48
+ const validationTestSamples = samples.filter((s) => s.expectedOutcome === 'error');
51
49
  // Count validation samples that correctly rejected (error as expected = success)
52
- const validationSuccesses = validationTestSamples.filter(s => !s.success && (s.outcomeCorrect === undefined || s.outcomeCorrect === true)).length;
50
+ const validationSuccesses = validationTestSamples.filter((s) => !s.success && (s.outcomeCorrect === undefined || s.outcomeCorrect === true)).length;
53
51
  // For confidence, only use happy path samples that succeeded
54
- const successfulHappyPath = happyPathSamples.filter(s => s.success);
55
- const allDurations = successfulHappyPath.map(s => s.durationMs);
52
+ const successfulHappyPath = happyPathSamples.filter((s) => s.success);
53
+ const allDurations = successfulHappyPath.map((s) => s.durationMs);
56
54
  // Handle all failures case
57
55
  if (allDurations.length === 0) {
58
56
  return {
@@ -69,12 +67,10 @@ export function calculatePerformanceConfidence(samples, options = {}) {
69
67
  // For variance calculation, exclude the first sample (cold start warmup)
70
68
  // This prevents JIT compilation, connection setup, and cache warming from
71
69
  // inflating the coefficient of variation and lowering confidence scores.
72
- const durationsForVariance = excludeWarmup && allDurations.length > 1
73
- ? allDurations.slice(1)
74
- : allDurations;
70
+ const durationsForVariance = excludeWarmup && allDurations.length > 1 ? allDurations.slice(1) : allDurations;
75
71
  // Calculate variance using post-warmup samples only
76
72
  const meanForVariance = durationsForVariance.reduce((sum, d) => sum + d, 0) / durationsForVariance.length;
77
- const squaredDiffs = durationsForVariance.map(d => Math.pow(d - meanForVariance, 2));
73
+ const squaredDiffs = durationsForVariance.map((d) => Math.pow(d - meanForVariance, 2));
78
74
  const variance = squaredDiffs.reduce((sum, d) => sum + d, 0) / durationsForVariance.length;
79
75
  const standardDeviation = Math.sqrt(variance);
80
76
  // Calculate coefficient of variation (CV = stdDev / mean)
@@ -186,8 +182,8 @@ export function calculateMetrics(samples) {
186
182
  return null;
187
183
  }
188
184
  const toolName = samples[0].toolName;
189
- const successfulSamples = samples.filter(s => s.success);
190
- const durations = successfulSamples.map(s => s.durationMs).sort((a, b) => a - b);
185
+ const successfulSamples = samples.filter((s) => s.success);
186
+ const durations = successfulSamples.map((s) => s.durationMs).sort((a, b) => a - b);
191
187
  if (durations.length === 0) {
192
188
  // All calls failed
193
189
  const confidence = calculatePerformanceConfidence(samples);
@@ -213,7 +209,7 @@ export function calculateMetrics(samples) {
213
209
  const minMs = durations[0];
214
210
  const maxMs = durations[durations.length - 1];
215
211
  // Calculate standard deviation
216
- const squaredDiffs = durations.map(d => Math.pow(d - avgMs, 2));
212
+ const squaredDiffs = durations.map((d) => Math.pow(d - avgMs, 2));
217
213
  const avgSquaredDiff = squaredDiffs.reduce((sum, d) => sum + d, 0) / squaredDiffs.length;
218
214
  const stdDevMs = Math.sqrt(avgSquaredDiff);
219
215
  // Calculate confidence from samples
@@ -323,8 +319,8 @@ export function comparePerformance(current, baseline, regressionThreshold = PERF
323
319
  const trend = determineTrend(p50Regression);
324
320
  // Check for regression
325
321
  const maxRegression = baseline.maxAllowedRegression ?? regressionThreshold;
326
- const hasRegression = p50Regression !== null && p50Regression > maxRegression ||
327
- p95Regression !== null && p95Regression > maxRegression;
322
+ const hasRegression = (p50Regression !== null && p50Regression > maxRegression) ||
323
+ (p95Regression !== null && p95Regression > maxRegression);
328
324
  // Determine severity
329
325
  const severity = determinePerformanceSeverity(p50Regression, p95Regression, maxRegression);
330
326
  // Generate summary (include confidence note if low)
@@ -520,10 +516,7 @@ export function formatMetrics(metrics) {
520
516
  * Format performance comparison for display.
521
517
  */
522
518
  export function formatComparison(comparison) {
523
- const lines = [
524
- `Tool: ${comparison.toolName}`,
525
- ` Trend: ${comparison.trend.toUpperCase()}`,
526
- ];
519
+ const lines = [`Tool: ${comparison.toolName}`, ` Trend: ${comparison.trend.toUpperCase()}`];
527
520
  if (comparison.p50RegressionPercent !== null) {
528
521
  const sign = comparison.p50RegressionPercent >= 0 ? '+' : '';
529
522
  lines.push(` p50 change: ${sign}${(comparison.p50RegressionPercent * 100).toFixed(1)}%`);