@dotsetlabs/bellwether 1.0.3 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/CHANGELOG.md +118 -0
  2. package/README.md +17 -2
  3. package/dist/auth/credentials.js +2 -0
  4. package/dist/baseline/accessors.d.ts +1 -1
  5. package/dist/baseline/accessors.js +13 -3
  6. package/dist/baseline/baseline-format.d.ts +335 -0
  7. package/dist/baseline/baseline-format.js +12 -0
  8. package/dist/baseline/comparator.js +494 -13
  9. package/dist/baseline/converter.d.ts +15 -15
  10. package/dist/baseline/converter.js +97 -37
  11. package/dist/baseline/diff.d.ts +1 -1
  12. package/dist/baseline/diff.js +45 -28
  13. package/dist/baseline/error-analyzer.d.ts +1 -1
  14. package/dist/baseline/error-analyzer.js +90 -17
  15. package/dist/baseline/incremental-checker.js +8 -5
  16. package/dist/baseline/index.d.ts +2 -12
  17. package/dist/baseline/index.js +3 -23
  18. package/dist/baseline/performance-tracker.d.ts +0 -1
  19. package/dist/baseline/performance-tracker.js +13 -20
  20. package/dist/baseline/response-fingerprint.js +40 -3
  21. package/dist/baseline/saver.js +75 -10
  22. package/dist/baseline/schema-compare.d.ts +22 -0
  23. package/dist/baseline/schema-compare.js +259 -16
  24. package/dist/baseline/types.d.ts +30 -7
  25. package/dist/cache/response-cache.d.ts +8 -0
  26. package/dist/cache/response-cache.js +119 -2
  27. package/dist/cli/commands/baseline.js +70 -35
  28. package/dist/cli/commands/check.js +71 -15
  29. package/dist/cli/commands/explore.js +69 -16
  30. package/dist/cli/commands/init.js +10 -7
  31. package/dist/cli/commands/watch.js +5 -5
  32. package/dist/cli/index.js +8 -0
  33. package/dist/config/loader.js +2 -2
  34. package/dist/config/template.js +8 -7
  35. package/dist/config/validator.d.ts +59 -59
  36. package/dist/config/validator.js +245 -90
  37. package/dist/constants/core.d.ts +5 -1
  38. package/dist/constants/core.js +9 -20
  39. package/dist/constants/registry.d.ts +17 -0
  40. package/dist/constants/registry.js +18 -0
  41. package/dist/constants/testing.d.ts +0 -369
  42. package/dist/constants/testing.js +18 -456
  43. package/dist/constants.d.ts +1 -1
  44. package/dist/constants.js +1 -1
  45. package/dist/discovery/discovery.js +88 -14
  46. package/dist/discovery/types.d.ts +5 -1
  47. package/dist/docs/agents.js +138 -50
  48. package/dist/docs/contract.js +194 -84
  49. package/dist/docs/report.js +8 -5
  50. package/dist/errors/retry.js +11 -5
  51. package/dist/interview/insights.d.ts +17 -0
  52. package/dist/interview/insights.js +52 -0
  53. package/dist/interview/interviewer.js +52 -10
  54. package/dist/interview/prompt-test-generator.d.ts +12 -0
  55. package/dist/interview/prompt-test-generator.js +77 -0
  56. package/dist/interview/rate-limiter.js +7 -3
  57. package/dist/interview/resource-test-generator.d.ts +12 -0
  58. package/dist/interview/resource-test-generator.js +20 -0
  59. package/dist/interview/schema-inferrer.js +26 -4
  60. package/dist/interview/schema-test-generator.js +278 -31
  61. package/dist/interview/stateful-test-runner.d.ts +3 -0
  62. package/dist/interview/stateful-test-runner.js +80 -0
  63. package/dist/interview/types.d.ts +12 -0
  64. package/dist/llm/anthropic.js +14 -4
  65. package/dist/llm/fallback.d.ts +1 -0
  66. package/dist/llm/fallback.js +7 -1
  67. package/dist/llm/openai.js +15 -4
  68. package/dist/protocol/index.d.ts +2 -0
  69. package/dist/protocol/index.js +2 -0
  70. package/dist/protocol/version-registry.d.ts +66 -0
  71. package/dist/protocol/version-registry.js +159 -0
  72. package/dist/transport/http-transport.d.ts +11 -1
  73. package/dist/transport/http-transport.js +21 -2
  74. package/dist/transport/mcp-client.d.ts +29 -1
  75. package/dist/transport/mcp-client.js +93 -8
  76. package/dist/transport/sse-transport.d.ts +7 -3
  77. package/dist/transport/sse-transport.js +162 -71
  78. package/dist/transport/types.d.ts +134 -1
  79. package/dist/utils/concurrency.d.ts +2 -0
  80. package/dist/utils/concurrency.js +9 -2
  81. package/dist/utils/markdown.js +13 -18
  82. package/dist/utils/timeout.js +2 -1
  83. package/dist/version.js +1 -1
  84. package/man/bellwether.1 +1 -1
  85. package/man/bellwether.1.md +2 -2
  86. package/package.json +1 -1
  87. package/schemas/bellwether-check.schema.json +185 -0
  88. package/schemas/bellwether-explore.schema.json +837 -0
  89. package/scripts/completions/bellwether.bash +10 -4
  90. package/scripts/completions/bellwether.zsh +55 -2
@@ -5,7 +5,8 @@ import { formatDateISO, formatDuration, escapeTableCell, mermaidLabel, validateJ
5
5
  import { smartTruncate, getExampleLength } from '../utils/smart-truncate.js';
6
6
  import { calculatePerformanceMetrics, extractParameters, looksLikeError } from './shared.js';
7
7
  import { analyzeExternalDependencies, formatExternalDependenciesMarkdown, } from '../baseline/external-dependency-detector.js';
8
- import { SEMANTIC_VALIDATION, SCHEMA_EVOLUTION, ERROR_ANALYSIS, PERFORMANCE_CONFIDENCE, DOCUMENTATION_SCORING, EXAMPLE_OUTPUT, EXTERNAL_DEPENDENCIES, RELIABILITY_DISPLAY, CONFIDENCE_INDICATORS, DISPLAY_LIMITS, ISSUE_CLASSIFICATION, } from '../constants.js';
8
+ import { SEMANTIC_VALIDATION, SCHEMA_EVOLUTION, ERROR_ANALYSIS, PERFORMANCE_CONFIDENCE, DOCUMENTATION_SCORING, EXAMPLE_OUTPUT, EXTERNAL_DEPENDENCIES, RELIABILITY_DISPLAY, CONFIDENCE_INDICATORS, DISPLAY_LIMITS, ISSUE_CLASSIFICATION, MCP, } from '../constants.js';
9
+ import { getFeatureFlags } from '../protocol/index.js';
9
10
  /**
10
11
  * Classify issues by their source to help users understand which issues
11
12
  * are actual bugs vs expected behavior or environment issues.
@@ -52,13 +53,15 @@ function classifyIssuesBySource(profiles) {
52
53
  // but tool didn't actually reject - this shouldn't happen with outcomeAssessment.correct check above
53
54
  // so we classify based on expected outcome and error classification
54
55
  // 1. Check for external dependency errors (highest priority for classification)
55
- if (errorClassification && errorClassification.externalServiceErrors > 0 && detectedServices.length > 0) {
56
+ if (errorClassification &&
57
+ errorClassification.externalServiceErrors > 0 &&
58
+ detectedServices.length > 0) {
56
59
  // Check if the error message matches known external service patterns
57
- const isExternalError = detectedServices.some(service => {
60
+ const isExternalError = detectedServices.some((service) => {
58
61
  const serviceConfig = EXTERNAL_DEPENDENCIES.SERVICES[service];
59
62
  if (!serviceConfig)
60
63
  return false;
61
- return serviceConfig.errorPatterns.some(pattern => pattern.test(errorMsg));
64
+ return serviceConfig.errorPatterns.some((pattern) => pattern.test(errorMsg));
62
65
  });
63
66
  if (isExternalError) {
64
67
  issue.service = detectedServices[0];
@@ -68,7 +71,7 @@ function classifyIssuesBySource(profiles) {
68
71
  }
69
72
  // 2. Check for environment configuration errors
70
73
  if (errorClassification && errorClassification.environmentErrors > 0) {
71
- const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
74
+ const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
72
75
  if (isEnvironmentError) {
73
76
  result.environment.push(issue);
74
77
  continue;
@@ -99,7 +102,7 @@ function classifyIssuesBySource(profiles) {
99
102
  continue;
100
103
  }
101
104
  // Check if error message indicates environment issue
102
- const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
105
+ const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
103
106
  if (isEnvironmentError) {
104
107
  result.environment.push(issue);
105
108
  continue;
@@ -145,11 +148,20 @@ export function generateContractMd(result, options) {
145
148
  // Overview
146
149
  lines.push('## Overview');
147
150
  lines.push('');
151
+ const features = getFeatureFlags(discovery.protocolVersion);
148
152
  lines.push(`**Server Version:** ${discovery.serverInfo.version}`);
149
153
  lines.push(`**Protocol Version:** ${discovery.protocolVersion}`);
154
+ if (discovery.protocolVersion !== MCP.PROTOCOL_VERSION) {
155
+ lines.push(`*(Server protocol; bellwether supports up to ${MCP.PROTOCOL_VERSION})*`);
156
+ }
150
157
  lines.push('');
151
158
  const performanceMetrics = calculatePerformanceMetrics(toolProfiles);
152
- const performanceByTool = new Map(performanceMetrics.map(metric => [metric.toolName, metric]));
159
+ const performanceByTool = new Map(performanceMetrics.map((metric) => [metric.toolName, metric]));
160
+ // Server instructions
161
+ if (discovery.instructions) {
162
+ lines.push(`**Server Instructions:** ${discovery.instructions}`);
163
+ lines.push('');
164
+ }
153
165
  // Capabilities summary
154
166
  lines.push('## Capabilities');
155
167
  lines.push('');
@@ -162,6 +174,15 @@ export function generateContractMd(result, options) {
162
174
  if (discovery.capabilities.resources) {
163
175
  lines.push(`- **Resources:** ${(discovery.resources ?? []).length} available`);
164
176
  }
177
+ if (discovery.resourceTemplates && discovery.resourceTemplates.length > 0) {
178
+ lines.push(`- **Resource Templates:** ${discovery.resourceTemplates.length} available`);
179
+ }
180
+ if (discovery.capabilities.completions && features.completions) {
181
+ lines.push('- **Completions:** Supported');
182
+ }
183
+ if (discovery.capabilities.tasks && features.tasks) {
184
+ lines.push('- **Tasks:** Supported');
185
+ }
165
186
  if (discovery.capabilities.logging) {
166
187
  lines.push('- **Logging:** Supported');
167
188
  }
@@ -176,7 +197,7 @@ export function generateContractMd(result, options) {
176
197
  const params = extractParameters(tool.inputSchema);
177
198
  const desc = tool.description?.substring(0, 50) || 'No description';
178
199
  const descDisplay = tool.description && tool.description.length > 50 ? `${desc}...` : desc;
179
- const profile = toolProfiles.find(p => p.name === tool.name);
200
+ const profile = toolProfiles.find((p) => p.name === tool.name);
180
201
  const perf = performanceByTool.get(tool.name);
181
202
  const successRate = calculateToolSuccessRate(profile, {
182
203
  countValidationAsSuccess,
@@ -291,7 +312,7 @@ export function generateContractMd(result, options) {
291
312
  lines.push('## Tools');
292
313
  lines.push('');
293
314
  for (const tool of discovery.tools) {
294
- const profile = toolProfiles.find(p => p.name === tool.name);
315
+ const profile = toolProfiles.find((p) => p.name === tool.name);
295
316
  lines.push(`### ${tool.name}`);
296
317
  lines.push('');
297
318
  lines.push(tool.description || 'No description available.');
@@ -319,6 +340,22 @@ export function generateContractMd(result, options) {
319
340
  lines.push('');
320
341
  }
321
342
  }
343
+ // Show tool annotations (behavioral hints) — version-gated
344
+ if (features.toolAnnotations && tool.annotations) {
345
+ const hints = [];
346
+ if (tool.annotations.readOnlyHint)
347
+ hints.push('read-only');
348
+ if (tool.annotations.destructiveHint)
349
+ hints.push('destructive');
350
+ if (tool.annotations.idempotentHint)
351
+ hints.push('idempotent');
352
+ if (tool.annotations.openWorldHint)
353
+ hints.push('open-world');
354
+ if (hints.length > 0) {
355
+ lines.push(`**Behavioral Hints:** ${hints.join(', ')}`);
356
+ lines.push('');
357
+ }
358
+ }
322
359
  if (tool.inputSchema) {
323
360
  lines.push('**Input Schema:**');
324
361
  const schemaJson = validateJsonForCodeBlock(tool.inputSchema);
@@ -327,6 +364,15 @@ export function generateContractMd(result, options) {
327
364
  lines.push('```');
328
365
  lines.push('');
329
366
  }
367
+ // Show output schema if present — version-gated
368
+ if (features.structuredOutput && tool.outputSchema) {
369
+ lines.push('**Output Schema:**');
370
+ const outputSchemaJson = validateJsonForCodeBlock(tool.outputSchema);
371
+ lines.push('```json');
372
+ lines.push(outputSchemaJson.content);
373
+ lines.push('```');
374
+ lines.push('');
375
+ }
330
376
  // Add example usage from successful interactions
331
377
  const examples = generateToolExamples(profile, maxExamplesPerTool, exampleLength);
332
378
  if (examples.length > 0) {
@@ -378,6 +424,24 @@ export function generateContractMd(result, options) {
378
424
  }
379
425
  }
380
426
  }
427
+ // Resource Templates section
428
+ if (discovery.resourceTemplates && discovery.resourceTemplates.length > 0) {
429
+ lines.push('## Resource Templates');
430
+ lines.push('');
431
+ for (const template of discovery.resourceTemplates) {
432
+ lines.push(`### ${template.name}`);
433
+ lines.push('');
434
+ lines.push(`**URI Template:** \`${template.uriTemplate}\``);
435
+ if (template.mimeType) {
436
+ lines.push(`**MIME Type:** ${template.mimeType}`);
437
+ }
438
+ lines.push('');
439
+ if (template.description) {
440
+ lines.push(template.description);
441
+ lines.push('');
442
+ }
443
+ }
444
+ }
381
445
  // Error Summary section
382
446
  const errorSummary = generateErrorSummarySection(toolProfiles);
383
447
  if (errorSummary.length > 0) {
@@ -397,7 +461,7 @@ function calculateReliabilityMetrics(profile, options) {
397
461
  if (!profile) {
398
462
  return null;
399
463
  }
400
- const interactions = profile.interactions.filter(i => !i.mocked);
464
+ const interactions = profile.interactions.filter((i) => !i.mocked);
401
465
  if (interactions.length === 0) {
402
466
  return null;
403
467
  }
@@ -408,7 +472,7 @@ function calculateReliabilityMetrics(profile, options) {
408
472
  for (const interaction of interactions) {
409
473
  const expected = interaction.question.expectedOutcome ?? 'success';
410
474
  const hasError = interaction.error || interaction.response?.isError;
411
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
475
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
412
476
  const hasErrorText = textContent && 'text' in textContent && looksLikeError(String(textContent.text));
413
477
  const gotError = hasError || hasErrorText;
414
478
  if (expected === 'error') {
@@ -437,7 +501,9 @@ function calculateReliabilityMetrics(profile, options) {
437
501
  const reliabilityRate = total > 0 ? (correctOutcomes / total) * 100 : 0;
438
502
  const happyPathRate = happyPathTotal > 0 ? (happyPathSuccesses / happyPathTotal) * 100 : 100;
439
503
  const validationRate = options.separateValidationMetrics
440
- ? (validationTotal > 0 ? (validationSuccesses / validationTotal) * 100 : 100)
504
+ ? validationTotal > 0
505
+ ? (validationSuccesses / validationTotal) * 100
506
+ : 100
441
507
  : 100;
442
508
  return {
443
509
  total,
@@ -481,8 +547,7 @@ function formatConfidenceIndicator(level) {
481
547
  function generateTransportIssuesSection(transportErrors, warnings) {
482
548
  const lines = [];
483
549
  // Skip if no transport issues to report
484
- if ((!transportErrors || transportErrors.length === 0) &&
485
- (!warnings || warnings.length === 0)) {
550
+ if ((!transportErrors || transportErrors.length === 0) && (!warnings || warnings.length === 0)) {
486
551
  return lines;
487
552
  }
488
553
  lines.push('## Transport Issues');
@@ -507,8 +572,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
507
572
  lines.push('The following transport-level errors were detected during server communication:');
508
573
  lines.push('');
509
574
  // Categorize errors
510
- const serverBugErrors = transportErrors.filter(e => e.likelyServerBug);
511
- const envErrors = transportErrors.filter(e => !e.likelyServerBug);
575
+ const serverBugErrors = transportErrors.filter((e) => e.likelyServerBug);
576
+ const envErrors = transportErrors.filter((e) => !e.likelyServerBug);
512
577
  // Server bugs (critical)
513
578
  if (serverBugErrors.length > 0) {
514
579
  lines.push('#### Likely Server Bugs');
@@ -548,8 +613,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
548
613
  lines.push('');
549
614
  }
550
615
  // Recommendations
551
- const hasInvalidJson = transportErrors.some(e => e.category === 'invalid_json');
552
- const hasProtocolError = transportErrors.some(e => e.category === 'protocol_violation');
616
+ const hasInvalidJson = transportErrors.some((e) => e.category === 'invalid_json');
617
+ const hasProtocolError = transportErrors.some((e) => e.category === 'protocol_violation');
553
618
  if (hasInvalidJson || hasProtocolError) {
554
619
  lines.push('### Recommendations');
555
620
  lines.push('');
@@ -607,7 +672,7 @@ function generateMetricsLegendSection() {
607
672
  }
608
673
  function generateValidationTestingSection(profiles) {
609
674
  const lines = [];
610
- const validationSummary = profiles.map(profile => {
675
+ const validationSummary = profiles.map((profile) => {
611
676
  const buckets = {
612
677
  input: summarizeValidationBucket(profile, 'input'),
613
678
  type: summarizeValidationBucket(profile, 'type'),
@@ -615,7 +680,7 @@ function generateValidationTestingSection(profiles) {
615
680
  };
616
681
  return { profile, buckets };
617
682
  });
618
- const hasValidationTests = validationSummary.some(summary => Object.values(summary.buckets).some(bucket => bucket.total > 0));
683
+ const hasValidationTests = validationSummary.some((summary) => Object.values(summary.buckets).some((bucket) => bucket.total > 0));
619
684
  if (!hasValidationTests) {
620
685
  return lines;
621
686
  }
@@ -668,8 +733,8 @@ function generateIssuesDetectedSection(profiles) {
668
733
  lines.push(`### ${ISSUE_CLASSIFICATION.ICONS.serverBug} ${ISSUE_CLASSIFICATION.HEADERS.serverBug}`);
669
734
  lines.push('');
670
735
  // Separate critical (accepts invalid input) from other bugs
671
- const criticalBugs = classified.serverBug.filter(i => i.critical);
672
- const otherBugs = classified.serverBug.filter(i => !i.critical);
736
+ const criticalBugs = classified.serverBug.filter((i) => i.critical);
737
+ const otherBugs = classified.serverBug.filter((i) => !i.critical);
673
738
  if (criticalBugs.length > 0) {
674
739
  lines.push('**Critical - Accepts Invalid Input:**');
675
740
  for (const issue of criticalBugs.slice(0, DISPLAY_LIMITS.ISSUES_DISPLAY_LIMIT)) {
@@ -809,7 +874,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
809
874
  return [];
810
875
  }
811
876
  // Only show if we have meaningful data
812
- const hasValidMetrics = metrics.some(m => m.callCount >= 2);
877
+ const hasValidMetrics = metrics.some((m) => m.callCount >= 2);
813
878
  if (!hasValidMetrics) {
814
879
  return [];
815
880
  }
@@ -830,11 +895,11 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
830
895
  }
831
896
  lines.push('');
832
897
  // Show low confidence warning if any tools have low confidence
833
- const lowConfidenceTools = metrics.filter(m => m.confidence?.confidenceLevel === 'low');
898
+ const lowConfidenceTools = metrics.filter((m) => m.confidence?.confidenceLevel === 'low');
834
899
  if (lowConfidenceTools.length > 0) {
835
900
  // Categorize low confidence by reason
836
- const lowSampleTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
837
- const highVariabilityTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
901
+ const lowSampleTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
902
+ const highVariabilityTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
838
903
  (m.confidence?.coefficientOfVariation ?? 0) > PERFORMANCE_CONFIDENCE.MEDIUM.MAX_CV);
839
904
  lines.push(`> **⚠️ Low Confidence**: ${lowConfidenceTools.length} tool(s) have low statistical confidence.`);
840
905
  if (lowSampleTools.length > 0) {
@@ -847,7 +912,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
847
912
  lines.push('');
848
913
  }
849
914
  // Add confidence summary section (collapsed)
850
- const hasConfidenceData = metrics.some(m => m.confidence);
915
+ const hasConfidenceData = metrics.some((m) => m.confidence);
851
916
  if (hasConfidenceData) {
852
917
  lines.push('<details>');
853
918
  lines.push('<summary>Confidence Metrics Details</summary>');
@@ -867,7 +932,9 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
867
932
  // In this case, display ~0% to indicate the variability is below measurement threshold
868
933
  const rawCV = m.confidence.coefficientOfVariation * 100;
869
934
  const cvDisplay = successfulSamples > 0
870
- ? (roundedStdDev === 0 && rawCV > 1 ? '~0%' : `${rawCV.toFixed(1)}%`)
935
+ ? roundedStdDev === 0 && rawCV > 1
936
+ ? '~0%'
937
+ : `${rawCV.toFixed(1)}%`
871
938
  : 'N/A';
872
939
  const levelLabel = PERFORMANCE_CONFIDENCE.LABELS[m.confidence.confidenceLevel];
873
940
  lines.push(`| \`${escapeTableCell(m.toolName)}\` | ${successfulSamples} | ${validationSamples} | ${totalTests} | ${stdDevDisplay} | ${cvDisplay} | ${levelLabel} |`);
@@ -931,11 +998,11 @@ function generateContractSecuritySection(fingerprints) {
931
998
  lines.push(`| Average Risk Score | ${avgRiskScore}/100 |`);
932
999
  // Count by severity
933
1000
  const bySeverity = {
934
- critical: allFindings.filter(f => f.riskLevel === 'critical').length,
935
- high: allFindings.filter(f => f.riskLevel === 'high').length,
936
- medium: allFindings.filter(f => f.riskLevel === 'medium').length,
937
- low: allFindings.filter(f => f.riskLevel === 'low').length,
938
- info: allFindings.filter(f => f.riskLevel === 'info').length,
1001
+ critical: allFindings.filter((f) => f.riskLevel === 'critical').length,
1002
+ high: allFindings.filter((f) => f.riskLevel === 'high').length,
1003
+ medium: allFindings.filter((f) => f.riskLevel === 'medium').length,
1004
+ low: allFindings.filter((f) => f.riskLevel === 'low').length,
1005
+ info: allFindings.filter((f) => f.riskLevel === 'info').length,
939
1006
  };
940
1007
  if (bySeverity.critical > 0) {
941
1008
  lines.push(`| Critical Findings | ${bySeverity.critical} |`);
@@ -954,7 +1021,7 @@ function generateContractSecuritySection(fingerprints) {
954
1021
  return lines;
955
1022
  }
956
1023
  // Show findings by severity
957
- const criticalAndHigh = allFindings.filter(f => f.riskLevel === 'critical' || f.riskLevel === 'high');
1024
+ const criticalAndHigh = allFindings.filter((f) => f.riskLevel === 'critical' || f.riskLevel === 'high');
958
1025
  if (criticalAndHigh.length > 0) {
959
1026
  lines.push('### Critical and High Severity Findings');
960
1027
  lines.push('');
@@ -987,7 +1054,7 @@ function generateContractSecuritySection(fingerprints) {
987
1054
  lines.push('');
988
1055
  }
989
1056
  // Show medium/low findings in collapsed section
990
- const mediumAndLow = allFindings.filter(f => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
1057
+ const mediumAndLow = allFindings.filter((f) => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
991
1058
  if (mediumAndLow.length > 0) {
992
1059
  lines.push('<details>');
993
1060
  lines.push(`<summary>Medium/Low Severity Findings (${mediumAndLow.length})</summary>`);
@@ -1027,10 +1094,10 @@ function generateWorkflowTestingSection(results) {
1027
1094
  if (results.length === 0) {
1028
1095
  return [];
1029
1096
  }
1030
- const passed = results.filter(r => r.success).length;
1097
+ const passed = results.filter((r) => r.success).length;
1031
1098
  const failed = results.length - passed;
1032
1099
  const totalSteps = results.reduce((sum, r) => sum + r.workflow.steps.length, 0);
1033
- const passedSteps = results.reduce((sum, r) => sum + r.steps.filter(s => s.success).length, 0);
1100
+ const passedSteps = results.reduce((sum, r) => sum + r.steps.filter((s) => s.success).length, 0);
1034
1101
  const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
1035
1102
  lines.push('## Workflow Testing');
1036
1103
  lines.push('');
@@ -1053,7 +1120,7 @@ function generateWorkflowTestingSection(results) {
1053
1120
  lines.push('|----------|--------|-------|----------|');
1054
1121
  for (const result of results) {
1055
1122
  const status = result.success ? '✓ Passed' : '✗ Failed';
1056
- const stepsInfo = `${result.steps.filter(s => s.success).length}/${result.workflow.steps.length}`;
1123
+ const stepsInfo = `${result.steps.filter((s) => s.success).length}/${result.workflow.steps.length}`;
1057
1124
  const duration = formatDuration(result.durationMs);
1058
1125
  lines.push(`| ${escapeTableCell(result.workflow.name)} | ${status} | ${stepsInfo} | ${duration} |`);
1059
1126
  }
@@ -1083,8 +1150,8 @@ function generateWorkflowTestingSection(results) {
1083
1150
  if (stepResult.error) {
1084
1151
  notes = escapeTableCell(truncateString(stepResult.error, 40));
1085
1152
  }
1086
- else if (stepResult.assertionResults?.some(a => !a.passed)) {
1087
- const failedAssertions = stepResult.assertionResults.filter(a => !a.passed);
1153
+ else if (stepResult.assertionResults?.some((a) => !a.passed)) {
1154
+ const failedAssertions = stepResult.assertionResults.filter((a) => !a.passed);
1088
1155
  notes = `${failedAssertions.length} assertion(s) failed`;
1089
1156
  }
1090
1157
  }
@@ -1177,17 +1244,17 @@ function generateSemanticTypesSection(inferences) {
1177
1244
  byType.set(inf.inferredType, existing);
1178
1245
  }
1179
1246
  // Sort by number of parameters (most common types first)
1180
- const sortedTypes = Array.from(byType.entries())
1181
- .sort((a, b) => b[1].length - a[1].length);
1247
+ const sortedTypes = Array.from(byType.entries()).sort((a, b) => b[1].length - a[1].length);
1182
1248
  lines.push('| Type | Parameters | Expected Format |');
1183
1249
  lines.push('|------|------------|-----------------|');
1184
1250
  for (const [type, params] of sortedTypes) {
1185
1251
  const displayName = SEMANTIC_VALIDATION.TYPE_DISPLAY_NAMES[type] ?? type;
1186
- const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ?? '';
1252
+ const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ??
1253
+ '';
1187
1254
  // Format parameters as tool.param
1188
1255
  const paramList = params
1189
1256
  .slice(0, 3)
1190
- .map(p => `\`${p.toolName}.${p.paramName}\``)
1257
+ .map((p) => `\`${p.toolName}.${p.paramName}\``)
1191
1258
  .join(', ');
1192
1259
  const moreCount = params.length > 3 ? ` +${params.length - 3} more` : '';
1193
1260
  lines.push(`| ${displayName} | ${paramList}${moreCount} | \`${exampleValue}\` |`);
@@ -1246,9 +1313,10 @@ function generateSchemaStabilitySection(schemaEvolution) {
1246
1313
  lines.push('Response schema consistency metrics for tools with sufficient test samples:');
1247
1314
  lines.push('');
1248
1315
  // Summary stats
1249
- const stableCount = toolsWithSchemas.filter(t => t.evolution.isStable).length;
1316
+ const stableCount = toolsWithSchemas.filter((t) => t.evolution.isStable).length;
1250
1317
  const unstableCount = toolsWithSchemas.length - stableCount;
1251
- const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) / toolsWithSchemas.length;
1318
+ const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) /
1319
+ toolsWithSchemas.length;
1252
1320
  lines.push('| Metric | Value |');
1253
1321
  lines.push('|--------|-------|');
1254
1322
  lines.push(`| Tools Analyzed | ${toolsWithSchemas.length} |`);
@@ -1269,7 +1337,7 @@ function generateSchemaStabilitySection(schemaEvolution) {
1269
1337
  lines.push('| Tool | Grade | Stability | Confidence | Samples | Issues |');
1270
1338
  lines.push('|------|-------|-----------|------------|---------|--------|');
1271
1339
  // Sort by grade (worst first, then by name)
1272
- const gradeOrder = { 'F': 0, 'D': 1, 'C': 2, 'B': 3, 'A': 4, 'N/A': 5 };
1340
+ const gradeOrder = { F: 0, D: 1, C: 2, B: 3, A: 4, 'N/A': 5 };
1273
1341
  const sortedTools = [...toolsWithSchemas].sort((a, b) => {
1274
1342
  const gradeCompare = gradeOrder[a.grade] - gradeOrder[b.grade];
1275
1343
  if (gradeCompare !== 0)
@@ -1284,13 +1352,15 @@ function generateSchemaStabilitySection(schemaEvolution) {
1284
1352
  const confidenceDisplay = `${Math.round(evolution.stabilityConfidence * 100)}%`;
1285
1353
  const issues = evolution.inconsistentFields.length > 0
1286
1354
  ? evolution.inconsistentFields.slice(0, 2).join(', ') +
1287
- (evolution.inconsistentFields.length > 2 ? ` +${evolution.inconsistentFields.length - 2}` : '')
1355
+ (evolution.inconsistentFields.length > 2
1356
+ ? ` +${evolution.inconsistentFields.length - 2}`
1357
+ : '')
1288
1358
  : '-';
1289
1359
  lines.push(`| \`${escapeTableCell(name)}\` | ${gradeEmoji} ${grade} | ${stabilityStatus} | ${confidenceDisplay} | ${evolution.sampleCount} | ${escapeTableCell(issues)} |`);
1290
1360
  }
1291
1361
  lines.push('');
1292
1362
  // Detailed breakdown for unstable tools
1293
- const unstableTools = sortedTools.filter(t => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
1363
+ const unstableTools = sortedTools.filter((t) => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
1294
1364
  if (unstableTools.length > 0) {
1295
1365
  lines.push('<details>');
1296
1366
  lines.push('<summary>Unstable Schema Details</summary>');
@@ -1327,12 +1397,18 @@ function generateSchemaStabilitySection(schemaEvolution) {
1327
1397
  */
1328
1398
  function getGradeEmoji(grade) {
1329
1399
  switch (grade) {
1330
- case 'A': return '🟢';
1331
- case 'B': return '🟢';
1332
- case 'C': return '🟡';
1333
- case 'D': return '🟠';
1334
- case 'F': return '🔴';
1335
- case 'N/A': return '';
1400
+ case 'A':
1401
+ return '🟢';
1402
+ case 'B':
1403
+ return '🟢';
1404
+ case 'C':
1405
+ return '🟡';
1406
+ case 'D':
1407
+ return '🟠';
1408
+ case 'F':
1409
+ return '🔴';
1410
+ case 'N/A':
1411
+ return '⚪';
1336
1412
  }
1337
1413
  }
1338
1414
  /**
@@ -1360,7 +1436,8 @@ function generateErrorAnalysisSection(summaries) {
1360
1436
  const allCategories = new Set();
1361
1437
  const transientCount = toolsWithErrors.reduce((sum, t) => sum + t.summary.transientErrors, 0);
1362
1438
  for (const { summary } of toolsWithErrors) {
1363
- for (const cat of summary.categoryCounts.keys()) {
1439
+ const counts = normalizeCategoryCounts(summary.categoryCounts);
1440
+ for (const cat of counts.keys()) {
1364
1441
  allCategories.add(cat);
1365
1442
  }
1366
1443
  }
@@ -1374,7 +1451,8 @@ function generateErrorAnalysisSection(summaries) {
1374
1451
  // Overall error breakdown by category
1375
1452
  const globalCategoryCounts = new Map();
1376
1453
  for (const { summary } of toolsWithErrors) {
1377
- for (const [cat, count] of summary.categoryCounts) {
1454
+ const counts = normalizeCategoryCounts(summary.categoryCounts);
1455
+ for (const [cat, count] of counts) {
1378
1456
  globalCategoryCounts.set(cat, (globalCategoryCounts.get(cat) ?? 0) + count);
1379
1457
  }
1380
1458
  }
@@ -1384,10 +1462,10 @@ function generateErrorAnalysisSection(summaries) {
1384
1462
  lines.push('| Category | Count | Description |');
1385
1463
  lines.push('|----------|-------|-------------|');
1386
1464
  // Sort by count descending
1387
- const sortedCategories = Array.from(globalCategoryCounts.entries())
1388
- .sort((a, b) => b[1] - a[1]);
1465
+ const sortedCategories = Array.from(globalCategoryCounts.entries()).sort((a, b) => b[1] - a[1]);
1389
1466
  for (const [category, count] of sortedCategories) {
1390
- const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ?? category;
1467
+ const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ??
1468
+ category;
1391
1469
  const emoji = getCategoryEmoji(category);
1392
1470
  lines.push(`| ${emoji} ${label} | ${count} | ${escapeTableCell(formatCategoryDescription(category))} |`);
1393
1471
  }
@@ -1406,14 +1484,12 @@ function generateErrorAnalysisSection(summaries) {
1406
1484
  ? (ERROR_ANALYSIS.CATEGORY_LABELS[topCategory] ?? topCategory)
1407
1485
  : '-';
1408
1486
  const topRemediation = summary.topRemediations[0] ?? '-';
1409
- const truncatedRemediation = topRemediation.length > 50
1410
- ? `${topRemediation.slice(0, 47)}...`
1411
- : topRemediation;
1487
+ const truncatedRemediation = topRemediation.length > 50 ? `${topRemediation.slice(0, 47)}...` : topRemediation;
1412
1488
  lines.push(`| \`${escapeTableCell(name)}\` | ${summary.totalErrors} | ${summary.transientErrors} | ${topCategoryLabel} | ${escapeTableCell(truncatedRemediation)} |`);
1413
1489
  }
1414
1490
  lines.push('');
1415
1491
  // Detailed remediation suggestions (collapsed)
1416
- const toolsWithRemediations = sortedTools.filter(t => t.summary.topRemediations.length > 0);
1492
+ const toolsWithRemediations = sortedTools.filter((t) => t.summary.topRemediations.length > 0);
1417
1493
  if (toolsWithRemediations.length > 0) {
1418
1494
  lines.push('<details>');
1419
1495
  lines.push('<summary>Remediation Suggestions</summary>');
@@ -1463,13 +1539,20 @@ function generateErrorAnalysisSection(summaries) {
1463
1539
  */
1464
1540
  function getCategoryEmoji(category) {
1465
1541
  switch (category) {
1466
- case 'client_error_validation': return '⚠️';
1467
- case 'client_error_auth': return '🔐';
1468
- case 'client_error_not_found': return '🔍';
1469
- case 'client_error_conflict': return '💥';
1470
- case 'client_error_rate_limit': return '⏱️';
1471
- case 'server_error': return '🔥';
1472
- default: return '';
1542
+ case 'client_error_validation':
1543
+ return '⚠️';
1544
+ case 'client_error_auth':
1545
+ return '🔐';
1546
+ case 'client_error_not_found':
1547
+ return '🔍';
1548
+ case 'client_error_conflict':
1549
+ return '💥';
1550
+ case 'client_error_rate_limit':
1551
+ return '⏱️';
1552
+ case 'server_error':
1553
+ return '🔥';
1554
+ default:
1555
+ return '❓';
1473
1556
  }
1474
1557
  }
1475
1558
  /**
@@ -1493,13 +1576,30 @@ function formatCategoryDescription(category) {
1493
1576
  return 'Unknown error category';
1494
1577
  }
1495
1578
  }
1579
+ /**
1580
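+ * Normalize category counts into a Map: a Map is returned as-is, a plain object is converted using only its numeric-valued entries, and any other input yields an empty Map.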
1581
+ */
1582
+ function normalizeCategoryCounts(counts) {
1583
+ if (!counts) {
1584
+ return new Map();
1585
+ }
1586
+ if (counts instanceof Map) {
1587
+ return counts;
1588
+ }
1589
+ if (typeof counts !== 'object') {
1590
+ return new Map();
1591
+ }
1592
+ const entries = Object.entries(counts).filter((entry) => typeof entry[1] === 'number');
1593
+ return new Map(entries);
1594
+ }
1496
1595
  /**
1497
1596
  * Get the top category from a category counts map.
1498
1597
  */
1499
1598
  function getTopCategory(counts) {
1599
+ const normalized = normalizeCategoryCounts(counts);
1500
1600
  let topCategory;
1501
1601
  let topCount = 0;
1502
- for (const [category, count] of counts) {
1602
+ for (const [category, count] of normalized) {
1503
1603
  if (count > topCount) {
1504
1604
  topCount = count;
1505
1605
  topCategory = category;
@@ -1601,7 +1701,10 @@ function formatIssueTypeLabel(type) {
1601
1701
  case 'no_examples':
1602
1702
  return 'No Examples';
1603
1703
  default:
1604
- return type.split('_').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
1704
+ return type
1705
+ .split('_')
1706
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
1707
+ .join(' ');
1605
1708
  }
1606
1709
  }
1607
1710
  /**
@@ -1617,10 +1720,10 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
1617
1720
  return [];
1618
1721
  }
1619
1722
  // Find successful interactions
1620
- const successful = profile.interactions.filter(i => {
1723
+ const successful = profile.interactions.filter((i) => {
1621
1724
  if (i.error || i.response?.isError)
1622
1725
  return false;
1623
- const textContent = i.response?.content?.find(c => c.type === 'text');
1726
+ const textContent = i.response?.content?.find((c) => c.type === 'text');
1624
1727
  if (textContent && 'text' in textContent) {
1625
1728
  if (looksLikeError(String(textContent.text)))
1626
1729
  return false;
@@ -1640,7 +1743,7 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
1640
1743
  if (seenArgsHashes.has(argsHash))
1641
1744
  continue;
1642
1745
  seenArgsHashes.add(argsHash);
1643
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1746
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1644
1747
  if (!textContent || !('text' in textContent))
1645
1748
  continue;
1646
1749
  const responseText = String(textContent.text);
@@ -1696,7 +1799,7 @@ function generateToolErrorPatterns(profile) {
1696
1799
  continue;
1697
1800
  }
1698
1801
  const errorText = interaction.error || '';
1699
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1802
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1700
1803
  const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
1701
1804
  const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
1702
1805
  if (!isError)
@@ -1706,7 +1809,8 @@ function generateToolErrorPatterns(profile) {
1706
1809
  continue;
1707
1810
  const category = categorizeError(errorContent);
1708
1811
  const existing = errorCategories.get(category) || [];
1709
- if (existing.length < 2) { // Max 2 examples per category
1812
+ if (existing.length < 2) {
1813
+ // Max 2 examples per category
1710
1814
  const truncated = errorContent.length > 100 ? `${errorContent.slice(0, 97)}...` : errorContent;
1711
1815
  existing.push(truncated);
1712
1816
  }
@@ -1758,7 +1862,7 @@ function generateErrorSummarySection(profiles) {
1758
1862
  continue;
1759
1863
  }
1760
1864
  const errorText = interaction.error || '';
1761
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1865
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1762
1866
  const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
1763
1867
  const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
1764
1868
  if (!isError)
@@ -1771,7 +1875,8 @@ function generateErrorSummarySection(profiles) {
1771
1875
  existing.count++;
1772
1876
  existing.tools.add(profile.name);
1773
1877
  if (!existing.example) {
1774
- existing.example = errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
1878
+ existing.example =
1879
+ errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
1775
1880
  }
1776
1881
  categoryCounts.set(category, existing);
1777
1882
  }
@@ -1786,7 +1891,10 @@ function generateErrorSummarySection(profiles) {
1786
1891
  lines.push('| Category | Count | Affected Tools |');
1787
1892
  lines.push('|----------|-------|----------------|');
1788
1893
  for (const [category, data] of categoryCounts) {
1789
- const toolList = Array.from(data.tools).slice(0, 3).map(t => `\`${t}\``).join(', ');
1894
+ const toolList = Array.from(data.tools)
1895
+ .slice(0, 3)
1896
+ .map((t) => `\`${t}\``)
1897
+ .join(', ');
1790
1898
  const more = data.tools.size > 3 ? ` +${data.tools.size - 3} more` : '';
1791
1899
  lines.push(`| ${category} | ${data.count} | ${toolList}${more} |`);
1792
1900
  }
@@ -1813,7 +1921,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
1813
1921
  continue;
1814
1922
  }
1815
1923
  const errorText = interaction.error || '';
1816
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1924
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1817
1925
  const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
1818
1926
  const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
1819
1927
  if (!isError)
@@ -1843,7 +1951,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
1843
1951
  });
1844
1952
  }
1845
1953
  if (patterns.length > 0) {
1846
- const tool = tools.find(t => t.name === profile.name);
1954
+ const tool = tools.find((t) => t.name === profile.name);
1847
1955
  errorInputs.push({
1848
1956
  toolName: profile.name,
1849
1957
  toolDescription: tool?.description,
@@ -1962,7 +2070,9 @@ function collectAssertionFailures(profile) {
1962
2070
  for (const result of interaction.assertionResults ?? []) {
1963
2071
  if (result.passed)
1964
2072
  continue;
1965
- const message = result.message ? `${result.type}: ${result.message}` : `${result.type} failed`;
2073
+ const message = result.message
2074
+ ? `${result.type}: ${result.message}`
2075
+ : `${result.type} failed`;
1966
2076
  failures.add(message);
1967
2077
  }
1968
2078
  }