@dotsetlabs/bellwether 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/CHANGELOG.md +74 -0
  2. package/README.md +8 -2
  3. package/dist/baseline/accessors.d.ts +1 -1
  4. package/dist/baseline/accessors.js +1 -3
  5. package/dist/baseline/baseline-format.d.ts +287 -0
  6. package/dist/baseline/baseline-format.js +12 -0
  7. package/dist/baseline/comparator.js +249 -11
  8. package/dist/baseline/converter.d.ts +15 -15
  9. package/dist/baseline/converter.js +46 -34
  10. package/dist/baseline/diff.d.ts +1 -1
  11. package/dist/baseline/diff.js +45 -28
  12. package/dist/baseline/error-analyzer.d.ts +1 -1
  13. package/dist/baseline/error-analyzer.js +90 -17
  14. package/dist/baseline/incremental-checker.js +8 -5
  15. package/dist/baseline/index.d.ts +2 -12
  16. package/dist/baseline/index.js +3 -23
  17. package/dist/baseline/performance-tracker.d.ts +0 -1
  18. package/dist/baseline/performance-tracker.js +13 -20
  19. package/dist/baseline/response-fingerprint.js +39 -2
  20. package/dist/baseline/saver.js +41 -10
  21. package/dist/baseline/schema-compare.d.ts +22 -0
  22. package/dist/baseline/schema-compare.js +259 -16
  23. package/dist/baseline/types.d.ts +10 -7
  24. package/dist/cache/response-cache.d.ts +8 -0
  25. package/dist/cache/response-cache.js +110 -0
  26. package/dist/cli/commands/check.js +23 -6
  27. package/dist/cli/commands/explore.js +34 -14
  28. package/dist/cli/index.js +8 -0
  29. package/dist/config/template.js +8 -7
  30. package/dist/config/validator.d.ts +59 -59
  31. package/dist/config/validator.js +245 -90
  32. package/dist/constants/core.d.ts +4 -0
  33. package/dist/constants/core.js +8 -19
  34. package/dist/constants/registry.d.ts +17 -0
  35. package/dist/constants/registry.js +18 -0
  36. package/dist/constants/testing.d.ts +0 -369
  37. package/dist/constants/testing.js +18 -456
  38. package/dist/constants.d.ts +1 -1
  39. package/dist/constants.js +1 -1
  40. package/dist/docs/contract.js +131 -83
  41. package/dist/docs/report.js +8 -5
  42. package/dist/interview/insights.d.ts +17 -0
  43. package/dist/interview/insights.js +52 -0
  44. package/dist/interview/interviewer.js +52 -10
  45. package/dist/interview/prompt-test-generator.d.ts +12 -0
  46. package/dist/interview/prompt-test-generator.js +77 -0
  47. package/dist/interview/resource-test-generator.d.ts +12 -0
  48. package/dist/interview/resource-test-generator.js +20 -0
  49. package/dist/interview/schema-inferrer.js +26 -4
  50. package/dist/interview/schema-test-generator.js +278 -31
  51. package/dist/interview/stateful-test-runner.d.ts +3 -0
  52. package/dist/interview/stateful-test-runner.js +80 -0
  53. package/dist/interview/types.d.ts +12 -0
  54. package/dist/transport/mcp-client.js +1 -1
  55. package/dist/transport/sse-transport.d.ts +7 -3
  56. package/dist/transport/sse-transport.js +157 -67
  57. package/dist/version.js +1 -1
  58. package/man/bellwether.1 +1 -1
  59. package/man/bellwether.1.md +2 -2
  60. package/package.json +1 -1
  61. package/schemas/bellwether-check.schema.json +185 -0
  62. package/schemas/bellwether-explore.schema.json +837 -0
  63. package/scripts/completions/bellwether.bash +10 -4
  64. package/scripts/completions/bellwether.zsh +55 -2
@@ -52,13 +52,15 @@ function classifyIssuesBySource(profiles) {
52
52
  // but tool didn't actually reject - this shouldn't happen with outcomeAssessment.correct check above
53
53
  // so we classify based on expected outcome and error classification
54
54
  // 1. Check for external dependency errors (highest priority for classification)
55
- if (errorClassification && errorClassification.externalServiceErrors > 0 && detectedServices.length > 0) {
55
+ if (errorClassification &&
56
+ errorClassification.externalServiceErrors > 0 &&
57
+ detectedServices.length > 0) {
56
58
  // Check if the error message matches known external service patterns
57
- const isExternalError = detectedServices.some(service => {
59
+ const isExternalError = detectedServices.some((service) => {
58
60
  const serviceConfig = EXTERNAL_DEPENDENCIES.SERVICES[service];
59
61
  if (!serviceConfig)
60
62
  return false;
61
- return serviceConfig.errorPatterns.some(pattern => pattern.test(errorMsg));
63
+ return serviceConfig.errorPatterns.some((pattern) => pattern.test(errorMsg));
62
64
  });
63
65
  if (isExternalError) {
64
66
  issue.service = detectedServices[0];
@@ -68,7 +70,7 @@ function classifyIssuesBySource(profiles) {
68
70
  }
69
71
  // 2. Check for environment configuration errors
70
72
  if (errorClassification && errorClassification.environmentErrors > 0) {
71
- const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
73
+ const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
72
74
  if (isEnvironmentError) {
73
75
  result.environment.push(issue);
74
76
  continue;
@@ -99,7 +101,7 @@ function classifyIssuesBySource(profiles) {
99
101
  continue;
100
102
  }
101
103
  // Check if error message indicates environment issue
102
- const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
104
+ const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
103
105
  if (isEnvironmentError) {
104
106
  result.environment.push(issue);
105
107
  continue;
@@ -149,7 +151,7 @@ export function generateContractMd(result, options) {
149
151
  lines.push(`**Protocol Version:** ${discovery.protocolVersion}`);
150
152
  lines.push('');
151
153
  const performanceMetrics = calculatePerformanceMetrics(toolProfiles);
152
- const performanceByTool = new Map(performanceMetrics.map(metric => [metric.toolName, metric]));
154
+ const performanceByTool = new Map(performanceMetrics.map((metric) => [metric.toolName, metric]));
153
155
  // Capabilities summary
154
156
  lines.push('## Capabilities');
155
157
  lines.push('');
@@ -176,7 +178,7 @@ export function generateContractMd(result, options) {
176
178
  const params = extractParameters(tool.inputSchema);
177
179
  const desc = tool.description?.substring(0, 50) || 'No description';
178
180
  const descDisplay = tool.description && tool.description.length > 50 ? `${desc}...` : desc;
179
- const profile = toolProfiles.find(p => p.name === tool.name);
181
+ const profile = toolProfiles.find((p) => p.name === tool.name);
180
182
  const perf = performanceByTool.get(tool.name);
181
183
  const successRate = calculateToolSuccessRate(profile, {
182
184
  countValidationAsSuccess,
@@ -291,7 +293,7 @@ export function generateContractMd(result, options) {
291
293
  lines.push('## Tools');
292
294
  lines.push('');
293
295
  for (const tool of discovery.tools) {
294
- const profile = toolProfiles.find(p => p.name === tool.name);
296
+ const profile = toolProfiles.find((p) => p.name === tool.name);
295
297
  lines.push(`### ${tool.name}`);
296
298
  lines.push('');
297
299
  lines.push(tool.description || 'No description available.');
@@ -397,7 +399,7 @@ function calculateReliabilityMetrics(profile, options) {
397
399
  if (!profile) {
398
400
  return null;
399
401
  }
400
- const interactions = profile.interactions.filter(i => !i.mocked);
402
+ const interactions = profile.interactions.filter((i) => !i.mocked);
401
403
  if (interactions.length === 0) {
402
404
  return null;
403
405
  }
@@ -408,7 +410,7 @@ function calculateReliabilityMetrics(profile, options) {
408
410
  for (const interaction of interactions) {
409
411
  const expected = interaction.question.expectedOutcome ?? 'success';
410
412
  const hasError = interaction.error || interaction.response?.isError;
411
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
413
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
412
414
  const hasErrorText = textContent && 'text' in textContent && looksLikeError(String(textContent.text));
413
415
  const gotError = hasError || hasErrorText;
414
416
  if (expected === 'error') {
@@ -437,7 +439,9 @@ function calculateReliabilityMetrics(profile, options) {
437
439
  const reliabilityRate = total > 0 ? (correctOutcomes / total) * 100 : 0;
438
440
  const happyPathRate = happyPathTotal > 0 ? (happyPathSuccesses / happyPathTotal) * 100 : 100;
439
441
  const validationRate = options.separateValidationMetrics
440
- ? (validationTotal > 0 ? (validationSuccesses / validationTotal) * 100 : 100)
442
+ ? validationTotal > 0
443
+ ? (validationSuccesses / validationTotal) * 100
444
+ : 100
441
445
  : 100;
442
446
  return {
443
447
  total,
@@ -481,8 +485,7 @@ function formatConfidenceIndicator(level) {
481
485
  function generateTransportIssuesSection(transportErrors, warnings) {
482
486
  const lines = [];
483
487
  // Skip if no transport issues to report
484
- if ((!transportErrors || transportErrors.length === 0) &&
485
- (!warnings || warnings.length === 0)) {
488
+ if ((!transportErrors || transportErrors.length === 0) && (!warnings || warnings.length === 0)) {
486
489
  return lines;
487
490
  }
488
491
  lines.push('## Transport Issues');
@@ -507,8 +510,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
507
510
  lines.push('The following transport-level errors were detected during server communication:');
508
511
  lines.push('');
509
512
  // Categorize errors
510
- const serverBugErrors = transportErrors.filter(e => e.likelyServerBug);
511
- const envErrors = transportErrors.filter(e => !e.likelyServerBug);
513
+ const serverBugErrors = transportErrors.filter((e) => e.likelyServerBug);
514
+ const envErrors = transportErrors.filter((e) => !e.likelyServerBug);
512
515
  // Server bugs (critical)
513
516
  if (serverBugErrors.length > 0) {
514
517
  lines.push('#### Likely Server Bugs');
@@ -548,8 +551,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
548
551
  lines.push('');
549
552
  }
550
553
  // Recommendations
551
- const hasInvalidJson = transportErrors.some(e => e.category === 'invalid_json');
552
- const hasProtocolError = transportErrors.some(e => e.category === 'protocol_violation');
554
+ const hasInvalidJson = transportErrors.some((e) => e.category === 'invalid_json');
555
+ const hasProtocolError = transportErrors.some((e) => e.category === 'protocol_violation');
553
556
  if (hasInvalidJson || hasProtocolError) {
554
557
  lines.push('### Recommendations');
555
558
  lines.push('');
@@ -607,7 +610,7 @@ function generateMetricsLegendSection() {
607
610
  }
608
611
  function generateValidationTestingSection(profiles) {
609
612
  const lines = [];
610
- const validationSummary = profiles.map(profile => {
613
+ const validationSummary = profiles.map((profile) => {
611
614
  const buckets = {
612
615
  input: summarizeValidationBucket(profile, 'input'),
613
616
  type: summarizeValidationBucket(profile, 'type'),
@@ -615,7 +618,7 @@ function generateValidationTestingSection(profiles) {
615
618
  };
616
619
  return { profile, buckets };
617
620
  });
618
- const hasValidationTests = validationSummary.some(summary => Object.values(summary.buckets).some(bucket => bucket.total > 0));
621
+ const hasValidationTests = validationSummary.some((summary) => Object.values(summary.buckets).some((bucket) => bucket.total > 0));
619
622
  if (!hasValidationTests) {
620
623
  return lines;
621
624
  }
@@ -668,8 +671,8 @@ function generateIssuesDetectedSection(profiles) {
668
671
  lines.push(`### ${ISSUE_CLASSIFICATION.ICONS.serverBug} ${ISSUE_CLASSIFICATION.HEADERS.serverBug}`);
669
672
  lines.push('');
670
673
  // Separate critical (accepts invalid input) from other bugs
671
- const criticalBugs = classified.serverBug.filter(i => i.critical);
672
- const otherBugs = classified.serverBug.filter(i => !i.critical);
674
+ const criticalBugs = classified.serverBug.filter((i) => i.critical);
675
+ const otherBugs = classified.serverBug.filter((i) => !i.critical);
673
676
  if (criticalBugs.length > 0) {
674
677
  lines.push('**Critical - Accepts Invalid Input:**');
675
678
  for (const issue of criticalBugs.slice(0, DISPLAY_LIMITS.ISSUES_DISPLAY_LIMIT)) {
@@ -809,7 +812,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
809
812
  return [];
810
813
  }
811
814
  // Only show if we have meaningful data
812
- const hasValidMetrics = metrics.some(m => m.callCount >= 2);
815
+ const hasValidMetrics = metrics.some((m) => m.callCount >= 2);
813
816
  if (!hasValidMetrics) {
814
817
  return [];
815
818
  }
@@ -830,11 +833,11 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
830
833
  }
831
834
  lines.push('');
832
835
  // Show low confidence warning if any tools have low confidence
833
- const lowConfidenceTools = metrics.filter(m => m.confidence?.confidenceLevel === 'low');
836
+ const lowConfidenceTools = metrics.filter((m) => m.confidence?.confidenceLevel === 'low');
834
837
  if (lowConfidenceTools.length > 0) {
835
838
  // Categorize low confidence by reason
836
- const lowSampleTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
837
- const highVariabilityTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
839
+ const lowSampleTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
840
+ const highVariabilityTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
838
841
  (m.confidence?.coefficientOfVariation ?? 0) > PERFORMANCE_CONFIDENCE.MEDIUM.MAX_CV);
839
842
  lines.push(`> **⚠️ Low Confidence**: ${lowConfidenceTools.length} tool(s) have low statistical confidence.`);
840
843
  if (lowSampleTools.length > 0) {
@@ -847,7 +850,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
847
850
  lines.push('');
848
851
  }
849
852
  // Add confidence summary section (collapsed)
850
- const hasConfidenceData = metrics.some(m => m.confidence);
853
+ const hasConfidenceData = metrics.some((m) => m.confidence);
851
854
  if (hasConfidenceData) {
852
855
  lines.push('<details>');
853
856
  lines.push('<summary>Confidence Metrics Details</summary>');
@@ -867,7 +870,9 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
867
870
  // In this case, display ~0% to indicate the variability is below measurement threshold
868
871
  const rawCV = m.confidence.coefficientOfVariation * 100;
869
872
  const cvDisplay = successfulSamples > 0
870
- ? (roundedStdDev === 0 && rawCV > 1 ? '~0%' : `${rawCV.toFixed(1)}%`)
873
+ ? roundedStdDev === 0 && rawCV > 1
874
+ ? '~0%'
875
+ : `${rawCV.toFixed(1)}%`
871
876
  : 'N/A';
872
877
  const levelLabel = PERFORMANCE_CONFIDENCE.LABELS[m.confidence.confidenceLevel];
873
878
  lines.push(`| \`${escapeTableCell(m.toolName)}\` | ${successfulSamples} | ${validationSamples} | ${totalTests} | ${stdDevDisplay} | ${cvDisplay} | ${levelLabel} |`);
@@ -931,11 +936,11 @@ function generateContractSecuritySection(fingerprints) {
931
936
  lines.push(`| Average Risk Score | ${avgRiskScore}/100 |`);
932
937
  // Count by severity
933
938
  const bySeverity = {
934
- critical: allFindings.filter(f => f.riskLevel === 'critical').length,
935
- high: allFindings.filter(f => f.riskLevel === 'high').length,
936
- medium: allFindings.filter(f => f.riskLevel === 'medium').length,
937
- low: allFindings.filter(f => f.riskLevel === 'low').length,
938
- info: allFindings.filter(f => f.riskLevel === 'info').length,
939
+ critical: allFindings.filter((f) => f.riskLevel === 'critical').length,
940
+ high: allFindings.filter((f) => f.riskLevel === 'high').length,
941
+ medium: allFindings.filter((f) => f.riskLevel === 'medium').length,
942
+ low: allFindings.filter((f) => f.riskLevel === 'low').length,
943
+ info: allFindings.filter((f) => f.riskLevel === 'info').length,
939
944
  };
940
945
  if (bySeverity.critical > 0) {
941
946
  lines.push(`| Critical Findings | ${bySeverity.critical} |`);
@@ -954,7 +959,7 @@ function generateContractSecuritySection(fingerprints) {
954
959
  return lines;
955
960
  }
956
961
  // Show findings by severity
957
- const criticalAndHigh = allFindings.filter(f => f.riskLevel === 'critical' || f.riskLevel === 'high');
962
+ const criticalAndHigh = allFindings.filter((f) => f.riskLevel === 'critical' || f.riskLevel === 'high');
958
963
  if (criticalAndHigh.length > 0) {
959
964
  lines.push('### Critical and High Severity Findings');
960
965
  lines.push('');
@@ -987,7 +992,7 @@ function generateContractSecuritySection(fingerprints) {
987
992
  lines.push('');
988
993
  }
989
994
  // Show medium/low findings in collapsed section
990
- const mediumAndLow = allFindings.filter(f => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
995
+ const mediumAndLow = allFindings.filter((f) => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
991
996
  if (mediumAndLow.length > 0) {
992
997
  lines.push('<details>');
993
998
  lines.push(`<summary>Medium/Low Severity Findings (${mediumAndLow.length})</summary>`);
@@ -1027,10 +1032,10 @@ function generateWorkflowTestingSection(results) {
1027
1032
  if (results.length === 0) {
1028
1033
  return [];
1029
1034
  }
1030
- const passed = results.filter(r => r.success).length;
1035
+ const passed = results.filter((r) => r.success).length;
1031
1036
  const failed = results.length - passed;
1032
1037
  const totalSteps = results.reduce((sum, r) => sum + r.workflow.steps.length, 0);
1033
- const passedSteps = results.reduce((sum, r) => sum + r.steps.filter(s => s.success).length, 0);
1038
+ const passedSteps = results.reduce((sum, r) => sum + r.steps.filter((s) => s.success).length, 0);
1034
1039
  const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
1035
1040
  lines.push('## Workflow Testing');
1036
1041
  lines.push('');
@@ -1053,7 +1058,7 @@ function generateWorkflowTestingSection(results) {
1053
1058
  lines.push('|----------|--------|-------|----------|');
1054
1059
  for (const result of results) {
1055
1060
  const status = result.success ? '✓ Passed' : '✗ Failed';
1056
- const stepsInfo = `${result.steps.filter(s => s.success).length}/${result.workflow.steps.length}`;
1061
+ const stepsInfo = `${result.steps.filter((s) => s.success).length}/${result.workflow.steps.length}`;
1057
1062
  const duration = formatDuration(result.durationMs);
1058
1063
  lines.push(`| ${escapeTableCell(result.workflow.name)} | ${status} | ${stepsInfo} | ${duration} |`);
1059
1064
  }
@@ -1083,8 +1088,8 @@ function generateWorkflowTestingSection(results) {
1083
1088
  if (stepResult.error) {
1084
1089
  notes = escapeTableCell(truncateString(stepResult.error, 40));
1085
1090
  }
1086
- else if (stepResult.assertionResults?.some(a => !a.passed)) {
1087
- const failedAssertions = stepResult.assertionResults.filter(a => !a.passed);
1091
+ else if (stepResult.assertionResults?.some((a) => !a.passed)) {
1092
+ const failedAssertions = stepResult.assertionResults.filter((a) => !a.passed);
1088
1093
  notes = `${failedAssertions.length} assertion(s) failed`;
1089
1094
  }
1090
1095
  }
@@ -1177,17 +1182,17 @@ function generateSemanticTypesSection(inferences) {
1177
1182
  byType.set(inf.inferredType, existing);
1178
1183
  }
1179
1184
  // Sort by number of parameters (most common types first)
1180
- const sortedTypes = Array.from(byType.entries())
1181
- .sort((a, b) => b[1].length - a[1].length);
1185
+ const sortedTypes = Array.from(byType.entries()).sort((a, b) => b[1].length - a[1].length);
1182
1186
  lines.push('| Type | Parameters | Expected Format |');
1183
1187
  lines.push('|------|------------|-----------------|');
1184
1188
  for (const [type, params] of sortedTypes) {
1185
1189
  const displayName = SEMANTIC_VALIDATION.TYPE_DISPLAY_NAMES[type] ?? type;
1186
- const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ?? '';
1190
+ const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ??
1191
+ '';
1187
1192
  // Format parameters as tool.param
1188
1193
  const paramList = params
1189
1194
  .slice(0, 3)
1190
- .map(p => `\`${p.toolName}.${p.paramName}\``)
1195
+ .map((p) => `\`${p.toolName}.${p.paramName}\``)
1191
1196
  .join(', ');
1192
1197
  const moreCount = params.length > 3 ? ` +${params.length - 3} more` : '';
1193
1198
  lines.push(`| ${displayName} | ${paramList}${moreCount} | \`${exampleValue}\` |`);
@@ -1246,9 +1251,10 @@ function generateSchemaStabilitySection(schemaEvolution) {
1246
1251
  lines.push('Response schema consistency metrics for tools with sufficient test samples:');
1247
1252
  lines.push('');
1248
1253
  // Summary stats
1249
- const stableCount = toolsWithSchemas.filter(t => t.evolution.isStable).length;
1254
+ const stableCount = toolsWithSchemas.filter((t) => t.evolution.isStable).length;
1250
1255
  const unstableCount = toolsWithSchemas.length - stableCount;
1251
- const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) / toolsWithSchemas.length;
1256
+ const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) /
1257
+ toolsWithSchemas.length;
1252
1258
  lines.push('| Metric | Value |');
1253
1259
  lines.push('|--------|-------|');
1254
1260
  lines.push(`| Tools Analyzed | ${toolsWithSchemas.length} |`);
@@ -1269,7 +1275,7 @@ function generateSchemaStabilitySection(schemaEvolution) {
1269
1275
  lines.push('| Tool | Grade | Stability | Confidence | Samples | Issues |');
1270
1276
  lines.push('|------|-------|-----------|------------|---------|--------|');
1271
1277
  // Sort by grade (worst first, then by name)
1272
- const gradeOrder = { 'F': 0, 'D': 1, 'C': 2, 'B': 3, 'A': 4, 'N/A': 5 };
1278
+ const gradeOrder = { F: 0, D: 1, C: 2, B: 3, A: 4, 'N/A': 5 };
1273
1279
  const sortedTools = [...toolsWithSchemas].sort((a, b) => {
1274
1280
  const gradeCompare = gradeOrder[a.grade] - gradeOrder[b.grade];
1275
1281
  if (gradeCompare !== 0)
@@ -1284,13 +1290,15 @@ function generateSchemaStabilitySection(schemaEvolution) {
1284
1290
  const confidenceDisplay = `${Math.round(evolution.stabilityConfidence * 100)}%`;
1285
1291
  const issues = evolution.inconsistentFields.length > 0
1286
1292
  ? evolution.inconsistentFields.slice(0, 2).join(', ') +
1287
- (evolution.inconsistentFields.length > 2 ? ` +${evolution.inconsistentFields.length - 2}` : '')
1293
+ (evolution.inconsistentFields.length > 2
1294
+ ? ` +${evolution.inconsistentFields.length - 2}`
1295
+ : '')
1288
1296
  : '-';
1289
1297
  lines.push(`| \`${escapeTableCell(name)}\` | ${gradeEmoji} ${grade} | ${stabilityStatus} | ${confidenceDisplay} | ${evolution.sampleCount} | ${escapeTableCell(issues)} |`);
1290
1298
  }
1291
1299
  lines.push('');
1292
1300
  // Detailed breakdown for unstable tools
1293
- const unstableTools = sortedTools.filter(t => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
1301
+ const unstableTools = sortedTools.filter((t) => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
1294
1302
  if (unstableTools.length > 0) {
1295
1303
  lines.push('<details>');
1296
1304
  lines.push('<summary>Unstable Schema Details</summary>');
@@ -1327,12 +1335,18 @@ function generateSchemaStabilitySection(schemaEvolution) {
1327
1335
  */
1328
1336
  function getGradeEmoji(grade) {
1329
1337
  switch (grade) {
1330
- case 'A': return '🟢';
1331
- case 'B': return '🟢';
1332
- case 'C': return '🟡';
1333
- case 'D': return '🟠';
1334
- case 'F': return '🔴';
1335
- case 'N/A': return '';
1338
+ case 'A':
1339
+ return '🟢';
1340
+ case 'B':
1341
+ return '🟢';
1342
+ case 'C':
1343
+ return '🟡';
1344
+ case 'D':
1345
+ return '🟠';
1346
+ case 'F':
1347
+ return '🔴';
1348
+ case 'N/A':
1349
+ return '⚪';
1336
1350
  }
1337
1351
  }
1338
1352
  /**
@@ -1360,7 +1374,8 @@ function generateErrorAnalysisSection(summaries) {
1360
1374
  const allCategories = new Set();
1361
1375
  const transientCount = toolsWithErrors.reduce((sum, t) => sum + t.summary.transientErrors, 0);
1362
1376
  for (const { summary } of toolsWithErrors) {
1363
- for (const cat of summary.categoryCounts.keys()) {
1377
+ const counts = normalizeCategoryCounts(summary.categoryCounts);
1378
+ for (const cat of counts.keys()) {
1364
1379
  allCategories.add(cat);
1365
1380
  }
1366
1381
  }
@@ -1374,7 +1389,8 @@ function generateErrorAnalysisSection(summaries) {
1374
1389
  // Overall error breakdown by category
1375
1390
  const globalCategoryCounts = new Map();
1376
1391
  for (const { summary } of toolsWithErrors) {
1377
- for (const [cat, count] of summary.categoryCounts) {
1392
+ const counts = normalizeCategoryCounts(summary.categoryCounts);
1393
+ for (const [cat, count] of counts) {
1378
1394
  globalCategoryCounts.set(cat, (globalCategoryCounts.get(cat) ?? 0) + count);
1379
1395
  }
1380
1396
  }
@@ -1384,10 +1400,10 @@ function generateErrorAnalysisSection(summaries) {
1384
1400
  lines.push('| Category | Count | Description |');
1385
1401
  lines.push('|----------|-------|-------------|');
1386
1402
  // Sort by count descending
1387
- const sortedCategories = Array.from(globalCategoryCounts.entries())
1388
- .sort((a, b) => b[1] - a[1]);
1403
+ const sortedCategories = Array.from(globalCategoryCounts.entries()).sort((a, b) => b[1] - a[1]);
1389
1404
  for (const [category, count] of sortedCategories) {
1390
- const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ?? category;
1405
+ const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ??
1406
+ category;
1391
1407
  const emoji = getCategoryEmoji(category);
1392
1408
  lines.push(`| ${emoji} ${label} | ${count} | ${escapeTableCell(formatCategoryDescription(category))} |`);
1393
1409
  }
@@ -1406,14 +1422,12 @@ function generateErrorAnalysisSection(summaries) {
1406
1422
  ? (ERROR_ANALYSIS.CATEGORY_LABELS[topCategory] ?? topCategory)
1407
1423
  : '-';
1408
1424
  const topRemediation = summary.topRemediations[0] ?? '-';
1409
- const truncatedRemediation = topRemediation.length > 50
1410
- ? `${topRemediation.slice(0, 47)}...`
1411
- : topRemediation;
1425
+ const truncatedRemediation = topRemediation.length > 50 ? `${topRemediation.slice(0, 47)}...` : topRemediation;
1412
1426
  lines.push(`| \`${escapeTableCell(name)}\` | ${summary.totalErrors} | ${summary.transientErrors} | ${topCategoryLabel} | ${escapeTableCell(truncatedRemediation)} |`);
1413
1427
  }
1414
1428
  lines.push('');
1415
1429
  // Detailed remediation suggestions (collapsed)
1416
- const toolsWithRemediations = sortedTools.filter(t => t.summary.topRemediations.length > 0);
1430
+ const toolsWithRemediations = sortedTools.filter((t) => t.summary.topRemediations.length > 0);
1417
1431
  if (toolsWithRemediations.length > 0) {
1418
1432
  lines.push('<details>');
1419
1433
  lines.push('<summary>Remediation Suggestions</summary>');
@@ -1463,13 +1477,20 @@ function generateErrorAnalysisSection(summaries) {
1463
1477
  */
1464
1478
  function getCategoryEmoji(category) {
1465
1479
  switch (category) {
1466
- case 'client_error_validation': return '⚠️';
1467
- case 'client_error_auth': return '🔐';
1468
- case 'client_error_not_found': return '🔍';
1469
- case 'client_error_conflict': return '💥';
1470
- case 'client_error_rate_limit': return '⏱️';
1471
- case 'server_error': return '🔥';
1472
- default: return '';
1480
+ case 'client_error_validation':
1481
+ return '⚠️';
1482
+ case 'client_error_auth':
1483
+ return '🔐';
1484
+ case 'client_error_not_found':
1485
+ return '🔍';
1486
+ case 'client_error_conflict':
1487
+ return '💥';
1488
+ case 'client_error_rate_limit':
1489
+ return '⏱️';
1490
+ case 'server_error':
1491
+ return '🔥';
1492
+ default:
1493
+ return '❓';
1473
1494
  }
1474
1495
  }
1475
1496
  /**
@@ -1493,13 +1514,30 @@ function formatCategoryDescription(category) {
1493
1514
  return 'Unknown error category';
1494
1515
  }
1495
1516
  }
1517
+ /**
1518
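+ * Normalize category counts into a Map: a Map is returned as-is, a plain object is converted keeping only numeric-valued entries, and any other input yields an empty Map.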
1519
+ */
1520
+ function normalizeCategoryCounts(counts) {
1521
+ if (!counts) {
1522
+ return new Map();
1523
+ }
1524
+ if (counts instanceof Map) {
1525
+ return counts;
1526
+ }
1527
+ if (typeof counts !== 'object') {
1528
+ return new Map();
1529
+ }
1530
+ const entries = Object.entries(counts).filter((entry) => typeof entry[1] === 'number');
1531
+ return new Map(entries);
1532
+ }
1496
1533
  /**
1497
1534
  * Get the top category from a category counts map.
1498
1535
  */
1499
1536
  function getTopCategory(counts) {
1537
+ const normalized = normalizeCategoryCounts(counts);
1500
1538
  let topCategory;
1501
1539
  let topCount = 0;
1502
- for (const [category, count] of counts) {
1540
+ for (const [category, count] of normalized) {
1503
1541
  if (count > topCount) {
1504
1542
  topCount = count;
1505
1543
  topCategory = category;
@@ -1601,7 +1639,10 @@ function formatIssueTypeLabel(type) {
1601
1639
  case 'no_examples':
1602
1640
  return 'No Examples';
1603
1641
  default:
1604
- return type.split('_').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
1642
+ return type
1643
+ .split('_')
1644
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
1645
+ .join(' ');
1605
1646
  }
1606
1647
  }
1607
1648
  /**
@@ -1617,10 +1658,10 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
1617
1658
  return [];
1618
1659
  }
1619
1660
  // Find successful interactions
1620
- const successful = profile.interactions.filter(i => {
1661
+ const successful = profile.interactions.filter((i) => {
1621
1662
  if (i.error || i.response?.isError)
1622
1663
  return false;
1623
- const textContent = i.response?.content?.find(c => c.type === 'text');
1664
+ const textContent = i.response?.content?.find((c) => c.type === 'text');
1624
1665
  if (textContent && 'text' in textContent) {
1625
1666
  if (looksLikeError(String(textContent.text)))
1626
1667
  return false;
@@ -1640,7 +1681,7 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
1640
1681
  if (seenArgsHashes.has(argsHash))
1641
1682
  continue;
1642
1683
  seenArgsHashes.add(argsHash);
1643
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1684
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1644
1685
  if (!textContent || !('text' in textContent))
1645
1686
  continue;
1646
1687
  const responseText = String(textContent.text);
@@ -1696,7 +1737,7 @@ function generateToolErrorPatterns(profile) {
1696
1737
  continue;
1697
1738
  }
1698
1739
  const errorText = interaction.error || '';
1699
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1740
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1700
1741
  const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
1701
1742
  const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
1702
1743
  if (!isError)
@@ -1706,7 +1747,8 @@ function generateToolErrorPatterns(profile) {
1706
1747
  continue;
1707
1748
  const category = categorizeError(errorContent);
1708
1749
  const existing = errorCategories.get(category) || [];
1709
- if (existing.length < 2) { // Max 2 examples per category
1750
+ if (existing.length < 2) {
1751
+ // Max 2 examples per category
1710
1752
  const truncated = errorContent.length > 100 ? `${errorContent.slice(0, 97)}...` : errorContent;
1711
1753
  existing.push(truncated);
1712
1754
  }
@@ -1758,7 +1800,7 @@ function generateErrorSummarySection(profiles) {
1758
1800
  continue;
1759
1801
  }
1760
1802
  const errorText = interaction.error || '';
1761
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1803
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1762
1804
  const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
1763
1805
  const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
1764
1806
  if (!isError)
@@ -1771,7 +1813,8 @@ function generateErrorSummarySection(profiles) {
1771
1813
  existing.count++;
1772
1814
  existing.tools.add(profile.name);
1773
1815
  if (!existing.example) {
1774
- existing.example = errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
1816
+ existing.example =
1817
+ errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
1775
1818
  }
1776
1819
  categoryCounts.set(category, existing);
1777
1820
  }
@@ -1786,7 +1829,10 @@ function generateErrorSummarySection(profiles) {
1786
1829
  lines.push('| Category | Count | Affected Tools |');
1787
1830
  lines.push('|----------|-------|----------------|');
1788
1831
  for (const [category, data] of categoryCounts) {
1789
- const toolList = Array.from(data.tools).slice(0, 3).map(t => `\`${t}\``).join(', ');
1832
+ const toolList = Array.from(data.tools)
1833
+ .slice(0, 3)
1834
+ .map((t) => `\`${t}\``)
1835
+ .join(', ');
1790
1836
  const more = data.tools.size > 3 ? ` +${data.tools.size - 3} more` : '';
1791
1837
  lines.push(`| ${category} | ${data.count} | ${toolList}${more} |`);
1792
1838
  }
@@ -1813,7 +1859,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
1813
1859
  continue;
1814
1860
  }
1815
1861
  const errorText = interaction.error || '';
1816
- const textContent = interaction.response?.content?.find(c => c.type === 'text');
1862
+ const textContent = interaction.response?.content?.find((c) => c.type === 'text');
1817
1863
  const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
1818
1864
  const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
1819
1865
  if (!isError)
@@ -1843,7 +1889,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
1843
1889
  });
1844
1890
  }
1845
1891
  if (patterns.length > 0) {
1846
- const tool = tools.find(t => t.name === profile.name);
1892
+ const tool = tools.find((t) => t.name === profile.name);
1847
1893
  errorInputs.push({
1848
1894
  toolName: profile.name,
1849
1895
  toolDescription: tool?.description,
@@ -1962,7 +2008,9 @@ function collectAssertionFailures(profile) {
1962
2008
  for (const result of interaction.assertionResults ?? []) {
1963
2009
  if (result.passed)
1964
2010
  continue;
1965
- const message = result.message ? `${result.type}: ${result.message}` : `${result.type} failed`;
2011
+ const message = result.message
2012
+ ? `${result.type}: ${result.message}`
2013
+ : `${result.type} failed`;
1966
2014
  failures.add(message);
1967
2015
  }
1968
2016
  }
@@ -1,14 +1,13 @@
1
1
  import { Ajv2020 as Ajv } from 'ajv/dist/2020.js';
2
2
  import { readFileSync } from 'fs';
3
+ import { isAbsolute } from 'path';
3
4
  import { fileURLToPath } from 'url';
4
5
  import { REPORT_SCHEMAS } from '../constants.js';
5
6
  /**
6
7
  * Generate a JSON report of the interview.
7
8
  */
8
9
  export function generateJsonReport(result, options = {}) {
9
- const report = options.schemaUrl
10
- ? { $schema: options.schemaUrl, ...result }
11
- : { ...result };
10
+ const report = options.schemaUrl ? { $schema: options.schemaUrl, ...result } : { ...result };
12
11
  const jsonReadyReport = JSON.parse(JSON.stringify(report));
13
12
  if (options.validate) {
14
13
  const schemaPath = resolveSchemaPath(options.schemaPath);
@@ -18,7 +17,11 @@ export function generateJsonReport(result, options = {}) {
18
17
  }
19
18
  function resolveSchemaPath(schemaPath) {
20
19
  if (schemaPath) {
21
- return schemaPath;
20
+ if (isAbsolute(schemaPath)) {
21
+ return schemaPath;
22
+ }
23
+ const base = new URL('../../', import.meta.url);
24
+ return fileURLToPath(new URL(schemaPath, base));
22
25
  }
23
26
  const url = new URL(`../../${REPORT_SCHEMAS.CHECK_REPORT_SCHEMA_FILE}`, import.meta.url);
24
27
  return fileURLToPath(url);
@@ -38,7 +41,7 @@ function validateReportAgainstSchema(report, schemaPath) {
38
41
  const validate = ajv.compile(schema);
39
42
  if (!validate(report)) {
40
43
  const errorText = ajv.errorsText(validate.errors, { separator: '\n' });
41
- throw new Error(`Check report schema validation failed:\n${errorText}`);
44
+ throw new Error(`Report schema validation failed:\n${errorText}`);
42
45
  }
43
46
  }
44
47
  //# sourceMappingURL=report.js.map
@@ -0,0 +1,17 @@
1
+ import type { InterviewResult } from './types.js';
2
+ import type { SemanticInference } from '../validation/semantic-types.js';
3
+ import type { ResponseSchemaEvolution } from '../baseline/response-schema-tracker.js';
4
+ import type { ErrorAnalysisSummary } from '../baseline/error-analyzer.js';
5
+ import type { DocumentationScore } from '../baseline/documentation-scorer.js';
6
+ export interface InterviewInsights {
7
+ semanticInferences?: Record<string, SemanticInference[]>;
8
+ schemaEvolution?: Record<string, ResponseSchemaEvolution>;
9
+ errorAnalysisSummaries?: Record<string, ErrorAnalysisSummary>;
10
+ documentationScore?: DocumentationScore;
11
+ }
12
+ /**
13
+ * Build derived insights from an interview result.
14
+ * These insights are used for documentation and JSON report enrichment.
15
+ */
16
+ export declare function buildInterviewInsights(result: InterviewResult): InterviewInsights;
17
+ //# sourceMappingURL=insights.d.ts.map