npm - @dotsetlabs/bellwether - Versions diffs - 1.0.3 → 2.0.1 - Mend

@dotsetlabs/bellwether 1.0.3 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

package/CHANGELOG.md +118 -0
package/README.md +17 -2
package/dist/auth/credentials.js +2 -0
package/dist/baseline/accessors.d.ts +1 -1
package/dist/baseline/accessors.js +13 -3
package/dist/baseline/baseline-format.d.ts +335 -0
package/dist/baseline/baseline-format.js +12 -0
package/dist/baseline/comparator.js +494 -13
package/dist/baseline/converter.d.ts +15 -15
package/dist/baseline/converter.js +97 -37
package/dist/baseline/diff.d.ts +1 -1
package/dist/baseline/diff.js +45 -28
package/dist/baseline/error-analyzer.d.ts +1 -1
package/dist/baseline/error-analyzer.js +90 -17
package/dist/baseline/incremental-checker.js +8 -5
package/dist/baseline/index.d.ts +2 -12
package/dist/baseline/index.js +3 -23
package/dist/baseline/performance-tracker.d.ts +0 -1
package/dist/baseline/performance-tracker.js +13 -20
package/dist/baseline/response-fingerprint.js +40 -3
package/dist/baseline/saver.js +75 -10
package/dist/baseline/schema-compare.d.ts +22 -0
package/dist/baseline/schema-compare.js +259 -16
package/dist/baseline/types.d.ts +30 -7
package/dist/cache/response-cache.d.ts +8 -0
package/dist/cache/response-cache.js +119 -2
package/dist/cli/commands/baseline.js +70 -35
package/dist/cli/commands/check.js +71 -15
package/dist/cli/commands/explore.js +69 -16
package/dist/cli/commands/init.js +10 -7
package/dist/cli/commands/watch.js +5 -5
package/dist/cli/index.js +8 -0
package/dist/config/loader.js +2 -2
package/dist/config/template.js +8 -7
package/dist/config/validator.d.ts +59 -59
package/dist/config/validator.js +245 -90
package/dist/constants/core.d.ts +5 -1
package/dist/constants/core.js +9 -20
package/dist/constants/registry.d.ts +17 -0
package/dist/constants/registry.js +18 -0
package/dist/constants/testing.d.ts +0 -369
package/dist/constants/testing.js +18 -456
package/dist/constants.d.ts +1 -1
package/dist/constants.js +1 -1
package/dist/discovery/discovery.js +88 -14
package/dist/discovery/types.d.ts +5 -1
package/dist/docs/agents.js +138 -50
package/dist/docs/contract.js +194 -84
package/dist/docs/report.js +8 -5
package/dist/errors/retry.js +11 -5
package/dist/interview/insights.d.ts +17 -0
package/dist/interview/insights.js +52 -0
package/dist/interview/interviewer.js +52 -10
package/dist/interview/prompt-test-generator.d.ts +12 -0
package/dist/interview/prompt-test-generator.js +77 -0
package/dist/interview/rate-limiter.js +7 -3
package/dist/interview/resource-test-generator.d.ts +12 -0
package/dist/interview/resource-test-generator.js +20 -0
package/dist/interview/schema-inferrer.js +26 -4
package/dist/interview/schema-test-generator.js +278 -31
package/dist/interview/stateful-test-runner.d.ts +3 -0
package/dist/interview/stateful-test-runner.js +80 -0
package/dist/interview/types.d.ts +12 -0
package/dist/llm/anthropic.js +14 -4
package/dist/llm/fallback.d.ts +1 -0
package/dist/llm/fallback.js +7 -1
package/dist/llm/openai.js +15 -4
package/dist/protocol/index.d.ts +2 -0
package/dist/protocol/index.js +2 -0
package/dist/protocol/version-registry.d.ts +66 -0
package/dist/protocol/version-registry.js +159 -0
package/dist/transport/http-transport.d.ts +11 -1
package/dist/transport/http-transport.js +21 -2
package/dist/transport/mcp-client.d.ts +29 -1
package/dist/transport/mcp-client.js +93 -8
package/dist/transport/sse-transport.d.ts +7 -3
package/dist/transport/sse-transport.js +162 -71
package/dist/transport/types.d.ts +134 -1
package/dist/utils/concurrency.d.ts +2 -0
package/dist/utils/concurrency.js +9 -2
package/dist/utils/markdown.js +13 -18
package/dist/utils/timeout.js +2 -1
package/dist/version.js +1 -1
package/man/bellwether.1 +1 -1
package/man/bellwether.1.md +2 -2
package/package.json +1 -1
package/schemas/bellwether-check.schema.json +185 -0
package/schemas/bellwether-explore.schema.json +837 -0
package/scripts/completions/bellwether.bash +10 -4
package/scripts/completions/bellwether.zsh +55 -2

package/dist/docs/contract.js CHANGED Viewed

@@ -5,7 +5,8 @@ import { formatDateISO, formatDuration, escapeTableCell, mermaidLabel, validateJ
 import { smartTruncate, getExampleLength } from '../utils/smart-truncate.js';
 import { calculatePerformanceMetrics, extractParameters, looksLikeError } from './shared.js';
 import { analyzeExternalDependencies, formatExternalDependenciesMarkdown, } from '../baseline/external-dependency-detector.js';
-import { SEMANTIC_VALIDATION, SCHEMA_EVOLUTION, ERROR_ANALYSIS, PERFORMANCE_CONFIDENCE, DOCUMENTATION_SCORING, EXAMPLE_OUTPUT, EXTERNAL_DEPENDENCIES, RELIABILITY_DISPLAY, CONFIDENCE_INDICATORS, DISPLAY_LIMITS, ISSUE_CLASSIFICATION, } from '../constants.js';
+import { SEMANTIC_VALIDATION, SCHEMA_EVOLUTION, ERROR_ANALYSIS, PERFORMANCE_CONFIDENCE, DOCUMENTATION_SCORING, EXAMPLE_OUTPUT, EXTERNAL_DEPENDENCIES, RELIABILITY_DISPLAY, CONFIDENCE_INDICATORS, DISPLAY_LIMITS, ISSUE_CLASSIFICATION, MCP, } from '../constants.js';
+import { getFeatureFlags } from '../protocol/index.js';
 /**
  * Classify issues by their source to help users understand which issues
  * are actual bugs vs expected behavior or environment issues.
@@ -52,13 +53,15 @@ function classifyIssuesBySource(profiles) {
             // but tool didn't actually reject - this shouldn't happen with outcomeAssessment.correct check above
             // so we classify based on expected outcome and error classification
             // 1. Check for external dependency errors (highest priority for classification)
-            if (errorClassification && errorClassification.externalServiceErrors > 0 && detectedServices.length > 0) {
+            if (errorClassification &&
+                errorClassification.externalServiceErrors > 0 &&
+                detectedServices.length > 0) {
                 // Check if the error message matches known external service patterns
-                const isExternalError = detectedServices.some(service => {
+                const isExternalError = detectedServices.some((service) => {
                     const serviceConfig = EXTERNAL_DEPENDENCIES.SERVICES[service];
                     if (!serviceConfig)
                         return false;
-                    return serviceConfig.errorPatterns.some(pattern => pattern.test(errorMsg));
+                    return serviceConfig.errorPatterns.some((pattern) => pattern.test(errorMsg));
                 });
                 if (isExternalError) {
                     issue.service = detectedServices[0];
@@ -68,7 +71,7 @@ function classifyIssuesBySource(profiles) {
             }
             // 2. Check for environment configuration errors
             if (errorClassification && errorClassification.environmentErrors > 0) {
-                const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
+                const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
                 if (isEnvironmentError) {
                     result.environment.push(issue);
                     continue;
@@ -99,7 +102,7 @@ function classifyIssuesBySource(profiles) {
                     continue;
                 }
                 // Check if error message indicates environment issue
-                const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
+                const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
                 if (isEnvironmentError) {
                     result.environment.push(issue);
                     continue;
@@ -145,11 +148,20 @@ export function generateContractMd(result, options) {
     // Overview
     lines.push('## Overview');
     lines.push('');
+    const features = getFeatureFlags(discovery.protocolVersion);
     lines.push(`**Server Version:** ${discovery.serverInfo.version}`);
     lines.push(`**Protocol Version:** ${discovery.protocolVersion}`);
+    if (discovery.protocolVersion !== MCP.PROTOCOL_VERSION) {
+        lines.push(`*(Server protocol; bellwether supports up to ${MCP.PROTOCOL_VERSION})*`);
+    }
     lines.push('');
     const performanceMetrics = calculatePerformanceMetrics(toolProfiles);
-    const performanceByTool = new Map(performanceMetrics.map(metric => [metric.toolName, metric]));
+    const performanceByTool = new Map(performanceMetrics.map((metric) => [metric.toolName, metric]));
+    // Server instructions
+    if (discovery.instructions) {
+        lines.push(`**Server Instructions:** ${discovery.instructions}`);
+        lines.push('');
+    }
     // Capabilities summary
     lines.push('## Capabilities');
     lines.push('');
@@ -162,6 +174,15 @@ export function generateContractMd(result, options) {
     if (discovery.capabilities.resources) {
         lines.push(`- **Resources:** ${(discovery.resources ?? []).length} available`);
     }
+    if (discovery.resourceTemplates && discovery.resourceTemplates.length > 0) {
+        lines.push(`- **Resource Templates:** ${discovery.resourceTemplates.length} available`);
+    }
+    if (discovery.capabilities.completions && features.completions) {
+        lines.push('- **Completions:** Supported');
+    }
+    if (discovery.capabilities.tasks && features.tasks) {
+        lines.push('- **Tasks:** Supported');
+    }
     if (discovery.capabilities.logging) {
         lines.push('- **Logging:** Supported');
     }
@@ -176,7 +197,7 @@ export function generateContractMd(result, options) {
             const params = extractParameters(tool.inputSchema);
             const desc = tool.description?.substring(0, 50) || 'No description';
             const descDisplay = tool.description && tool.description.length > 50 ? `${desc}...` : desc;
-            const profile = toolProfiles.find(p => p.name === tool.name);
+            const profile = toolProfiles.find((p) => p.name === tool.name);
             const perf = performanceByTool.get(tool.name);
             const successRate = calculateToolSuccessRate(profile, {
                 countValidationAsSuccess,
@@ -291,7 +312,7 @@ export function generateContractMd(result, options) {
         lines.push('## Tools');
         lines.push('');
         for (const tool of discovery.tools) {
-            const profile = toolProfiles.find(p => p.name === tool.name);
+            const profile = toolProfiles.find((p) => p.name === tool.name);
             lines.push(`### ${tool.name}`);
             lines.push('');
             lines.push(tool.description || 'No description available.');
@@ -319,6 +340,22 @@ export function generateContractMd(result, options) {
                     lines.push('');
                 }
             }
+            // Show tool annotations (behavioral hints) — version-gated
+            if (features.toolAnnotations && tool.annotations) {
+                const hints = [];
+                if (tool.annotations.readOnlyHint)
+                    hints.push('read-only');
+                if (tool.annotations.destructiveHint)
+                    hints.push('destructive');
+                if (tool.annotations.idempotentHint)
+                    hints.push('idempotent');
+                if (tool.annotations.openWorldHint)
+                    hints.push('open-world');
+                if (hints.length > 0) {
+                    lines.push(`**Behavioral Hints:** ${hints.join(', ')}`);
+                    lines.push('');
+                }
+            }
             if (tool.inputSchema) {
                 lines.push('**Input Schema:**');
                 const schemaJson = validateJsonForCodeBlock(tool.inputSchema);
@@ -327,6 +364,15 @@ export function generateContractMd(result, options) {
                 lines.push('```');
                 lines.push('');
             }
+            // Show output schema if present — version-gated
+            if (features.structuredOutput && tool.outputSchema) {
+                lines.push('**Output Schema:**');
+                const outputSchemaJson = validateJsonForCodeBlock(tool.outputSchema);
+                lines.push('```json');
+                lines.push(outputSchemaJson.content);
+                lines.push('```');
+                lines.push('');
+            }
             // Add example usage from successful interactions
             const examples = generateToolExamples(profile, maxExamplesPerTool, exampleLength);
             if (examples.length > 0) {
@@ -378,6 +424,24 @@ export function generateContractMd(result, options) {
             }
         }
     }
+    // Resource Templates section
+    if (discovery.resourceTemplates && discovery.resourceTemplates.length > 0) {
+        lines.push('## Resource Templates');
+        lines.push('');
+        for (const template of discovery.resourceTemplates) {
+            lines.push(`### ${template.name}`);
+            lines.push('');
+            lines.push(`**URI Template:** \`${template.uriTemplate}\``);
+            if (template.mimeType) {
+                lines.push(`**MIME Type:** ${template.mimeType}`);
+            }
+            lines.push('');
+            if (template.description) {
+                lines.push(template.description);
+                lines.push('');
+            }
+        }
+    }
     // Error Summary section
     const errorSummary = generateErrorSummarySection(toolProfiles);
     if (errorSummary.length > 0) {
@@ -397,7 +461,7 @@ function calculateReliabilityMetrics(profile, options) {
     if (!profile) {
         return null;
     }
-    const interactions = profile.interactions.filter(i => !i.mocked);
+    const interactions = profile.interactions.filter((i) => !i.mocked);
     if (interactions.length === 0) {
         return null;
     }
@@ -408,7 +472,7 @@ function calculateReliabilityMetrics(profile, options) {
     for (const interaction of interactions) {
         const expected = interaction.question.expectedOutcome ?? 'success';
         const hasError = interaction.error || interaction.response?.isError;
-        const textContent = interaction.response?.content?.find(c => c.type === 'text');
+        const textContent = interaction.response?.content?.find((c) => c.type === 'text');
         const hasErrorText = textContent && 'text' in textContent && looksLikeError(String(textContent.text));
         const gotError = hasError || hasErrorText;
         if (expected === 'error') {
@@ -437,7 +501,9 @@ function calculateReliabilityMetrics(profile, options) {
     const reliabilityRate = total > 0 ? (correctOutcomes / total) * 100 : 0;
     const happyPathRate = happyPathTotal > 0 ? (happyPathSuccesses / happyPathTotal) * 100 : 100;
     const validationRate = options.separateValidationMetrics
-        ? (validationTotal > 0 ? (validationSuccesses / validationTotal) * 100 : 100)
+        ? validationTotal > 0
+            ? (validationSuccesses / validationTotal) * 100
+            : 100
         : 100;
     return {
         total,
@@ -481,8 +547,7 @@ function formatConfidenceIndicator(level) {
 function generateTransportIssuesSection(transportErrors, warnings) {
     const lines = [];
     // Skip if no transport issues to report
-    if ((!transportErrors || transportErrors.length === 0) &&
-        (!warnings || warnings.length === 0)) {
+    if ((!transportErrors || transportErrors.length === 0) && (!warnings || warnings.length === 0)) {
         return lines;
     }
     lines.push('## Transport Issues');
@@ -507,8 +572,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
         lines.push('The following transport-level errors were detected during server communication:');
         lines.push('');
         // Categorize errors
-        const serverBugErrors = transportErrors.filter(e => e.likelyServerBug);
-        const envErrors = transportErrors.filter(e => !e.likelyServerBug);
+        const serverBugErrors = transportErrors.filter((e) => e.likelyServerBug);
+        const envErrors = transportErrors.filter((e) => !e.likelyServerBug);
         // Server bugs (critical)
         if (serverBugErrors.length > 0) {
             lines.push('#### Likely Server Bugs');
@@ -548,8 +613,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
             lines.push('');
         }
         // Recommendations
-        const hasInvalidJson = transportErrors.some(e => e.category === 'invalid_json');
-        const hasProtocolError = transportErrors.some(e => e.category === 'protocol_violation');
+        const hasInvalidJson = transportErrors.some((e) => e.category === 'invalid_json');
+        const hasProtocolError = transportErrors.some((e) => e.category === 'protocol_violation');
         if (hasInvalidJson || hasProtocolError) {
             lines.push('### Recommendations');
             lines.push('');
@@ -607,7 +672,7 @@ function generateMetricsLegendSection() {
 }
 function generateValidationTestingSection(profiles) {
     const lines = [];
-    const validationSummary = profiles.map(profile => {
+    const validationSummary = profiles.map((profile) => {
         const buckets = {
             input: summarizeValidationBucket(profile, 'input'),
             type: summarizeValidationBucket(profile, 'type'),
@@ -615,7 +680,7 @@ function generateValidationTestingSection(profiles) {
         };
         return { profile, buckets };
     });
-    const hasValidationTests = validationSummary.some(summary => Object.values(summary.buckets).some(bucket => bucket.total > 0));
+    const hasValidationTests = validationSummary.some((summary) => Object.values(summary.buckets).some((bucket) => bucket.total > 0));
     if (!hasValidationTests) {
         return lines;
     }
@@ -668,8 +733,8 @@ function generateIssuesDetectedSection(profiles) {
         lines.push(`### ${ISSUE_CLASSIFICATION.ICONS.serverBug} ${ISSUE_CLASSIFICATION.HEADERS.serverBug}`);
         lines.push('');
         // Separate critical (accepts invalid input) from other bugs
-        const criticalBugs = classified.serverBug.filter(i => i.critical);
-        const otherBugs = classified.serverBug.filter(i => !i.critical);
+        const criticalBugs = classified.serverBug.filter((i) => i.critical);
+        const otherBugs = classified.serverBug.filter((i) => !i.critical);
         if (criticalBugs.length > 0) {
             lines.push('**Critical - Accepts Invalid Input:**');
             for (const issue of criticalBugs.slice(0, DISPLAY_LIMITS.ISSUES_DISPLAY_LIMIT)) {
@@ -809,7 +874,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
         return [];
     }
     // Only show if we have meaningful data
-    const hasValidMetrics = metrics.some(m => m.callCount >= 2);
+    const hasValidMetrics = metrics.some((m) => m.callCount >= 2);
     if (!hasValidMetrics) {
         return [];
     }
@@ -830,11 +895,11 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
     }
     lines.push('');
     // Show low confidence warning if any tools have low confidence
-    const lowConfidenceTools = metrics.filter(m => m.confidence?.confidenceLevel === 'low');
+    const lowConfidenceTools = metrics.filter((m) => m.confidence?.confidenceLevel === 'low');
     if (lowConfidenceTools.length > 0) {
         // Categorize low confidence by reason
-        const lowSampleTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
-        const highVariabilityTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
+        const lowSampleTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
+        const highVariabilityTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
             (m.confidence?.coefficientOfVariation ?? 0) > PERFORMANCE_CONFIDENCE.MEDIUM.MAX_CV);
         lines.push(`> **⚠️ Low Confidence**: ${lowConfidenceTools.length} tool(s) have low statistical confidence.`);
         if (lowSampleTools.length > 0) {
@@ -847,7 +912,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
         lines.push('');
     }
     // Add confidence summary section (collapsed)
-    const hasConfidenceData = metrics.some(m => m.confidence);
+    const hasConfidenceData = metrics.some((m) => m.confidence);
     if (hasConfidenceData) {
         lines.push('<details>');
         lines.push('<summary>Confidence Metrics Details</summary>');
@@ -867,7 +932,9 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
                 // In this case, display ~0% to indicate the variability is below measurement threshold
                 const rawCV = m.confidence.coefficientOfVariation * 100;
                 const cvDisplay = successfulSamples > 0
-                    ? (roundedStdDev === 0 && rawCV > 1 ? '~0%' : `${rawCV.toFixed(1)}%`)
+                    ? roundedStdDev === 0 && rawCV > 1
+                        ? '~0%'
+                        : `${rawCV.toFixed(1)}%`
                     : 'N/A';
                 const levelLabel = PERFORMANCE_CONFIDENCE.LABELS[m.confidence.confidenceLevel];
                 lines.push(`| \`${escapeTableCell(m.toolName)}\` | ${successfulSamples} | ${validationSamples} | ${totalTests} | ${stdDevDisplay} | ${cvDisplay} | ${levelLabel} |`);
@@ -931,11 +998,11 @@ function generateContractSecuritySection(fingerprints) {
     lines.push(`| Average Risk Score | ${avgRiskScore}/100 |`);
     // Count by severity
     const bySeverity = {
-        critical: allFindings.filter(f => f.riskLevel === 'critical').length,
-        high: allFindings.filter(f => f.riskLevel === 'high').length,
-        medium: allFindings.filter(f => f.riskLevel === 'medium').length,
-        low: allFindings.filter(f => f.riskLevel === 'low').length,
-        info: allFindings.filter(f => f.riskLevel === 'info').length,
+        critical: allFindings.filter((f) => f.riskLevel === 'critical').length,
+        high: allFindings.filter((f) => f.riskLevel === 'high').length,
+        medium: allFindings.filter((f) => f.riskLevel === 'medium').length,
+        low: allFindings.filter((f) => f.riskLevel === 'low').length,
+        info: allFindings.filter((f) => f.riskLevel === 'info').length,
     };
     if (bySeverity.critical > 0) {
         lines.push(`| Critical Findings | ${bySeverity.critical} |`);
@@ -954,7 +1021,7 @@ function generateContractSecuritySection(fingerprints) {
         return lines;
     }
     // Show findings by severity
-    const criticalAndHigh = allFindings.filter(f => f.riskLevel === 'critical' || f.riskLevel === 'high');
+    const criticalAndHigh = allFindings.filter((f) => f.riskLevel === 'critical' || f.riskLevel === 'high');
     if (criticalAndHigh.length > 0) {
         lines.push('### Critical and High Severity Findings');
         lines.push('');
@@ -987,7 +1054,7 @@ function generateContractSecuritySection(fingerprints) {
         lines.push('');
     }
     // Show medium/low findings in collapsed section
-    const mediumAndLow = allFindings.filter(f => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
+    const mediumAndLow = allFindings.filter((f) => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
     if (mediumAndLow.length > 0) {
         lines.push('<details>');
         lines.push(`<summary>Medium/Low Severity Findings (${mediumAndLow.length})</summary>`);
@@ -1027,10 +1094,10 @@ function generateWorkflowTestingSection(results) {
     if (results.length === 0) {
         return [];
     }
-    const passed = results.filter(r => r.success).length;
+    const passed = results.filter((r) => r.success).length;
     const failed = results.length - passed;
     const totalSteps = results.reduce((sum, r) => sum + r.workflow.steps.length, 0);
-    const passedSteps = results.reduce((sum, r) => sum + r.steps.filter(s => s.success).length, 0);
+    const passedSteps = results.reduce((sum, r) => sum + r.steps.filter((s) => s.success).length, 0);
     const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
     lines.push('## Workflow Testing');
     lines.push('');
@@ -1053,7 +1120,7 @@ function generateWorkflowTestingSection(results) {
     lines.push('|----------|--------|-------|----------|');
     for (const result of results) {
         const status = result.success ? '✓ Passed' : '✗ Failed';
-        const stepsInfo = `${result.steps.filter(s => s.success).length}/${result.workflow.steps.length}`;
+        const stepsInfo = `${result.steps.filter((s) => s.success).length}/${result.workflow.steps.length}`;
         const duration = formatDuration(result.durationMs);
         lines.push(`| ${escapeTableCell(result.workflow.name)} | ${status} | ${stepsInfo} | ${duration} |`);
     }
@@ -1083,8 +1150,8 @@ function generateWorkflowTestingSection(results) {
                 if (stepResult.error) {
                     notes = escapeTableCell(truncateString(stepResult.error, 40));
                 }
-                else if (stepResult.assertionResults?.some(a => !a.passed)) {
-                    const failedAssertions = stepResult.assertionResults.filter(a => !a.passed);
+                else if (stepResult.assertionResults?.some((a) => !a.passed)) {
+                    const failedAssertions = stepResult.assertionResults.filter((a) => !a.passed);
                     notes = `${failedAssertions.length} assertion(s) failed`;
                 }
             }
@@ -1177,17 +1244,17 @@ function generateSemanticTypesSection(inferences) {
         byType.set(inf.inferredType, existing);
     }
     // Sort by number of parameters (most common types first)
-    const sortedTypes = Array.from(byType.entries())
-        .sort((a, b) => b[1].length - a[1].length);
+    const sortedTypes = Array.from(byType.entries()).sort((a, b) => b[1].length - a[1].length);
     lines.push('| Type | Parameters | Expected Format |');
     lines.push('|------|------------|-----------------|');
     for (const [type, params] of sortedTypes) {
         const displayName = SEMANTIC_VALIDATION.TYPE_DISPLAY_NAMES[type] ?? type;
-        const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ?? '';
+        const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ??
+            '';
         // Format parameters as tool.param
         const paramList = params
             .slice(0, 3)
-            .map(p => `\`${p.toolName}.${p.paramName}\``)
+            .map((p) => `\`${p.toolName}.${p.paramName}\``)
             .join(', ');
         const moreCount = params.length > 3 ? ` +${params.length - 3} more` : '';
         lines.push(`| ${displayName} | ${paramList}${moreCount} | \`${exampleValue}\` |`);
@@ -1246,9 +1313,10 @@ function generateSchemaStabilitySection(schemaEvolution) {
     lines.push('Response schema consistency metrics for tools with sufficient test samples:');
     lines.push('');
     // Summary stats
-    const stableCount = toolsWithSchemas.filter(t => t.evolution.isStable).length;
+    const stableCount = toolsWithSchemas.filter((t) => t.evolution.isStable).length;
     const unstableCount = toolsWithSchemas.length - stableCount;
-    const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) / toolsWithSchemas.length;
+    const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) /
+        toolsWithSchemas.length;
     lines.push('| Metric | Value |');
     lines.push('|--------|-------|');
     lines.push(`| Tools Analyzed | ${toolsWithSchemas.length} |`);
@@ -1269,7 +1337,7 @@ function generateSchemaStabilitySection(schemaEvolution) {
     lines.push('| Tool | Grade | Stability | Confidence | Samples | Issues |');
     lines.push('|------|-------|-----------|------------|---------|--------|');
     // Sort by grade (worst first, then by name)
-    const gradeOrder = { 'F': 0, 'D': 1, 'C': 2, 'B': 3, 'A': 4, 'N/A': 5 };
+    const gradeOrder = { F: 0, D: 1, C: 2, B: 3, A: 4, 'N/A': 5 };
     const sortedTools = [...toolsWithSchemas].sort((a, b) => {
         const gradeCompare = gradeOrder[a.grade] - gradeOrder[b.grade];
         if (gradeCompare !== 0)
@@ -1284,13 +1352,15 @@ function generateSchemaStabilitySection(schemaEvolution) {
         const confidenceDisplay = `${Math.round(evolution.stabilityConfidence * 100)}%`;
         const issues = evolution.inconsistentFields.length > 0
             ? evolution.inconsistentFields.slice(0, 2).join(', ') +
-                (evolution.inconsistentFields.length > 2 ? ` +${evolution.inconsistentFields.length - 2}` : '')
+                (evolution.inconsistentFields.length > 2
+                    ? ` +${evolution.inconsistentFields.length - 2}`
+                    : '')
             : '-';
         lines.push(`| \`${escapeTableCell(name)}\` | ${gradeEmoji} ${grade} | ${stabilityStatus} | ${confidenceDisplay} | ${evolution.sampleCount} | ${escapeTableCell(issues)} |`);
     }
     lines.push('');
     // Detailed breakdown for unstable tools
-    const unstableTools = sortedTools.filter(t => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
+    const unstableTools = sortedTools.filter((t) => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
     if (unstableTools.length > 0) {
         lines.push('<details>');
         lines.push('<summary>Unstable Schema Details</summary>');
@@ -1327,12 +1397,18 @@ function generateSchemaStabilitySection(schemaEvolution) {
  */
 function getGradeEmoji(grade) {
     switch (grade) {
-        case 'A': return '🟢';
-        case 'B': return '🟢';
-        case 'C': return '🟡';
-        case 'D': return '🟠';
-        case 'F': return '🔴';
-        case 'N/A': return '⚪';
+        case 'A':
+            return '🟢';
+        case 'B':
+            return '🟢';
+        case 'C':
+            return '🟡';
+        case 'D':
+            return '🟠';
+        case 'F':
+            return '🔴';
+        case 'N/A':
+            return '⚪';
     }
 }
 /**
@@ -1360,7 +1436,8 @@ function generateErrorAnalysisSection(summaries) {
     const allCategories = new Set();
     const transientCount = toolsWithErrors.reduce((sum, t) => sum + t.summary.transientErrors, 0);
     for (const { summary } of toolsWithErrors) {
-        for (const cat of summary.categoryCounts.keys()) {
+        const counts = normalizeCategoryCounts(summary.categoryCounts);
+        for (const cat of counts.keys()) {
             allCategories.add(cat);
         }
     }
@@ -1374,7 +1451,8 @@ function generateErrorAnalysisSection(summaries) {
     // Overall error breakdown by category
     const globalCategoryCounts = new Map();
     for (const { summary } of toolsWithErrors) {
-        for (const [cat, count] of summary.categoryCounts) {
+        const counts = normalizeCategoryCounts(summary.categoryCounts);
+        for (const [cat, count] of counts) {
             globalCategoryCounts.set(cat, (globalCategoryCounts.get(cat) ?? 0) + count);
         }
     }
@@ -1384,10 +1462,10 @@ function generateErrorAnalysisSection(summaries) {
         lines.push('| Category | Count | Description |');
         lines.push('|----------|-------|-------------|');
         // Sort by count descending
-        const sortedCategories = Array.from(globalCategoryCounts.entries())
-            .sort((a, b) => b[1] - a[1]);
+        const sortedCategories = Array.from(globalCategoryCounts.entries()).sort((a, b) => b[1] - a[1]);
         for (const [category, count] of sortedCategories) {
-            const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ?? category;
+            const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ??
+                category;
             const emoji = getCategoryEmoji(category);
             lines.push(`| ${emoji} ${label} | ${count} | ${escapeTableCell(formatCategoryDescription(category))} |`);
         }
@@ -1406,14 +1484,12 @@ function generateErrorAnalysisSection(summaries) {
             ? (ERROR_ANALYSIS.CATEGORY_LABELS[topCategory] ?? topCategory)
             : '-';
         const topRemediation = summary.topRemediations[0] ?? '-';
-        const truncatedRemediation = topRemediation.length > 50
-            ? `${topRemediation.slice(0, 47)}...`
-            : topRemediation;
+        const truncatedRemediation = topRemediation.length > 50 ? `${topRemediation.slice(0, 47)}...` : topRemediation;
         lines.push(`| \`${escapeTableCell(name)}\` | ${summary.totalErrors} | ${summary.transientErrors} | ${topCategoryLabel} | ${escapeTableCell(truncatedRemediation)} |`);
     }
     lines.push('');
     // Detailed remediation suggestions (collapsed)
-    const toolsWithRemediations = sortedTools.filter(t => t.summary.topRemediations.length > 0);
+    const toolsWithRemediations = sortedTools.filter((t) => t.summary.topRemediations.length > 0);
     if (toolsWithRemediations.length > 0) {
         lines.push('<details>');
         lines.push('<summary>Remediation Suggestions</summary>');
@@ -1463,13 +1539,20 @@ function generateErrorAnalysisSection(summaries) {
  */
 function getCategoryEmoji(category) {
     switch (category) {
-        case 'client_error_validation': return '⚠️';
-        case 'client_error_auth': return '🔐';
-        case 'client_error_not_found': return '🔍';
-        case 'client_error_conflict': return '💥';
-        case 'client_error_rate_limit': return '⏱️';
-        case 'server_error': return '🔥';
-        default: return '❓';
+        case 'client_error_validation':
+            return '⚠️';
+        case 'client_error_auth':
+            return '🔐';
+        case 'client_error_not_found':
+            return '🔍';
+        case 'client_error_conflict':
+            return '💥';
+        case 'client_error_rate_limit':
+            return '⏱️';
+        case 'server_error':
+            return '🔥';
+        default:
+            return '❓';
     }
 }
 /**
@@ -1493,13 +1576,30 @@ function formatCategoryDescription(category) {
             return 'Unknown error category';
     }
 }
+/**
+ * Normalize category counts into a Map: Maps pass through, plain objects keep only numeric entries, anything else becomes an empty Map.
+ */
+function normalizeCategoryCounts(counts) {
+    if (!counts) {
+        return new Map();
+    }
+    if (counts instanceof Map) {
+        return counts;
+    }
+    if (typeof counts !== 'object') {
+        return new Map();
+    }
+    const entries = Object.entries(counts).filter((entry) => typeof entry[1] === 'number');
+    return new Map(entries);
+}
 /**
  * Get the top category from a category counts map.
  */
 function getTopCategory(counts) {
+    const normalized = normalizeCategoryCounts(counts);
     let topCategory;
     let topCount = 0;
-    for (const [category, count] of counts) {
+    for (const [category, count] of normalized) {
         if (count > topCount) {
             topCount = count;
             topCategory = category;
@@ -1601,7 +1701,10 @@ function formatIssueTypeLabel(type) {
         case 'no_examples':
             return 'No Examples';
         default:
-            return type.split('_').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
+            return type
+                .split('_')
+                .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
+                .join(' ');
     }
 }
 /**
@@ -1617,10 +1720,10 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
         return [];
     }
     // Find successful interactions
-    const successful = profile.interactions.filter(i => {
+    const successful = profile.interactions.filter((i) => {
         if (i.error || i.response?.isError)
             return false;
-        const textContent = i.response?.content?.find(c => c.type === 'text');
+        const textContent = i.response?.content?.find((c) => c.type === 'text');
         if (textContent && 'text' in textContent) {
             if (looksLikeError(String(textContent.text)))
                 return false;
@@ -1640,7 +1743,7 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
         if (seenArgsHashes.has(argsHash))
             continue;
         seenArgsHashes.add(argsHash);
-        const textContent = interaction.response?.content?.find(c => c.type === 'text');
+        const textContent = interaction.response?.content?.find((c) => c.type === 'text');
         if (!textContent || !('text' in textContent))
             continue;
         const responseText = String(textContent.text);
@@ -1696,7 +1799,7 @@ function generateToolErrorPatterns(profile) {
             continue;
         }
         const errorText = interaction.error || '';
-        const textContent = interaction.response?.content?.find(c => c.type === 'text');
+        const textContent = interaction.response?.content?.find((c) => c.type === 'text');
         const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
         const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
         if (!isError)
@@ -1706,7 +1809,8 @@ function generateToolErrorPatterns(profile) {
             continue;
         const category = categorizeError(errorContent);
         const existing = errorCategories.get(category) || [];
-        if (existing.length < 2) { // Max 2 examples per category
+        if (existing.length < 2) {
+            // Max 2 examples per category
             const truncated = errorContent.length > 100 ? `${errorContent.slice(0, 97)}...` : errorContent;
             existing.push(truncated);
         }
@@ -1758,7 +1862,7 @@ function generateErrorSummarySection(profiles) {
                 continue;
             }
             const errorText = interaction.error || '';
-            const textContent = interaction.response?.content?.find(c => c.type === 'text');
+            const textContent = interaction.response?.content?.find((c) => c.type === 'text');
             const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
             const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
             if (!isError)
@@ -1771,7 +1875,8 @@ function generateErrorSummarySection(profiles) {
             existing.count++;
             existing.tools.add(profile.name);
             if (!existing.example) {
-                existing.example = errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
+                existing.example =
+                    errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
             }
             categoryCounts.set(category, existing);
         }
@@ -1786,7 +1891,10 @@ function generateErrorSummarySection(profiles) {
     lines.push('| Category | Count | Affected Tools |');
     lines.push('|----------|-------|----------------|');
     for (const [category, data] of categoryCounts) {
-        const toolList = Array.from(data.tools).slice(0, 3).map(t => `\`${t}\``).join(', ');
+        const toolList = Array.from(data.tools)
+            .slice(0, 3)
+            .map((t) => `\`${t}\``)
+            .join(', ');
         const more = data.tools.size > 3 ? ` +${data.tools.size - 3} more` : '';
         lines.push(`| ${category} | ${data.count} | ${toolList}${more} |`);
     }
@@ -1813,7 +1921,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
                 continue;
             }
             const errorText = interaction.error || '';
-            const textContent = interaction.response?.content?.find(c => c.type === 'text');
+            const textContent = interaction.response?.content?.find((c) => c.type === 'text');
             const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
             const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
             if (!isError)
@@ -1843,7 +1951,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
             });
         }
         if (patterns.length > 0) {
-            const tool = tools.find(t => t.name === profile.name);
+            const tool = tools.find((t) => t.name === profile.name);
             errorInputs.push({
                 toolName: profile.name,
                 toolDescription: tool?.description,
@@ -1962,7 +2070,9 @@ function collectAssertionFailures(profile) {
         for (const result of interaction.assertionResults ?? []) {
             if (result.passed)
                 continue;
-            const message = result.message ? `${result.type}: ${result.message}` : `${result.type} failed`;
+            const message = result.message
+                ? `${result.type}: ${result.message}`
+                : `${result.type} failed`;
             failures.add(message);
         }
     }