@dotsetlabs/bellwether 1.0.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +74 -0
- package/README.md +8 -2
- package/dist/baseline/accessors.d.ts +1 -1
- package/dist/baseline/accessors.js +1 -3
- package/dist/baseline/baseline-format.d.ts +287 -0
- package/dist/baseline/baseline-format.js +12 -0
- package/dist/baseline/comparator.js +249 -11
- package/dist/baseline/converter.d.ts +15 -15
- package/dist/baseline/converter.js +46 -34
- package/dist/baseline/diff.d.ts +1 -1
- package/dist/baseline/diff.js +45 -28
- package/dist/baseline/error-analyzer.d.ts +1 -1
- package/dist/baseline/error-analyzer.js +90 -17
- package/dist/baseline/incremental-checker.js +8 -5
- package/dist/baseline/index.d.ts +2 -12
- package/dist/baseline/index.js +3 -23
- package/dist/baseline/performance-tracker.d.ts +0 -1
- package/dist/baseline/performance-tracker.js +13 -20
- package/dist/baseline/response-fingerprint.js +39 -2
- package/dist/baseline/saver.js +41 -10
- package/dist/baseline/schema-compare.d.ts +22 -0
- package/dist/baseline/schema-compare.js +259 -16
- package/dist/baseline/types.d.ts +10 -7
- package/dist/cache/response-cache.d.ts +8 -0
- package/dist/cache/response-cache.js +110 -0
- package/dist/cli/commands/check.js +23 -6
- package/dist/cli/commands/explore.js +34 -14
- package/dist/cli/index.js +8 -0
- package/dist/config/template.js +8 -7
- package/dist/config/validator.d.ts +59 -59
- package/dist/config/validator.js +245 -90
- package/dist/constants/core.d.ts +4 -0
- package/dist/constants/core.js +8 -19
- package/dist/constants/registry.d.ts +17 -0
- package/dist/constants/registry.js +18 -0
- package/dist/constants/testing.d.ts +0 -369
- package/dist/constants/testing.js +18 -456
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/docs/contract.js +131 -83
- package/dist/docs/report.js +8 -5
- package/dist/interview/insights.d.ts +17 -0
- package/dist/interview/insights.js +52 -0
- package/dist/interview/interviewer.js +52 -10
- package/dist/interview/prompt-test-generator.d.ts +12 -0
- package/dist/interview/prompt-test-generator.js +77 -0
- package/dist/interview/resource-test-generator.d.ts +12 -0
- package/dist/interview/resource-test-generator.js +20 -0
- package/dist/interview/schema-inferrer.js +26 -4
- package/dist/interview/schema-test-generator.js +278 -31
- package/dist/interview/stateful-test-runner.d.ts +3 -0
- package/dist/interview/stateful-test-runner.js +80 -0
- package/dist/interview/types.d.ts +12 -0
- package/dist/transport/mcp-client.js +1 -1
- package/dist/transport/sse-transport.d.ts +7 -3
- package/dist/transport/sse-transport.js +157 -67
- package/dist/version.js +1 -1
- package/man/bellwether.1 +1 -1
- package/man/bellwether.1.md +2 -2
- package/package.json +1 -1
- package/schemas/bellwether-check.schema.json +185 -0
- package/schemas/bellwether-explore.schema.json +837 -0
- package/scripts/completions/bellwether.bash +10 -4
- package/scripts/completions/bellwether.zsh +55 -2
package/dist/docs/contract.js
CHANGED
|
@@ -52,13 +52,15 @@ function classifyIssuesBySource(profiles) {
|
|
|
52
52
|
// but tool didn't actually reject - this shouldn't happen with outcomeAssessment.correct check above
|
|
53
53
|
// so we classify based on expected outcome and error classification
|
|
54
54
|
// 1. Check for external dependency errors (highest priority for classification)
|
|
55
|
-
if (errorClassification &&
|
|
55
|
+
if (errorClassification &&
|
|
56
|
+
errorClassification.externalServiceErrors > 0 &&
|
|
57
|
+
detectedServices.length > 0) {
|
|
56
58
|
// Check if the error message matches known external service patterns
|
|
57
|
-
const isExternalError = detectedServices.some(service => {
|
|
59
|
+
const isExternalError = detectedServices.some((service) => {
|
|
58
60
|
const serviceConfig = EXTERNAL_DEPENDENCIES.SERVICES[service];
|
|
59
61
|
if (!serviceConfig)
|
|
60
62
|
return false;
|
|
61
|
-
return serviceConfig.errorPatterns.some(pattern => pattern.test(errorMsg));
|
|
63
|
+
return serviceConfig.errorPatterns.some((pattern) => pattern.test(errorMsg));
|
|
62
64
|
});
|
|
63
65
|
if (isExternalError) {
|
|
64
66
|
issue.service = detectedServices[0];
|
|
@@ -68,7 +70,7 @@ function classifyIssuesBySource(profiles) {
|
|
|
68
70
|
}
|
|
69
71
|
// 2. Check for environment configuration errors
|
|
70
72
|
if (errorClassification && errorClassification.environmentErrors > 0) {
|
|
71
|
-
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
|
|
73
|
+
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
|
|
72
74
|
if (isEnvironmentError) {
|
|
73
75
|
result.environment.push(issue);
|
|
74
76
|
continue;
|
|
@@ -99,7 +101,7 @@ function classifyIssuesBySource(profiles) {
|
|
|
99
101
|
continue;
|
|
100
102
|
}
|
|
101
103
|
// Check if error message indicates environment issue
|
|
102
|
-
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
|
|
104
|
+
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
|
|
103
105
|
if (isEnvironmentError) {
|
|
104
106
|
result.environment.push(issue);
|
|
105
107
|
continue;
|
|
@@ -149,7 +151,7 @@ export function generateContractMd(result, options) {
|
|
|
149
151
|
lines.push(`**Protocol Version:** ${discovery.protocolVersion}`);
|
|
150
152
|
lines.push('');
|
|
151
153
|
const performanceMetrics = calculatePerformanceMetrics(toolProfiles);
|
|
152
|
-
const performanceByTool = new Map(performanceMetrics.map(metric => [metric.toolName, metric]));
|
|
154
|
+
const performanceByTool = new Map(performanceMetrics.map((metric) => [metric.toolName, metric]));
|
|
153
155
|
// Capabilities summary
|
|
154
156
|
lines.push('## Capabilities');
|
|
155
157
|
lines.push('');
|
|
@@ -176,7 +178,7 @@ export function generateContractMd(result, options) {
|
|
|
176
178
|
const params = extractParameters(tool.inputSchema);
|
|
177
179
|
const desc = tool.description?.substring(0, 50) || 'No description';
|
|
178
180
|
const descDisplay = tool.description && tool.description.length > 50 ? `${desc}...` : desc;
|
|
179
|
-
const profile = toolProfiles.find(p => p.name === tool.name);
|
|
181
|
+
const profile = toolProfiles.find((p) => p.name === tool.name);
|
|
180
182
|
const perf = performanceByTool.get(tool.name);
|
|
181
183
|
const successRate = calculateToolSuccessRate(profile, {
|
|
182
184
|
countValidationAsSuccess,
|
|
@@ -291,7 +293,7 @@ export function generateContractMd(result, options) {
|
|
|
291
293
|
lines.push('## Tools');
|
|
292
294
|
lines.push('');
|
|
293
295
|
for (const tool of discovery.tools) {
|
|
294
|
-
const profile = toolProfiles.find(p => p.name === tool.name);
|
|
296
|
+
const profile = toolProfiles.find((p) => p.name === tool.name);
|
|
295
297
|
lines.push(`### ${tool.name}`);
|
|
296
298
|
lines.push('');
|
|
297
299
|
lines.push(tool.description || 'No description available.');
|
|
@@ -397,7 +399,7 @@ function calculateReliabilityMetrics(profile, options) {
|
|
|
397
399
|
if (!profile) {
|
|
398
400
|
return null;
|
|
399
401
|
}
|
|
400
|
-
const interactions = profile.interactions.filter(i => !i.mocked);
|
|
402
|
+
const interactions = profile.interactions.filter((i) => !i.mocked);
|
|
401
403
|
if (interactions.length === 0) {
|
|
402
404
|
return null;
|
|
403
405
|
}
|
|
@@ -408,7 +410,7 @@ function calculateReliabilityMetrics(profile, options) {
|
|
|
408
410
|
for (const interaction of interactions) {
|
|
409
411
|
const expected = interaction.question.expectedOutcome ?? 'success';
|
|
410
412
|
const hasError = interaction.error || interaction.response?.isError;
|
|
411
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
413
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
412
414
|
const hasErrorText = textContent && 'text' in textContent && looksLikeError(String(textContent.text));
|
|
413
415
|
const gotError = hasError || hasErrorText;
|
|
414
416
|
if (expected === 'error') {
|
|
@@ -437,7 +439,9 @@ function calculateReliabilityMetrics(profile, options) {
|
|
|
437
439
|
const reliabilityRate = total > 0 ? (correctOutcomes / total) * 100 : 0;
|
|
438
440
|
const happyPathRate = happyPathTotal > 0 ? (happyPathSuccesses / happyPathTotal) * 100 : 100;
|
|
439
441
|
const validationRate = options.separateValidationMetrics
|
|
440
|
-
?
|
|
442
|
+
? validationTotal > 0
|
|
443
|
+
? (validationSuccesses / validationTotal) * 100
|
|
444
|
+
: 100
|
|
441
445
|
: 100;
|
|
442
446
|
return {
|
|
443
447
|
total,
|
|
@@ -481,8 +485,7 @@ function formatConfidenceIndicator(level) {
|
|
|
481
485
|
function generateTransportIssuesSection(transportErrors, warnings) {
|
|
482
486
|
const lines = [];
|
|
483
487
|
// Skip if no transport issues to report
|
|
484
|
-
if ((!transportErrors || transportErrors.length === 0) &&
|
|
485
|
-
(!warnings || warnings.length === 0)) {
|
|
488
|
+
if ((!transportErrors || transportErrors.length === 0) && (!warnings || warnings.length === 0)) {
|
|
486
489
|
return lines;
|
|
487
490
|
}
|
|
488
491
|
lines.push('## Transport Issues');
|
|
@@ -507,8 +510,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
|
|
|
507
510
|
lines.push('The following transport-level errors were detected during server communication:');
|
|
508
511
|
lines.push('');
|
|
509
512
|
// Categorize errors
|
|
510
|
-
const serverBugErrors = transportErrors.filter(e => e.likelyServerBug);
|
|
511
|
-
const envErrors = transportErrors.filter(e => !e.likelyServerBug);
|
|
513
|
+
const serverBugErrors = transportErrors.filter((e) => e.likelyServerBug);
|
|
514
|
+
const envErrors = transportErrors.filter((e) => !e.likelyServerBug);
|
|
512
515
|
// Server bugs (critical)
|
|
513
516
|
if (serverBugErrors.length > 0) {
|
|
514
517
|
lines.push('#### Likely Server Bugs');
|
|
@@ -548,8 +551,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
|
|
|
548
551
|
lines.push('');
|
|
549
552
|
}
|
|
550
553
|
// Recommendations
|
|
551
|
-
const hasInvalidJson = transportErrors.some(e => e.category === 'invalid_json');
|
|
552
|
-
const hasProtocolError = transportErrors.some(e => e.category === 'protocol_violation');
|
|
554
|
+
const hasInvalidJson = transportErrors.some((e) => e.category === 'invalid_json');
|
|
555
|
+
const hasProtocolError = transportErrors.some((e) => e.category === 'protocol_violation');
|
|
553
556
|
if (hasInvalidJson || hasProtocolError) {
|
|
554
557
|
lines.push('### Recommendations');
|
|
555
558
|
lines.push('');
|
|
@@ -607,7 +610,7 @@ function generateMetricsLegendSection() {
|
|
|
607
610
|
}
|
|
608
611
|
function generateValidationTestingSection(profiles) {
|
|
609
612
|
const lines = [];
|
|
610
|
-
const validationSummary = profiles.map(profile => {
|
|
613
|
+
const validationSummary = profiles.map((profile) => {
|
|
611
614
|
const buckets = {
|
|
612
615
|
input: summarizeValidationBucket(profile, 'input'),
|
|
613
616
|
type: summarizeValidationBucket(profile, 'type'),
|
|
@@ -615,7 +618,7 @@ function generateValidationTestingSection(profiles) {
|
|
|
615
618
|
};
|
|
616
619
|
return { profile, buckets };
|
|
617
620
|
});
|
|
618
|
-
const hasValidationTests = validationSummary.some(summary => Object.values(summary.buckets).some(bucket => bucket.total > 0));
|
|
621
|
+
const hasValidationTests = validationSummary.some((summary) => Object.values(summary.buckets).some((bucket) => bucket.total > 0));
|
|
619
622
|
if (!hasValidationTests) {
|
|
620
623
|
return lines;
|
|
621
624
|
}
|
|
@@ -668,8 +671,8 @@ function generateIssuesDetectedSection(profiles) {
|
|
|
668
671
|
lines.push(`### ${ISSUE_CLASSIFICATION.ICONS.serverBug} ${ISSUE_CLASSIFICATION.HEADERS.serverBug}`);
|
|
669
672
|
lines.push('');
|
|
670
673
|
// Separate critical (accepts invalid input) from other bugs
|
|
671
|
-
const criticalBugs = classified.serverBug.filter(i => i.critical);
|
|
672
|
-
const otherBugs = classified.serverBug.filter(i => !i.critical);
|
|
674
|
+
const criticalBugs = classified.serverBug.filter((i) => i.critical);
|
|
675
|
+
const otherBugs = classified.serverBug.filter((i) => !i.critical);
|
|
673
676
|
if (criticalBugs.length > 0) {
|
|
674
677
|
lines.push('**Critical - Accepts Invalid Input:**');
|
|
675
678
|
for (const issue of criticalBugs.slice(0, DISPLAY_LIMITS.ISSUES_DISPLAY_LIMIT)) {
|
|
@@ -809,7 +812,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
809
812
|
return [];
|
|
810
813
|
}
|
|
811
814
|
// Only show if we have meaningful data
|
|
812
|
-
const hasValidMetrics = metrics.some(m => m.callCount >= 2);
|
|
815
|
+
const hasValidMetrics = metrics.some((m) => m.callCount >= 2);
|
|
813
816
|
if (!hasValidMetrics) {
|
|
814
817
|
return [];
|
|
815
818
|
}
|
|
@@ -830,11 +833,11 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
830
833
|
}
|
|
831
834
|
lines.push('');
|
|
832
835
|
// Show low confidence warning if any tools have low confidence
|
|
833
|
-
const lowConfidenceTools = metrics.filter(m => m.confidence?.confidenceLevel === 'low');
|
|
836
|
+
const lowConfidenceTools = metrics.filter((m) => m.confidence?.confidenceLevel === 'low');
|
|
834
837
|
if (lowConfidenceTools.length > 0) {
|
|
835
838
|
// Categorize low confidence by reason
|
|
836
|
-
const lowSampleTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
|
|
837
|
-
const highVariabilityTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
|
|
839
|
+
const lowSampleTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
|
|
840
|
+
const highVariabilityTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
|
|
838
841
|
(m.confidence?.coefficientOfVariation ?? 0) > PERFORMANCE_CONFIDENCE.MEDIUM.MAX_CV);
|
|
839
842
|
lines.push(`> **⚠️ Low Confidence**: ${lowConfidenceTools.length} tool(s) have low statistical confidence.`);
|
|
840
843
|
if (lowSampleTools.length > 0) {
|
|
@@ -847,7 +850,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
847
850
|
lines.push('');
|
|
848
851
|
}
|
|
849
852
|
// Add confidence summary section (collapsed)
|
|
850
|
-
const hasConfidenceData = metrics.some(m => m.confidence);
|
|
853
|
+
const hasConfidenceData = metrics.some((m) => m.confidence);
|
|
851
854
|
if (hasConfidenceData) {
|
|
852
855
|
lines.push('<details>');
|
|
853
856
|
lines.push('<summary>Confidence Metrics Details</summary>');
|
|
@@ -867,7 +870,9 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
867
870
|
// In this case, display ~0% to indicate the variability is below measurement threshold
|
|
868
871
|
const rawCV = m.confidence.coefficientOfVariation * 100;
|
|
869
872
|
const cvDisplay = successfulSamples > 0
|
|
870
|
-
?
|
|
873
|
+
? roundedStdDev === 0 && rawCV > 1
|
|
874
|
+
? '~0%'
|
|
875
|
+
: `${rawCV.toFixed(1)}%`
|
|
871
876
|
: 'N/A';
|
|
872
877
|
const levelLabel = PERFORMANCE_CONFIDENCE.LABELS[m.confidence.confidenceLevel];
|
|
873
878
|
lines.push(`| \`${escapeTableCell(m.toolName)}\` | ${successfulSamples} | ${validationSamples} | ${totalTests} | ${stdDevDisplay} | ${cvDisplay} | ${levelLabel} |`);
|
|
@@ -931,11 +936,11 @@ function generateContractSecuritySection(fingerprints) {
|
|
|
931
936
|
lines.push(`| Average Risk Score | ${avgRiskScore}/100 |`);
|
|
932
937
|
// Count by severity
|
|
933
938
|
const bySeverity = {
|
|
934
|
-
critical: allFindings.filter(f => f.riskLevel === 'critical').length,
|
|
935
|
-
high: allFindings.filter(f => f.riskLevel === 'high').length,
|
|
936
|
-
medium: allFindings.filter(f => f.riskLevel === 'medium').length,
|
|
937
|
-
low: allFindings.filter(f => f.riskLevel === 'low').length,
|
|
938
|
-
info: allFindings.filter(f => f.riskLevel === 'info').length,
|
|
939
|
+
critical: allFindings.filter((f) => f.riskLevel === 'critical').length,
|
|
940
|
+
high: allFindings.filter((f) => f.riskLevel === 'high').length,
|
|
941
|
+
medium: allFindings.filter((f) => f.riskLevel === 'medium').length,
|
|
942
|
+
low: allFindings.filter((f) => f.riskLevel === 'low').length,
|
|
943
|
+
info: allFindings.filter((f) => f.riskLevel === 'info').length,
|
|
939
944
|
};
|
|
940
945
|
if (bySeverity.critical > 0) {
|
|
941
946
|
lines.push(`| Critical Findings | ${bySeverity.critical} |`);
|
|
@@ -954,7 +959,7 @@ function generateContractSecuritySection(fingerprints) {
|
|
|
954
959
|
return lines;
|
|
955
960
|
}
|
|
956
961
|
// Show findings by severity
|
|
957
|
-
const criticalAndHigh = allFindings.filter(f => f.riskLevel === 'critical' || f.riskLevel === 'high');
|
|
962
|
+
const criticalAndHigh = allFindings.filter((f) => f.riskLevel === 'critical' || f.riskLevel === 'high');
|
|
958
963
|
if (criticalAndHigh.length > 0) {
|
|
959
964
|
lines.push('### Critical and High Severity Findings');
|
|
960
965
|
lines.push('');
|
|
@@ -987,7 +992,7 @@ function generateContractSecuritySection(fingerprints) {
|
|
|
987
992
|
lines.push('');
|
|
988
993
|
}
|
|
989
994
|
// Show medium/low findings in collapsed section
|
|
990
|
-
const mediumAndLow = allFindings.filter(f => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
|
|
995
|
+
const mediumAndLow = allFindings.filter((f) => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
|
|
991
996
|
if (mediumAndLow.length > 0) {
|
|
992
997
|
lines.push('<details>');
|
|
993
998
|
lines.push(`<summary>Medium/Low Severity Findings (${mediumAndLow.length})</summary>`);
|
|
@@ -1027,10 +1032,10 @@ function generateWorkflowTestingSection(results) {
|
|
|
1027
1032
|
if (results.length === 0) {
|
|
1028
1033
|
return [];
|
|
1029
1034
|
}
|
|
1030
|
-
const passed = results.filter(r => r.success).length;
|
|
1035
|
+
const passed = results.filter((r) => r.success).length;
|
|
1031
1036
|
const failed = results.length - passed;
|
|
1032
1037
|
const totalSteps = results.reduce((sum, r) => sum + r.workflow.steps.length, 0);
|
|
1033
|
-
const passedSteps = results.reduce((sum, r) => sum + r.steps.filter(s => s.success).length, 0);
|
|
1038
|
+
const passedSteps = results.reduce((sum, r) => sum + r.steps.filter((s) => s.success).length, 0);
|
|
1034
1039
|
const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
|
|
1035
1040
|
lines.push('## Workflow Testing');
|
|
1036
1041
|
lines.push('');
|
|
@@ -1053,7 +1058,7 @@ function generateWorkflowTestingSection(results) {
|
|
|
1053
1058
|
lines.push('|----------|--------|-------|----------|');
|
|
1054
1059
|
for (const result of results) {
|
|
1055
1060
|
const status = result.success ? '✓ Passed' : '✗ Failed';
|
|
1056
|
-
const stepsInfo = `${result.steps.filter(s => s.success).length}/${result.workflow.steps.length}`;
|
|
1061
|
+
const stepsInfo = `${result.steps.filter((s) => s.success).length}/${result.workflow.steps.length}`;
|
|
1057
1062
|
const duration = formatDuration(result.durationMs);
|
|
1058
1063
|
lines.push(`| ${escapeTableCell(result.workflow.name)} | ${status} | ${stepsInfo} | ${duration} |`);
|
|
1059
1064
|
}
|
|
@@ -1083,8 +1088,8 @@ function generateWorkflowTestingSection(results) {
|
|
|
1083
1088
|
if (stepResult.error) {
|
|
1084
1089
|
notes = escapeTableCell(truncateString(stepResult.error, 40));
|
|
1085
1090
|
}
|
|
1086
|
-
else if (stepResult.assertionResults?.some(a => !a.passed)) {
|
|
1087
|
-
const failedAssertions = stepResult.assertionResults.filter(a => !a.passed);
|
|
1091
|
+
else if (stepResult.assertionResults?.some((a) => !a.passed)) {
|
|
1092
|
+
const failedAssertions = stepResult.assertionResults.filter((a) => !a.passed);
|
|
1088
1093
|
notes = `${failedAssertions.length} assertion(s) failed`;
|
|
1089
1094
|
}
|
|
1090
1095
|
}
|
|
@@ -1177,17 +1182,17 @@ function generateSemanticTypesSection(inferences) {
|
|
|
1177
1182
|
byType.set(inf.inferredType, existing);
|
|
1178
1183
|
}
|
|
1179
1184
|
// Sort by number of parameters (most common types first)
|
|
1180
|
-
const sortedTypes = Array.from(byType.entries())
|
|
1181
|
-
.sort((a, b) => b[1].length - a[1].length);
|
|
1185
|
+
const sortedTypes = Array.from(byType.entries()).sort((a, b) => b[1].length - a[1].length);
|
|
1182
1186
|
lines.push('| Type | Parameters | Expected Format |');
|
|
1183
1187
|
lines.push('|------|------------|-----------------|');
|
|
1184
1188
|
for (const [type, params] of sortedTypes) {
|
|
1185
1189
|
const displayName = SEMANTIC_VALIDATION.TYPE_DISPLAY_NAMES[type] ?? type;
|
|
1186
|
-
const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ??
|
|
1190
|
+
const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ??
|
|
1191
|
+
'';
|
|
1187
1192
|
// Format parameters as tool.param
|
|
1188
1193
|
const paramList = params
|
|
1189
1194
|
.slice(0, 3)
|
|
1190
|
-
.map(p => `\`${p.toolName}.${p.paramName}\``)
|
|
1195
|
+
.map((p) => `\`${p.toolName}.${p.paramName}\``)
|
|
1191
1196
|
.join(', ');
|
|
1192
1197
|
const moreCount = params.length > 3 ? ` +${params.length - 3} more` : '';
|
|
1193
1198
|
lines.push(`| ${displayName} | ${paramList}${moreCount} | \`${exampleValue}\` |`);
|
|
@@ -1246,9 +1251,10 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1246
1251
|
lines.push('Response schema consistency metrics for tools with sufficient test samples:');
|
|
1247
1252
|
lines.push('');
|
|
1248
1253
|
// Summary stats
|
|
1249
|
-
const stableCount = toolsWithSchemas.filter(t => t.evolution.isStable).length;
|
|
1254
|
+
const stableCount = toolsWithSchemas.filter((t) => t.evolution.isStable).length;
|
|
1250
1255
|
const unstableCount = toolsWithSchemas.length - stableCount;
|
|
1251
|
-
const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) /
|
|
1256
|
+
const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) /
|
|
1257
|
+
toolsWithSchemas.length;
|
|
1252
1258
|
lines.push('| Metric | Value |');
|
|
1253
1259
|
lines.push('|--------|-------|');
|
|
1254
1260
|
lines.push(`| Tools Analyzed | ${toolsWithSchemas.length} |`);
|
|
@@ -1269,7 +1275,7 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1269
1275
|
lines.push('| Tool | Grade | Stability | Confidence | Samples | Issues |');
|
|
1270
1276
|
lines.push('|------|-------|-----------|------------|---------|--------|');
|
|
1271
1277
|
// Sort by grade (worst first, then by name)
|
|
1272
|
-
const gradeOrder = {
|
|
1278
|
+
const gradeOrder = { F: 0, D: 1, C: 2, B: 3, A: 4, 'N/A': 5 };
|
|
1273
1279
|
const sortedTools = [...toolsWithSchemas].sort((a, b) => {
|
|
1274
1280
|
const gradeCompare = gradeOrder[a.grade] - gradeOrder[b.grade];
|
|
1275
1281
|
if (gradeCompare !== 0)
|
|
@@ -1284,13 +1290,15 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1284
1290
|
const confidenceDisplay = `${Math.round(evolution.stabilityConfidence * 100)}%`;
|
|
1285
1291
|
const issues = evolution.inconsistentFields.length > 0
|
|
1286
1292
|
? evolution.inconsistentFields.slice(0, 2).join(', ') +
|
|
1287
|
-
(evolution.inconsistentFields.length > 2
|
|
1293
|
+
(evolution.inconsistentFields.length > 2
|
|
1294
|
+
? ` +${evolution.inconsistentFields.length - 2}`
|
|
1295
|
+
: '')
|
|
1288
1296
|
: '-';
|
|
1289
1297
|
lines.push(`| \`${escapeTableCell(name)}\` | ${gradeEmoji} ${grade} | ${stabilityStatus} | ${confidenceDisplay} | ${evolution.sampleCount} | ${escapeTableCell(issues)} |`);
|
|
1290
1298
|
}
|
|
1291
1299
|
lines.push('');
|
|
1292
1300
|
// Detailed breakdown for unstable tools
|
|
1293
|
-
const unstableTools = sortedTools.filter(t => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
|
|
1301
|
+
const unstableTools = sortedTools.filter((t) => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
|
|
1294
1302
|
if (unstableTools.length > 0) {
|
|
1295
1303
|
lines.push('<details>');
|
|
1296
1304
|
lines.push('<summary>Unstable Schema Details</summary>');
|
|
@@ -1327,12 +1335,18 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1327
1335
|
*/
|
|
1328
1336
|
function getGradeEmoji(grade) {
|
|
1329
1337
|
switch (grade) {
|
|
1330
|
-
case 'A':
|
|
1331
|
-
|
|
1332
|
-
case '
|
|
1333
|
-
|
|
1334
|
-
case '
|
|
1335
|
-
|
|
1338
|
+
case 'A':
|
|
1339
|
+
return '🟢';
|
|
1340
|
+
case 'B':
|
|
1341
|
+
return '🟢';
|
|
1342
|
+
case 'C':
|
|
1343
|
+
return '🟡';
|
|
1344
|
+
case 'D':
|
|
1345
|
+
return '🟠';
|
|
1346
|
+
case 'F':
|
|
1347
|
+
return '🔴';
|
|
1348
|
+
case 'N/A':
|
|
1349
|
+
return '⚪';
|
|
1336
1350
|
}
|
|
1337
1351
|
}
|
|
1338
1352
|
/**
|
|
@@ -1360,7 +1374,8 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1360
1374
|
const allCategories = new Set();
|
|
1361
1375
|
const transientCount = toolsWithErrors.reduce((sum, t) => sum + t.summary.transientErrors, 0);
|
|
1362
1376
|
for (const { summary } of toolsWithErrors) {
|
|
1363
|
-
|
|
1377
|
+
const counts = normalizeCategoryCounts(summary.categoryCounts);
|
|
1378
|
+
for (const cat of counts.keys()) {
|
|
1364
1379
|
allCategories.add(cat);
|
|
1365
1380
|
}
|
|
1366
1381
|
}
|
|
@@ -1374,7 +1389,8 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1374
1389
|
// Overall error breakdown by category
|
|
1375
1390
|
const globalCategoryCounts = new Map();
|
|
1376
1391
|
for (const { summary } of toolsWithErrors) {
|
|
1377
|
-
|
|
1392
|
+
const counts = normalizeCategoryCounts(summary.categoryCounts);
|
|
1393
|
+
for (const [cat, count] of counts) {
|
|
1378
1394
|
globalCategoryCounts.set(cat, (globalCategoryCounts.get(cat) ?? 0) + count);
|
|
1379
1395
|
}
|
|
1380
1396
|
}
|
|
@@ -1384,10 +1400,10 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1384
1400
|
lines.push('| Category | Count | Description |');
|
|
1385
1401
|
lines.push('|----------|-------|-------------|');
|
|
1386
1402
|
// Sort by count descending
|
|
1387
|
-
const sortedCategories = Array.from(globalCategoryCounts.entries())
|
|
1388
|
-
.sort((a, b) => b[1] - a[1]);
|
|
1403
|
+
const sortedCategories = Array.from(globalCategoryCounts.entries()).sort((a, b) => b[1] - a[1]);
|
|
1389
1404
|
for (const [category, count] of sortedCategories) {
|
|
1390
|
-
const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ??
|
|
1405
|
+
const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ??
|
|
1406
|
+
category;
|
|
1391
1407
|
const emoji = getCategoryEmoji(category);
|
|
1392
1408
|
lines.push(`| ${emoji} ${label} | ${count} | ${escapeTableCell(formatCategoryDescription(category))} |`);
|
|
1393
1409
|
}
|
|
@@ -1406,14 +1422,12 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1406
1422
|
? (ERROR_ANALYSIS.CATEGORY_LABELS[topCategory] ?? topCategory)
|
|
1407
1423
|
: '-';
|
|
1408
1424
|
const topRemediation = summary.topRemediations[0] ?? '-';
|
|
1409
|
-
const truncatedRemediation = topRemediation.length > 50
|
|
1410
|
-
? `${topRemediation.slice(0, 47)}...`
|
|
1411
|
-
: topRemediation;
|
|
1425
|
+
const truncatedRemediation = topRemediation.length > 50 ? `${topRemediation.slice(0, 47)}...` : topRemediation;
|
|
1412
1426
|
lines.push(`| \`${escapeTableCell(name)}\` | ${summary.totalErrors} | ${summary.transientErrors} | ${topCategoryLabel} | ${escapeTableCell(truncatedRemediation)} |`);
|
|
1413
1427
|
}
|
|
1414
1428
|
lines.push('');
|
|
1415
1429
|
// Detailed remediation suggestions (collapsed)
|
|
1416
|
-
const toolsWithRemediations = sortedTools.filter(t => t.summary.topRemediations.length > 0);
|
|
1430
|
+
const toolsWithRemediations = sortedTools.filter((t) => t.summary.topRemediations.length > 0);
|
|
1417
1431
|
if (toolsWithRemediations.length > 0) {
|
|
1418
1432
|
lines.push('<details>');
|
|
1419
1433
|
lines.push('<summary>Remediation Suggestions</summary>');
|
|
@@ -1463,13 +1477,20 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1463
1477
|
*/
|
|
1464
1478
|
function getCategoryEmoji(category) {
|
|
1465
1479
|
switch (category) {
|
|
1466
|
-
case 'client_error_validation':
|
|
1467
|
-
|
|
1468
|
-
case '
|
|
1469
|
-
|
|
1470
|
-
case '
|
|
1471
|
-
|
|
1472
|
-
|
|
1480
|
+
case 'client_error_validation':
|
|
1481
|
+
return '⚠️';
|
|
1482
|
+
case 'client_error_auth':
|
|
1483
|
+
return '🔐';
|
|
1484
|
+
case 'client_error_not_found':
|
|
1485
|
+
return '🔍';
|
|
1486
|
+
case 'client_error_conflict':
|
|
1487
|
+
return '💥';
|
|
1488
|
+
case 'client_error_rate_limit':
|
|
1489
|
+
return '⏱️';
|
|
1490
|
+
case 'server_error':
|
|
1491
|
+
return '🔥';
|
|
1492
|
+
default:
|
|
1493
|
+
return '❓';
|
|
1473
1494
|
}
|
|
1474
1495
|
}
|
|
1475
1496
|
/**
|
|
@@ -1493,13 +1514,30 @@ function formatCategoryDescription(category) {
|
|
|
1493
1514
|
return 'Unknown error category';
|
|
1494
1515
|
}
|
|
1495
1516
|
}
|
|
1517
|
+
/**
|
|
1518
|
+
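* Normalize category counts to a Map: Maps pass through unchanged, objects are converted keeping only numeric values, and anything else yields an empty Map.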
|
|
1519
|
+
*/
|
|
1520
|
+
function normalizeCategoryCounts(counts) {
|
|
1521
|
+
if (!counts) {
|
|
1522
|
+
return new Map();
|
|
1523
|
+
}
|
|
1524
|
+
if (counts instanceof Map) {
|
|
1525
|
+
return counts;
|
|
1526
|
+
}
|
|
1527
|
+
if (typeof counts !== 'object') {
|
|
1528
|
+
return new Map();
|
|
1529
|
+
}
|
|
1530
|
+
const entries = Object.entries(counts).filter((entry) => typeof entry[1] === 'number');
|
|
1531
|
+
return new Map(entries);
|
|
1532
|
+
}
|
|
1496
1533
|
/**
|
|
1497
1534
|
* Get the top category from a category counts map.
|
|
1498
1535
|
*/
|
|
1499
1536
|
function getTopCategory(counts) {
|
|
1537
|
+
const normalized = normalizeCategoryCounts(counts);
|
|
1500
1538
|
let topCategory;
|
|
1501
1539
|
let topCount = 0;
|
|
1502
|
-
for (const [category, count] of
|
|
1540
|
+
for (const [category, count] of normalized) {
|
|
1503
1541
|
if (count > topCount) {
|
|
1504
1542
|
topCount = count;
|
|
1505
1543
|
topCategory = category;
|
|
@@ -1601,7 +1639,10 @@ function formatIssueTypeLabel(type) {
|
|
|
1601
1639
|
case 'no_examples':
|
|
1602
1640
|
return 'No Examples';
|
|
1603
1641
|
default:
|
|
1604
|
-
return type
|
|
1642
|
+
return type
|
|
1643
|
+
.split('_')
|
|
1644
|
+
.map((w) => w.charAt(0).toUpperCase() + w.slice(1))
|
|
1645
|
+
.join(' ');
|
|
1605
1646
|
}
|
|
1606
1647
|
}
|
|
1607
1648
|
/**
|
|
@@ -1617,10 +1658,10 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
|
|
|
1617
1658
|
return [];
|
|
1618
1659
|
}
|
|
1619
1660
|
// Find successful interactions
|
|
1620
|
-
const successful = profile.interactions.filter(i => {
|
|
1661
|
+
const successful = profile.interactions.filter((i) => {
|
|
1621
1662
|
if (i.error || i.response?.isError)
|
|
1622
1663
|
return false;
|
|
1623
|
-
const textContent = i.response?.content?.find(c => c.type === 'text');
|
|
1664
|
+
const textContent = i.response?.content?.find((c) => c.type === 'text');
|
|
1624
1665
|
if (textContent && 'text' in textContent) {
|
|
1625
1666
|
if (looksLikeError(String(textContent.text)))
|
|
1626
1667
|
return false;
|
|
@@ -1640,7 +1681,7 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
|
|
|
1640
1681
|
if (seenArgsHashes.has(argsHash))
|
|
1641
1682
|
continue;
|
|
1642
1683
|
seenArgsHashes.add(argsHash);
|
|
1643
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1684
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1644
1685
|
if (!textContent || !('text' in textContent))
|
|
1645
1686
|
continue;
|
|
1646
1687
|
const responseText = String(textContent.text);
|
|
@@ -1696,7 +1737,7 @@ function generateToolErrorPatterns(profile) {
|
|
|
1696
1737
|
continue;
|
|
1697
1738
|
}
|
|
1698
1739
|
const errorText = interaction.error || '';
|
|
1699
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1740
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1700
1741
|
const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
|
|
1701
1742
|
const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
|
|
1702
1743
|
if (!isError)
|
|
@@ -1706,7 +1747,8 @@ function generateToolErrorPatterns(profile) {
|
|
|
1706
1747
|
continue;
|
|
1707
1748
|
const category = categorizeError(errorContent);
|
|
1708
1749
|
const existing = errorCategories.get(category) || [];
|
|
1709
|
-
if (existing.length < 2) {
|
|
1750
|
+
if (existing.length < 2) {
|
|
1751
|
+
// Max 2 examples per category
|
|
1710
1752
|
const truncated = errorContent.length > 100 ? `${errorContent.slice(0, 97)}...` : errorContent;
|
|
1711
1753
|
existing.push(truncated);
|
|
1712
1754
|
}
|
|
@@ -1758,7 +1800,7 @@ function generateErrorSummarySection(profiles) {
|
|
|
1758
1800
|
continue;
|
|
1759
1801
|
}
|
|
1760
1802
|
const errorText = interaction.error || '';
|
|
1761
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1803
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1762
1804
|
const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
|
|
1763
1805
|
const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
|
|
1764
1806
|
if (!isError)
|
|
@@ -1771,7 +1813,8 @@ function generateErrorSummarySection(profiles) {
|
|
|
1771
1813
|
existing.count++;
|
|
1772
1814
|
existing.tools.add(profile.name);
|
|
1773
1815
|
if (!existing.example) {
|
|
1774
|
-
existing.example =
|
|
1816
|
+
existing.example =
|
|
1817
|
+
errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
|
|
1775
1818
|
}
|
|
1776
1819
|
categoryCounts.set(category, existing);
|
|
1777
1820
|
}
|
|
@@ -1786,7 +1829,10 @@ function generateErrorSummarySection(profiles) {
|
|
|
1786
1829
|
lines.push('| Category | Count | Affected Tools |');
|
|
1787
1830
|
lines.push('|----------|-------|----------------|');
|
|
1788
1831
|
for (const [category, data] of categoryCounts) {
|
|
1789
|
-
const toolList = Array.from(data.tools)
|
|
1832
|
+
const toolList = Array.from(data.tools)
|
|
1833
|
+
.slice(0, 3)
|
|
1834
|
+
.map((t) => `\`${t}\``)
|
|
1835
|
+
.join(', ');
|
|
1790
1836
|
const more = data.tools.size > 3 ? ` +${data.tools.size - 3} more` : '';
|
|
1791
1837
|
lines.push(`| ${category} | ${data.count} | ${toolList}${more} |`);
|
|
1792
1838
|
}
|
|
@@ -1813,7 +1859,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
|
|
|
1813
1859
|
continue;
|
|
1814
1860
|
}
|
|
1815
1861
|
const errorText = interaction.error || '';
|
|
1816
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1862
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1817
1863
|
const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
|
|
1818
1864
|
const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
|
|
1819
1865
|
if (!isError)
|
|
@@ -1843,7 +1889,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
|
|
|
1843
1889
|
});
|
|
1844
1890
|
}
|
|
1845
1891
|
if (patterns.length > 0) {
|
|
1846
|
-
const tool = tools.find(t => t.name === profile.name);
|
|
1892
|
+
const tool = tools.find((t) => t.name === profile.name);
|
|
1847
1893
|
errorInputs.push({
|
|
1848
1894
|
toolName: profile.name,
|
|
1849
1895
|
toolDescription: tool?.description,
|
|
@@ -1962,7 +2008,9 @@ function collectAssertionFailures(profile) {
|
|
|
1962
2008
|
for (const result of interaction.assertionResults ?? []) {
|
|
1963
2009
|
if (result.passed)
|
|
1964
2010
|
continue;
|
|
1965
|
-
const message = result.message
|
|
2011
|
+
const message = result.message
|
|
2012
|
+
? `${result.type}: ${result.message}`
|
|
2013
|
+
: `${result.type} failed`;
|
|
1966
2014
|
failures.add(message);
|
|
1967
2015
|
}
|
|
1968
2016
|
}
|
package/dist/docs/report.js
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
import { Ajv2020 as Ajv } from 'ajv/dist/2020.js';
|
|
2
2
|
import { readFileSync } from 'fs';
|
|
3
|
+
import { isAbsolute } from 'path';
|
|
3
4
|
import { fileURLToPath } from 'url';
|
|
4
5
|
import { REPORT_SCHEMAS } from '../constants.js';
|
|
5
6
|
/**
|
|
6
7
|
* Generate a JSON report of the interview.
|
|
7
8
|
*/
|
|
8
9
|
export function generateJsonReport(result, options = {}) {
|
|
9
|
-
const report = options.schemaUrl
|
|
10
|
-
? { $schema: options.schemaUrl, ...result }
|
|
11
|
-
: { ...result };
|
|
10
|
+
const report = options.schemaUrl ? { $schema: options.schemaUrl, ...result } : { ...result };
|
|
12
11
|
const jsonReadyReport = JSON.parse(JSON.stringify(report));
|
|
13
12
|
if (options.validate) {
|
|
14
13
|
const schemaPath = resolveSchemaPath(options.schemaPath);
|
|
@@ -18,7 +17,11 @@ export function generateJsonReport(result, options = {}) {
|
|
|
18
17
|
}
|
|
19
18
|
function resolveSchemaPath(schemaPath) {
|
|
20
19
|
if (schemaPath) {
|
|
21
|
-
|
|
20
|
+
if (isAbsolute(schemaPath)) {
|
|
21
|
+
return schemaPath;
|
|
22
|
+
}
|
|
23
|
+
const base = new URL('../../', import.meta.url);
|
|
24
|
+
return fileURLToPath(new URL(schemaPath, base));
|
|
22
25
|
}
|
|
23
26
|
const url = new URL(`../../${REPORT_SCHEMAS.CHECK_REPORT_SCHEMA_FILE}`, import.meta.url);
|
|
24
27
|
return fileURLToPath(url);
|
|
@@ -38,7 +41,7 @@ function validateReportAgainstSchema(report, schemaPath) {
|
|
|
38
41
|
const validate = ajv.compile(schema);
|
|
39
42
|
if (!validate(report)) {
|
|
40
43
|
const errorText = ajv.errorsText(validate.errors, { separator: '\n' });
|
|
41
|
-
throw new Error(`
|
|
44
|
+
throw new Error(`Report schema validation failed:\n${errorText}`);
|
|
42
45
|
}
|
|
43
46
|
}
|
|
44
47
|
//# sourceMappingURL=report.js.map
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { InterviewResult } from './types.js';
|
|
2
|
+
import type { SemanticInference } from '../validation/semantic-types.js';
|
|
3
|
+
import type { ResponseSchemaEvolution } from '../baseline/response-schema-tracker.js';
|
|
4
|
+
import type { ErrorAnalysisSummary } from '../baseline/error-analyzer.js';
|
|
5
|
+
import type { DocumentationScore } from '../baseline/documentation-scorer.js';
|
|
6
|
+
export interface InterviewInsights {
|
|
7
|
+
semanticInferences?: Record<string, SemanticInference[]>;
|
|
8
|
+
schemaEvolution?: Record<string, ResponseSchemaEvolution>;
|
|
9
|
+
errorAnalysisSummaries?: Record<string, ErrorAnalysisSummary>;
|
|
10
|
+
documentationScore?: DocumentationScore;
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Build derived insights from an interview result.
|
|
14
|
+
* These insights are used for documentation and JSON report enrichment.
|
|
15
|
+
*/
|
|
16
|
+
export declare function buildInterviewInsights(result: InterviewResult): InterviewInsights;
|
|
17
|
+
//# sourceMappingURL=insights.d.ts.map
|