@dotsetlabs/bellwether 1.0.3 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +118 -0
- package/README.md +17 -2
- package/dist/auth/credentials.js +2 -0
- package/dist/baseline/accessors.d.ts +1 -1
- package/dist/baseline/accessors.js +13 -3
- package/dist/baseline/baseline-format.d.ts +335 -0
- package/dist/baseline/baseline-format.js +12 -0
- package/dist/baseline/comparator.js +494 -13
- package/dist/baseline/converter.d.ts +15 -15
- package/dist/baseline/converter.js +97 -37
- package/dist/baseline/diff.d.ts +1 -1
- package/dist/baseline/diff.js +45 -28
- package/dist/baseline/error-analyzer.d.ts +1 -1
- package/dist/baseline/error-analyzer.js +90 -17
- package/dist/baseline/incremental-checker.js +8 -5
- package/dist/baseline/index.d.ts +2 -12
- package/dist/baseline/index.js +3 -23
- package/dist/baseline/performance-tracker.d.ts +0 -1
- package/dist/baseline/performance-tracker.js +13 -20
- package/dist/baseline/response-fingerprint.js +40 -3
- package/dist/baseline/saver.js +75 -10
- package/dist/baseline/schema-compare.d.ts +22 -0
- package/dist/baseline/schema-compare.js +259 -16
- package/dist/baseline/types.d.ts +30 -7
- package/dist/cache/response-cache.d.ts +8 -0
- package/dist/cache/response-cache.js +119 -2
- package/dist/cli/commands/baseline.js +70 -35
- package/dist/cli/commands/check.js +71 -15
- package/dist/cli/commands/explore.js +69 -16
- package/dist/cli/commands/init.js +10 -7
- package/dist/cli/commands/watch.js +5 -5
- package/dist/cli/index.js +8 -0
- package/dist/config/loader.js +2 -2
- package/dist/config/template.js +8 -7
- package/dist/config/validator.d.ts +59 -59
- package/dist/config/validator.js +245 -90
- package/dist/constants/core.d.ts +5 -1
- package/dist/constants/core.js +9 -20
- package/dist/constants/registry.d.ts +17 -0
- package/dist/constants/registry.js +18 -0
- package/dist/constants/testing.d.ts +0 -369
- package/dist/constants/testing.js +18 -456
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/discovery/discovery.js +88 -14
- package/dist/discovery/types.d.ts +5 -1
- package/dist/docs/agents.js +138 -50
- package/dist/docs/contract.js +194 -84
- package/dist/docs/report.js +8 -5
- package/dist/errors/retry.js +11 -5
- package/dist/interview/insights.d.ts +17 -0
- package/dist/interview/insights.js +52 -0
- package/dist/interview/interviewer.js +52 -10
- package/dist/interview/prompt-test-generator.d.ts +12 -0
- package/dist/interview/prompt-test-generator.js +77 -0
- package/dist/interview/rate-limiter.js +7 -3
- package/dist/interview/resource-test-generator.d.ts +12 -0
- package/dist/interview/resource-test-generator.js +20 -0
- package/dist/interview/schema-inferrer.js +26 -4
- package/dist/interview/schema-test-generator.js +278 -31
- package/dist/interview/stateful-test-runner.d.ts +3 -0
- package/dist/interview/stateful-test-runner.js +80 -0
- package/dist/interview/types.d.ts +12 -0
- package/dist/llm/anthropic.js +14 -4
- package/dist/llm/fallback.d.ts +1 -0
- package/dist/llm/fallback.js +7 -1
- package/dist/llm/openai.js +15 -4
- package/dist/protocol/index.d.ts +2 -0
- package/dist/protocol/index.js +2 -0
- package/dist/protocol/version-registry.d.ts +66 -0
- package/dist/protocol/version-registry.js +159 -0
- package/dist/transport/http-transport.d.ts +11 -1
- package/dist/transport/http-transport.js +21 -2
- package/dist/transport/mcp-client.d.ts +29 -1
- package/dist/transport/mcp-client.js +93 -8
- package/dist/transport/sse-transport.d.ts +7 -3
- package/dist/transport/sse-transport.js +162 -71
- package/dist/transport/types.d.ts +134 -1
- package/dist/utils/concurrency.d.ts +2 -0
- package/dist/utils/concurrency.js +9 -2
- package/dist/utils/markdown.js +13 -18
- package/dist/utils/timeout.js +2 -1
- package/dist/version.js +1 -1
- package/man/bellwether.1 +1 -1
- package/man/bellwether.1.md +2 -2
- package/package.json +1 -1
- package/schemas/bellwether-check.schema.json +185 -0
- package/schemas/bellwether-explore.schema.json +837 -0
- package/scripts/completions/bellwether.bash +10 -4
- package/scripts/completions/bellwether.zsh +55 -2
package/dist/docs/contract.js
CHANGED
|
@@ -5,7 +5,8 @@ import { formatDateISO, formatDuration, escapeTableCell, mermaidLabel, validateJ
|
|
|
5
5
|
import { smartTruncate, getExampleLength } from '../utils/smart-truncate.js';
|
|
6
6
|
import { calculatePerformanceMetrics, extractParameters, looksLikeError } from './shared.js';
|
|
7
7
|
import { analyzeExternalDependencies, formatExternalDependenciesMarkdown, } from '../baseline/external-dependency-detector.js';
|
|
8
|
-
import { SEMANTIC_VALIDATION, SCHEMA_EVOLUTION, ERROR_ANALYSIS, PERFORMANCE_CONFIDENCE, DOCUMENTATION_SCORING, EXAMPLE_OUTPUT, EXTERNAL_DEPENDENCIES, RELIABILITY_DISPLAY, CONFIDENCE_INDICATORS, DISPLAY_LIMITS, ISSUE_CLASSIFICATION, } from '../constants.js';
|
|
8
|
+
import { SEMANTIC_VALIDATION, SCHEMA_EVOLUTION, ERROR_ANALYSIS, PERFORMANCE_CONFIDENCE, DOCUMENTATION_SCORING, EXAMPLE_OUTPUT, EXTERNAL_DEPENDENCIES, RELIABILITY_DISPLAY, CONFIDENCE_INDICATORS, DISPLAY_LIMITS, ISSUE_CLASSIFICATION, MCP, } from '../constants.js';
|
|
9
|
+
import { getFeatureFlags } from '../protocol/index.js';
|
|
9
10
|
/**
|
|
10
11
|
* Classify issues by their source to help users understand which issues
|
|
11
12
|
* are actual bugs vs expected behavior or environment issues.
|
|
@@ -52,13 +53,15 @@ function classifyIssuesBySource(profiles) {
|
|
|
52
53
|
// but tool didn't actually reject - this shouldn't happen with outcomeAssessment.correct check above
|
|
53
54
|
// so we classify based on expected outcome and error classification
|
|
54
55
|
// 1. Check for external dependency errors (highest priority for classification)
|
|
55
|
-
if (errorClassification &&
|
|
56
|
+
if (errorClassification &&
|
|
57
|
+
errorClassification.externalServiceErrors > 0 &&
|
|
58
|
+
detectedServices.length > 0) {
|
|
56
59
|
// Check if the error message matches known external service patterns
|
|
57
|
-
const isExternalError = detectedServices.some(service => {
|
|
60
|
+
const isExternalError = detectedServices.some((service) => {
|
|
58
61
|
const serviceConfig = EXTERNAL_DEPENDENCIES.SERVICES[service];
|
|
59
62
|
if (!serviceConfig)
|
|
60
63
|
return false;
|
|
61
|
-
return serviceConfig.errorPatterns.some(pattern => pattern.test(errorMsg));
|
|
64
|
+
return serviceConfig.errorPatterns.some((pattern) => pattern.test(errorMsg));
|
|
62
65
|
});
|
|
63
66
|
if (isExternalError) {
|
|
64
67
|
issue.service = detectedServices[0];
|
|
@@ -68,7 +71,7 @@ function classifyIssuesBySource(profiles) {
|
|
|
68
71
|
}
|
|
69
72
|
// 2. Check for environment configuration errors
|
|
70
73
|
if (errorClassification && errorClassification.environmentErrors > 0) {
|
|
71
|
-
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
|
|
74
|
+
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
|
|
72
75
|
if (isEnvironmentError) {
|
|
73
76
|
result.environment.push(issue);
|
|
74
77
|
continue;
|
|
@@ -99,7 +102,7 @@ function classifyIssuesBySource(profiles) {
|
|
|
99
102
|
continue;
|
|
100
103
|
}
|
|
101
104
|
// Check if error message indicates environment issue
|
|
102
|
-
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some(pattern => pattern.test(errorMsg));
|
|
105
|
+
const isEnvironmentError = EXTERNAL_DEPENDENCIES.ENVIRONMENT_PATTERNS.some((pattern) => pattern.test(errorMsg));
|
|
103
106
|
if (isEnvironmentError) {
|
|
104
107
|
result.environment.push(issue);
|
|
105
108
|
continue;
|
|
@@ -145,11 +148,20 @@ export function generateContractMd(result, options) {
|
|
|
145
148
|
// Overview
|
|
146
149
|
lines.push('## Overview');
|
|
147
150
|
lines.push('');
|
|
151
|
+
const features = getFeatureFlags(discovery.protocolVersion);
|
|
148
152
|
lines.push(`**Server Version:** ${discovery.serverInfo.version}`);
|
|
149
153
|
lines.push(`**Protocol Version:** ${discovery.protocolVersion}`);
|
|
154
|
+
if (discovery.protocolVersion !== MCP.PROTOCOL_VERSION) {
|
|
155
|
+
lines.push(`*(Server protocol; bellwether supports up to ${MCP.PROTOCOL_VERSION})*`);
|
|
156
|
+
}
|
|
150
157
|
lines.push('');
|
|
151
158
|
const performanceMetrics = calculatePerformanceMetrics(toolProfiles);
|
|
152
|
-
const performanceByTool = new Map(performanceMetrics.map(metric => [metric.toolName, metric]));
|
|
159
|
+
const performanceByTool = new Map(performanceMetrics.map((metric) => [metric.toolName, metric]));
|
|
160
|
+
// Server instructions
|
|
161
|
+
if (discovery.instructions) {
|
|
162
|
+
lines.push(`**Server Instructions:** ${discovery.instructions}`);
|
|
163
|
+
lines.push('');
|
|
164
|
+
}
|
|
153
165
|
// Capabilities summary
|
|
154
166
|
lines.push('## Capabilities');
|
|
155
167
|
lines.push('');
|
|
@@ -162,6 +174,15 @@ export function generateContractMd(result, options) {
|
|
|
162
174
|
if (discovery.capabilities.resources) {
|
|
163
175
|
lines.push(`- **Resources:** ${(discovery.resources ?? []).length} available`);
|
|
164
176
|
}
|
|
177
|
+
if (discovery.resourceTemplates && discovery.resourceTemplates.length > 0) {
|
|
178
|
+
lines.push(`- **Resource Templates:** ${discovery.resourceTemplates.length} available`);
|
|
179
|
+
}
|
|
180
|
+
if (discovery.capabilities.completions && features.completions) {
|
|
181
|
+
lines.push('- **Completions:** Supported');
|
|
182
|
+
}
|
|
183
|
+
if (discovery.capabilities.tasks && features.tasks) {
|
|
184
|
+
lines.push('- **Tasks:** Supported');
|
|
185
|
+
}
|
|
165
186
|
if (discovery.capabilities.logging) {
|
|
166
187
|
lines.push('- **Logging:** Supported');
|
|
167
188
|
}
|
|
@@ -176,7 +197,7 @@ export function generateContractMd(result, options) {
|
|
|
176
197
|
const params = extractParameters(tool.inputSchema);
|
|
177
198
|
const desc = tool.description?.substring(0, 50) || 'No description';
|
|
178
199
|
const descDisplay = tool.description && tool.description.length > 50 ? `${desc}...` : desc;
|
|
179
|
-
const profile = toolProfiles.find(p => p.name === tool.name);
|
|
200
|
+
const profile = toolProfiles.find((p) => p.name === tool.name);
|
|
180
201
|
const perf = performanceByTool.get(tool.name);
|
|
181
202
|
const successRate = calculateToolSuccessRate(profile, {
|
|
182
203
|
countValidationAsSuccess,
|
|
@@ -291,7 +312,7 @@ export function generateContractMd(result, options) {
|
|
|
291
312
|
lines.push('## Tools');
|
|
292
313
|
lines.push('');
|
|
293
314
|
for (const tool of discovery.tools) {
|
|
294
|
-
const profile = toolProfiles.find(p => p.name === tool.name);
|
|
315
|
+
const profile = toolProfiles.find((p) => p.name === tool.name);
|
|
295
316
|
lines.push(`### ${tool.name}`);
|
|
296
317
|
lines.push('');
|
|
297
318
|
lines.push(tool.description || 'No description available.');
|
|
@@ -319,6 +340,22 @@ export function generateContractMd(result, options) {
|
|
|
319
340
|
lines.push('');
|
|
320
341
|
}
|
|
321
342
|
}
|
|
343
|
+
// Show tool annotations (behavioral hints) — version-gated
|
|
344
|
+
if (features.toolAnnotations && tool.annotations) {
|
|
345
|
+
const hints = [];
|
|
346
|
+
if (tool.annotations.readOnlyHint)
|
|
347
|
+
hints.push('read-only');
|
|
348
|
+
if (tool.annotations.destructiveHint)
|
|
349
|
+
hints.push('destructive');
|
|
350
|
+
if (tool.annotations.idempotentHint)
|
|
351
|
+
hints.push('idempotent');
|
|
352
|
+
if (tool.annotations.openWorldHint)
|
|
353
|
+
hints.push('open-world');
|
|
354
|
+
if (hints.length > 0) {
|
|
355
|
+
lines.push(`**Behavioral Hints:** ${hints.join(', ')}`);
|
|
356
|
+
lines.push('');
|
|
357
|
+
}
|
|
358
|
+
}
|
|
322
359
|
if (tool.inputSchema) {
|
|
323
360
|
lines.push('**Input Schema:**');
|
|
324
361
|
const schemaJson = validateJsonForCodeBlock(tool.inputSchema);
|
|
@@ -327,6 +364,15 @@ export function generateContractMd(result, options) {
|
|
|
327
364
|
lines.push('```');
|
|
328
365
|
lines.push('');
|
|
329
366
|
}
|
|
367
|
+
// Show output schema if present — version-gated
|
|
368
|
+
if (features.structuredOutput && tool.outputSchema) {
|
|
369
|
+
lines.push('**Output Schema:**');
|
|
370
|
+
const outputSchemaJson = validateJsonForCodeBlock(tool.outputSchema);
|
|
371
|
+
lines.push('```json');
|
|
372
|
+
lines.push(outputSchemaJson.content);
|
|
373
|
+
lines.push('```');
|
|
374
|
+
lines.push('');
|
|
375
|
+
}
|
|
330
376
|
// Add example usage from successful interactions
|
|
331
377
|
const examples = generateToolExamples(profile, maxExamplesPerTool, exampleLength);
|
|
332
378
|
if (examples.length > 0) {
|
|
@@ -378,6 +424,24 @@ export function generateContractMd(result, options) {
|
|
|
378
424
|
}
|
|
379
425
|
}
|
|
380
426
|
}
|
|
427
|
+
// Resource Templates section
|
|
428
|
+
if (discovery.resourceTemplates && discovery.resourceTemplates.length > 0) {
|
|
429
|
+
lines.push('## Resource Templates');
|
|
430
|
+
lines.push('');
|
|
431
|
+
for (const template of discovery.resourceTemplates) {
|
|
432
|
+
lines.push(`### ${template.name}`);
|
|
433
|
+
lines.push('');
|
|
434
|
+
lines.push(`**URI Template:** \`${template.uriTemplate}\``);
|
|
435
|
+
if (template.mimeType) {
|
|
436
|
+
lines.push(`**MIME Type:** ${template.mimeType}`);
|
|
437
|
+
}
|
|
438
|
+
lines.push('');
|
|
439
|
+
if (template.description) {
|
|
440
|
+
lines.push(template.description);
|
|
441
|
+
lines.push('');
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
}
|
|
381
445
|
// Error Summary section
|
|
382
446
|
const errorSummary = generateErrorSummarySection(toolProfiles);
|
|
383
447
|
if (errorSummary.length > 0) {
|
|
@@ -397,7 +461,7 @@ function calculateReliabilityMetrics(profile, options) {
|
|
|
397
461
|
if (!profile) {
|
|
398
462
|
return null;
|
|
399
463
|
}
|
|
400
|
-
const interactions = profile.interactions.filter(i => !i.mocked);
|
|
464
|
+
const interactions = profile.interactions.filter((i) => !i.mocked);
|
|
401
465
|
if (interactions.length === 0) {
|
|
402
466
|
return null;
|
|
403
467
|
}
|
|
@@ -408,7 +472,7 @@ function calculateReliabilityMetrics(profile, options) {
|
|
|
408
472
|
for (const interaction of interactions) {
|
|
409
473
|
const expected = interaction.question.expectedOutcome ?? 'success';
|
|
410
474
|
const hasError = interaction.error || interaction.response?.isError;
|
|
411
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
475
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
412
476
|
const hasErrorText = textContent && 'text' in textContent && looksLikeError(String(textContent.text));
|
|
413
477
|
const gotError = hasError || hasErrorText;
|
|
414
478
|
if (expected === 'error') {
|
|
@@ -437,7 +501,9 @@ function calculateReliabilityMetrics(profile, options) {
|
|
|
437
501
|
const reliabilityRate = total > 0 ? (correctOutcomes / total) * 100 : 0;
|
|
438
502
|
const happyPathRate = happyPathTotal > 0 ? (happyPathSuccesses / happyPathTotal) * 100 : 100;
|
|
439
503
|
const validationRate = options.separateValidationMetrics
|
|
440
|
-
?
|
|
504
|
+
? validationTotal > 0
|
|
505
|
+
? (validationSuccesses / validationTotal) * 100
|
|
506
|
+
: 100
|
|
441
507
|
: 100;
|
|
442
508
|
return {
|
|
443
509
|
total,
|
|
@@ -481,8 +547,7 @@ function formatConfidenceIndicator(level) {
|
|
|
481
547
|
function generateTransportIssuesSection(transportErrors, warnings) {
|
|
482
548
|
const lines = [];
|
|
483
549
|
// Skip if no transport issues to report
|
|
484
|
-
if ((!transportErrors || transportErrors.length === 0) &&
|
|
485
|
-
(!warnings || warnings.length === 0)) {
|
|
550
|
+
if ((!transportErrors || transportErrors.length === 0) && (!warnings || warnings.length === 0)) {
|
|
486
551
|
return lines;
|
|
487
552
|
}
|
|
488
553
|
lines.push('## Transport Issues');
|
|
@@ -507,8 +572,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
|
|
|
507
572
|
lines.push('The following transport-level errors were detected during server communication:');
|
|
508
573
|
lines.push('');
|
|
509
574
|
// Categorize errors
|
|
510
|
-
const serverBugErrors = transportErrors.filter(e => e.likelyServerBug);
|
|
511
|
-
const envErrors = transportErrors.filter(e => !e.likelyServerBug);
|
|
575
|
+
const serverBugErrors = transportErrors.filter((e) => e.likelyServerBug);
|
|
576
|
+
const envErrors = transportErrors.filter((e) => !e.likelyServerBug);
|
|
512
577
|
// Server bugs (critical)
|
|
513
578
|
if (serverBugErrors.length > 0) {
|
|
514
579
|
lines.push('#### Likely Server Bugs');
|
|
@@ -548,8 +613,8 @@ function generateTransportIssuesSection(transportErrors, warnings) {
|
|
|
548
613
|
lines.push('');
|
|
549
614
|
}
|
|
550
615
|
// Recommendations
|
|
551
|
-
const hasInvalidJson = transportErrors.some(e => e.category === 'invalid_json');
|
|
552
|
-
const hasProtocolError = transportErrors.some(e => e.category === 'protocol_violation');
|
|
616
|
+
const hasInvalidJson = transportErrors.some((e) => e.category === 'invalid_json');
|
|
617
|
+
const hasProtocolError = transportErrors.some((e) => e.category === 'protocol_violation');
|
|
553
618
|
if (hasInvalidJson || hasProtocolError) {
|
|
554
619
|
lines.push('### Recommendations');
|
|
555
620
|
lines.push('');
|
|
@@ -607,7 +672,7 @@ function generateMetricsLegendSection() {
|
|
|
607
672
|
}
|
|
608
673
|
function generateValidationTestingSection(profiles) {
|
|
609
674
|
const lines = [];
|
|
610
|
-
const validationSummary = profiles.map(profile => {
|
|
675
|
+
const validationSummary = profiles.map((profile) => {
|
|
611
676
|
const buckets = {
|
|
612
677
|
input: summarizeValidationBucket(profile, 'input'),
|
|
613
678
|
type: summarizeValidationBucket(profile, 'type'),
|
|
@@ -615,7 +680,7 @@ function generateValidationTestingSection(profiles) {
|
|
|
615
680
|
};
|
|
616
681
|
return { profile, buckets };
|
|
617
682
|
});
|
|
618
|
-
const hasValidationTests = validationSummary.some(summary => Object.values(summary.buckets).some(bucket => bucket.total > 0));
|
|
683
|
+
const hasValidationTests = validationSummary.some((summary) => Object.values(summary.buckets).some((bucket) => bucket.total > 0));
|
|
619
684
|
if (!hasValidationTests) {
|
|
620
685
|
return lines;
|
|
621
686
|
}
|
|
@@ -668,8 +733,8 @@ function generateIssuesDetectedSection(profiles) {
|
|
|
668
733
|
lines.push(`### ${ISSUE_CLASSIFICATION.ICONS.serverBug} ${ISSUE_CLASSIFICATION.HEADERS.serverBug}`);
|
|
669
734
|
lines.push('');
|
|
670
735
|
// Separate critical (accepts invalid input) from other bugs
|
|
671
|
-
const criticalBugs = classified.serverBug.filter(i => i.critical);
|
|
672
|
-
const otherBugs = classified.serverBug.filter(i => !i.critical);
|
|
736
|
+
const criticalBugs = classified.serverBug.filter((i) => i.critical);
|
|
737
|
+
const otherBugs = classified.serverBug.filter((i) => !i.critical);
|
|
673
738
|
if (criticalBugs.length > 0) {
|
|
674
739
|
lines.push('**Critical - Accepts Invalid Input:**');
|
|
675
740
|
for (const issue of criticalBugs.slice(0, DISPLAY_LIMITS.ISSUES_DISPLAY_LIMIT)) {
|
|
@@ -809,7 +874,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
809
874
|
return [];
|
|
810
875
|
}
|
|
811
876
|
// Only show if we have meaningful data
|
|
812
|
-
const hasValidMetrics = metrics.some(m => m.callCount >= 2);
|
|
877
|
+
const hasValidMetrics = metrics.some((m) => m.callCount >= 2);
|
|
813
878
|
if (!hasValidMetrics) {
|
|
814
879
|
return [];
|
|
815
880
|
}
|
|
@@ -830,11 +895,11 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
830
895
|
}
|
|
831
896
|
lines.push('');
|
|
832
897
|
// Show low confidence warning if any tools have low confidence
|
|
833
|
-
const lowConfidenceTools = metrics.filter(m => m.confidence?.confidenceLevel === 'low');
|
|
898
|
+
const lowConfidenceTools = metrics.filter((m) => m.confidence?.confidenceLevel === 'low');
|
|
834
899
|
if (lowConfidenceTools.length > 0) {
|
|
835
900
|
// Categorize low confidence by reason
|
|
836
|
-
const lowSampleTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
|
|
837
|
-
const highVariabilityTools = lowConfidenceTools.filter(m => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
|
|
901
|
+
const lowSampleTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) < PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES);
|
|
902
|
+
const highVariabilityTools = lowConfidenceTools.filter((m) => (m.confidence?.successfulSamples ?? 0) >= PERFORMANCE_CONFIDENCE.HIGH.MIN_SAMPLES &&
|
|
838
903
|
(m.confidence?.coefficientOfVariation ?? 0) > PERFORMANCE_CONFIDENCE.MEDIUM.MAX_CV);
|
|
839
904
|
lines.push(`> **⚠️ Low Confidence**: ${lowConfidenceTools.length} tool(s) have low statistical confidence.`);
|
|
840
905
|
if (lowSampleTools.length > 0) {
|
|
@@ -847,7 +912,7 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
847
912
|
lines.push('');
|
|
848
913
|
}
|
|
849
914
|
// Add confidence summary section (collapsed)
|
|
850
|
-
const hasConfidenceData = metrics.some(m => m.confidence);
|
|
915
|
+
const hasConfidenceData = metrics.some((m) => m.confidence);
|
|
851
916
|
if (hasConfidenceData) {
|
|
852
917
|
lines.push('<details>');
|
|
853
918
|
lines.push('<summary>Confidence Metrics Details</summary>');
|
|
@@ -867,7 +932,9 @@ function generateContractPerformanceSection(profiles, metricsOverride) {
|
|
|
867
932
|
// In this case, display ~0% to indicate the variability is below measurement threshold
|
|
868
933
|
const rawCV = m.confidence.coefficientOfVariation * 100;
|
|
869
934
|
const cvDisplay = successfulSamples > 0
|
|
870
|
-
?
|
|
935
|
+
? roundedStdDev === 0 && rawCV > 1
|
|
936
|
+
? '~0%'
|
|
937
|
+
: `${rawCV.toFixed(1)}%`
|
|
871
938
|
: 'N/A';
|
|
872
939
|
const levelLabel = PERFORMANCE_CONFIDENCE.LABELS[m.confidence.confidenceLevel];
|
|
873
940
|
lines.push(`| \`${escapeTableCell(m.toolName)}\` | ${successfulSamples} | ${validationSamples} | ${totalTests} | ${stdDevDisplay} | ${cvDisplay} | ${levelLabel} |`);
|
|
@@ -931,11 +998,11 @@ function generateContractSecuritySection(fingerprints) {
|
|
|
931
998
|
lines.push(`| Average Risk Score | ${avgRiskScore}/100 |`);
|
|
932
999
|
// Count by severity
|
|
933
1000
|
const bySeverity = {
|
|
934
|
-
critical: allFindings.filter(f => f.riskLevel === 'critical').length,
|
|
935
|
-
high: allFindings.filter(f => f.riskLevel === 'high').length,
|
|
936
|
-
medium: allFindings.filter(f => f.riskLevel === 'medium').length,
|
|
937
|
-
low: allFindings.filter(f => f.riskLevel === 'low').length,
|
|
938
|
-
info: allFindings.filter(f => f.riskLevel === 'info').length,
|
|
1001
|
+
critical: allFindings.filter((f) => f.riskLevel === 'critical').length,
|
|
1002
|
+
high: allFindings.filter((f) => f.riskLevel === 'high').length,
|
|
1003
|
+
medium: allFindings.filter((f) => f.riskLevel === 'medium').length,
|
|
1004
|
+
low: allFindings.filter((f) => f.riskLevel === 'low').length,
|
|
1005
|
+
info: allFindings.filter((f) => f.riskLevel === 'info').length,
|
|
939
1006
|
};
|
|
940
1007
|
if (bySeverity.critical > 0) {
|
|
941
1008
|
lines.push(`| Critical Findings | ${bySeverity.critical} |`);
|
|
@@ -954,7 +1021,7 @@ function generateContractSecuritySection(fingerprints) {
|
|
|
954
1021
|
return lines;
|
|
955
1022
|
}
|
|
956
1023
|
// Show findings by severity
|
|
957
|
-
const criticalAndHigh = allFindings.filter(f => f.riskLevel === 'critical' || f.riskLevel === 'high');
|
|
1024
|
+
const criticalAndHigh = allFindings.filter((f) => f.riskLevel === 'critical' || f.riskLevel === 'high');
|
|
958
1025
|
if (criticalAndHigh.length > 0) {
|
|
959
1026
|
lines.push('### Critical and High Severity Findings');
|
|
960
1027
|
lines.push('');
|
|
@@ -987,7 +1054,7 @@ function generateContractSecuritySection(fingerprints) {
|
|
|
987
1054
|
lines.push('');
|
|
988
1055
|
}
|
|
989
1056
|
// Show medium/low findings in collapsed section
|
|
990
|
-
const mediumAndLow = allFindings.filter(f => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
|
|
1057
|
+
const mediumAndLow = allFindings.filter((f) => f.riskLevel === 'medium' || f.riskLevel === 'low' || f.riskLevel === 'info');
|
|
991
1058
|
if (mediumAndLow.length > 0) {
|
|
992
1059
|
lines.push('<details>');
|
|
993
1060
|
lines.push(`<summary>Medium/Low Severity Findings (${mediumAndLow.length})</summary>`);
|
|
@@ -1027,10 +1094,10 @@ function generateWorkflowTestingSection(results) {
|
|
|
1027
1094
|
if (results.length === 0) {
|
|
1028
1095
|
return [];
|
|
1029
1096
|
}
|
|
1030
|
-
const passed = results.filter(r => r.success).length;
|
|
1097
|
+
const passed = results.filter((r) => r.success).length;
|
|
1031
1098
|
const failed = results.length - passed;
|
|
1032
1099
|
const totalSteps = results.reduce((sum, r) => sum + r.workflow.steps.length, 0);
|
|
1033
|
-
const passedSteps = results.reduce((sum, r) => sum + r.steps.filter(s => s.success).length, 0);
|
|
1100
|
+
const passedSteps = results.reduce((sum, r) => sum + r.steps.filter((s) => s.success).length, 0);
|
|
1034
1101
|
const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
|
|
1035
1102
|
lines.push('## Workflow Testing');
|
|
1036
1103
|
lines.push('');
|
|
@@ -1053,7 +1120,7 @@ function generateWorkflowTestingSection(results) {
|
|
|
1053
1120
|
lines.push('|----------|--------|-------|----------|');
|
|
1054
1121
|
for (const result of results) {
|
|
1055
1122
|
const status = result.success ? '✓ Passed' : '✗ Failed';
|
|
1056
|
-
const stepsInfo = `${result.steps.filter(s => s.success).length}/${result.workflow.steps.length}`;
|
|
1123
|
+
const stepsInfo = `${result.steps.filter((s) => s.success).length}/${result.workflow.steps.length}`;
|
|
1057
1124
|
const duration = formatDuration(result.durationMs);
|
|
1058
1125
|
lines.push(`| ${escapeTableCell(result.workflow.name)} | ${status} | ${stepsInfo} | ${duration} |`);
|
|
1059
1126
|
}
|
|
@@ -1083,8 +1150,8 @@ function generateWorkflowTestingSection(results) {
|
|
|
1083
1150
|
if (stepResult.error) {
|
|
1084
1151
|
notes = escapeTableCell(truncateString(stepResult.error, 40));
|
|
1085
1152
|
}
|
|
1086
|
-
else if (stepResult.assertionResults?.some(a => !a.passed)) {
|
|
1087
|
-
const failedAssertions = stepResult.assertionResults.filter(a => !a.passed);
|
|
1153
|
+
else if (stepResult.assertionResults?.some((a) => !a.passed)) {
|
|
1154
|
+
const failedAssertions = stepResult.assertionResults.filter((a) => !a.passed);
|
|
1088
1155
|
notes = `${failedAssertions.length} assertion(s) failed`;
|
|
1089
1156
|
}
|
|
1090
1157
|
}
|
|
@@ -1177,17 +1244,17 @@ function generateSemanticTypesSection(inferences) {
|
|
|
1177
1244
|
byType.set(inf.inferredType, existing);
|
|
1178
1245
|
}
|
|
1179
1246
|
// Sort by number of parameters (most common types first)
|
|
1180
|
-
const sortedTypes = Array.from(byType.entries())
|
|
1181
|
-
.sort((a, b) => b[1].length - a[1].length);
|
|
1247
|
+
const sortedTypes = Array.from(byType.entries()).sort((a, b) => b[1].length - a[1].length);
|
|
1182
1248
|
lines.push('| Type | Parameters | Expected Format |');
|
|
1183
1249
|
lines.push('|------|------------|-----------------|');
|
|
1184
1250
|
for (const [type, params] of sortedTypes) {
|
|
1185
1251
|
const displayName = SEMANTIC_VALIDATION.TYPE_DISPLAY_NAMES[type] ?? type;
|
|
1186
|
-
const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ??
|
|
1252
|
+
const exampleValue = SEMANTIC_VALIDATION.EXAMPLE_VALUES[type] ??
|
|
1253
|
+
'';
|
|
1187
1254
|
// Format parameters as tool.param
|
|
1188
1255
|
const paramList = params
|
|
1189
1256
|
.slice(0, 3)
|
|
1190
|
-
.map(p => `\`${p.toolName}.${p.paramName}\``)
|
|
1257
|
+
.map((p) => `\`${p.toolName}.${p.paramName}\``)
|
|
1191
1258
|
.join(', ');
|
|
1192
1259
|
const moreCount = params.length > 3 ? ` +${params.length - 3} more` : '';
|
|
1193
1260
|
lines.push(`| ${displayName} | ${paramList}${moreCount} | \`${exampleValue}\` |`);
|
|
@@ -1246,9 +1313,10 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1246
1313
|
lines.push('Response schema consistency metrics for tools with sufficient test samples:');
|
|
1247
1314
|
lines.push('');
|
|
1248
1315
|
// Summary stats
|
|
1249
|
-
const stableCount = toolsWithSchemas.filter(t => t.evolution.isStable).length;
|
|
1316
|
+
const stableCount = toolsWithSchemas.filter((t) => t.evolution.isStable).length;
|
|
1250
1317
|
const unstableCount = toolsWithSchemas.length - stableCount;
|
|
1251
|
-
const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) /
|
|
1318
|
+
const avgConfidence = toolsWithSchemas.reduce((sum, t) => sum + t.evolution.stabilityConfidence, 0) /
|
|
1319
|
+
toolsWithSchemas.length;
|
|
1252
1320
|
lines.push('| Metric | Value |');
|
|
1253
1321
|
lines.push('|--------|-------|');
|
|
1254
1322
|
lines.push(`| Tools Analyzed | ${toolsWithSchemas.length} |`);
|
|
@@ -1269,7 +1337,7 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1269
1337
|
lines.push('| Tool | Grade | Stability | Confidence | Samples | Issues |');
|
|
1270
1338
|
lines.push('|------|-------|-----------|------------|---------|--------|');
|
|
1271
1339
|
// Sort by grade (worst first, then by name)
|
|
1272
|
-
const gradeOrder = {
|
|
1340
|
+
const gradeOrder = { F: 0, D: 1, C: 2, B: 3, A: 4, 'N/A': 5 };
|
|
1273
1341
|
const sortedTools = [...toolsWithSchemas].sort((a, b) => {
|
|
1274
1342
|
const gradeCompare = gradeOrder[a.grade] - gradeOrder[b.grade];
|
|
1275
1343
|
if (gradeCompare !== 0)
|
|
@@ -1284,13 +1352,15 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1284
1352
|
const confidenceDisplay = `${Math.round(evolution.stabilityConfidence * 100)}%`;
|
|
1285
1353
|
const issues = evolution.inconsistentFields.length > 0
|
|
1286
1354
|
? evolution.inconsistentFields.slice(0, 2).join(', ') +
|
|
1287
|
-
(evolution.inconsistentFields.length > 2
|
|
1355
|
+
(evolution.inconsistentFields.length > 2
|
|
1356
|
+
? ` +${evolution.inconsistentFields.length - 2}`
|
|
1357
|
+
: '')
|
|
1288
1358
|
: '-';
|
|
1289
1359
|
lines.push(`| \`${escapeTableCell(name)}\` | ${gradeEmoji} ${grade} | ${stabilityStatus} | ${confidenceDisplay} | ${evolution.sampleCount} | ${escapeTableCell(issues)} |`);
|
|
1290
1360
|
}
|
|
1291
1361
|
lines.push('');
|
|
1292
1362
|
// Detailed breakdown for unstable tools
|
|
1293
|
-
const unstableTools = sortedTools.filter(t => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
|
|
1363
|
+
const unstableTools = sortedTools.filter((t) => !t.evolution.isStable && t.evolution.inconsistentFields.length > 0);
|
|
1294
1364
|
if (unstableTools.length > 0) {
|
|
1295
1365
|
lines.push('<details>');
|
|
1296
1366
|
lines.push('<summary>Unstable Schema Details</summary>');
|
|
@@ -1327,12 +1397,18 @@ function generateSchemaStabilitySection(schemaEvolution) {
|
|
|
1327
1397
|
*/
|
|
1328
1398
|
function getGradeEmoji(grade) {
|
|
1329
1399
|
switch (grade) {
|
|
1330
|
-
case 'A':
|
|
1331
|
-
|
|
1332
|
-
case '
|
|
1333
|
-
|
|
1334
|
-
case '
|
|
1335
|
-
|
|
1400
|
+
case 'A':
|
|
1401
|
+
return '🟢';
|
|
1402
|
+
case 'B':
|
|
1403
|
+
return '🟢';
|
|
1404
|
+
case 'C':
|
|
1405
|
+
return '🟡';
|
|
1406
|
+
case 'D':
|
|
1407
|
+
return '🟠';
|
|
1408
|
+
case 'F':
|
|
1409
|
+
return '🔴';
|
|
1410
|
+
case 'N/A':
|
|
1411
|
+
return '⚪';
|
|
1336
1412
|
}
|
|
1337
1413
|
}
|
|
1338
1414
|
/**
|
|
@@ -1360,7 +1436,8 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1360
1436
|
const allCategories = new Set();
|
|
1361
1437
|
const transientCount = toolsWithErrors.reduce((sum, t) => sum + t.summary.transientErrors, 0);
|
|
1362
1438
|
for (const { summary } of toolsWithErrors) {
|
|
1363
|
-
|
|
1439
|
+
const counts = normalizeCategoryCounts(summary.categoryCounts);
|
|
1440
|
+
for (const cat of counts.keys()) {
|
|
1364
1441
|
allCategories.add(cat);
|
|
1365
1442
|
}
|
|
1366
1443
|
}
|
|
@@ -1374,7 +1451,8 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1374
1451
|
// Overall error breakdown by category
|
|
1375
1452
|
const globalCategoryCounts = new Map();
|
|
1376
1453
|
for (const { summary } of toolsWithErrors) {
|
|
1377
|
-
|
|
1454
|
+
const counts = normalizeCategoryCounts(summary.categoryCounts);
|
|
1455
|
+
for (const [cat, count] of counts) {
|
|
1378
1456
|
globalCategoryCounts.set(cat, (globalCategoryCounts.get(cat) ?? 0) + count);
|
|
1379
1457
|
}
|
|
1380
1458
|
}
|
|
@@ -1384,10 +1462,10 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1384
1462
|
lines.push('| Category | Count | Description |');
|
|
1385
1463
|
lines.push('|----------|-------|-------------|');
|
|
1386
1464
|
// Sort by count descending
|
|
1387
|
-
const sortedCategories = Array.from(globalCategoryCounts.entries())
|
|
1388
|
-
.sort((a, b) => b[1] - a[1]);
|
|
1465
|
+
const sortedCategories = Array.from(globalCategoryCounts.entries()).sort((a, b) => b[1] - a[1]);
|
|
1389
1466
|
for (const [category, count] of sortedCategories) {
|
|
1390
|
-
const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ??
|
|
1467
|
+
const label = ERROR_ANALYSIS.CATEGORY_LABELS[category] ??
|
|
1468
|
+
category;
|
|
1391
1469
|
const emoji = getCategoryEmoji(category);
|
|
1392
1470
|
lines.push(`| ${emoji} ${label} | ${count} | ${escapeTableCell(formatCategoryDescription(category))} |`);
|
|
1393
1471
|
}
|
|
@@ -1406,14 +1484,12 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1406
1484
|
? (ERROR_ANALYSIS.CATEGORY_LABELS[topCategory] ?? topCategory)
|
|
1407
1485
|
: '-';
|
|
1408
1486
|
const topRemediation = summary.topRemediations[0] ?? '-';
|
|
1409
|
-
const truncatedRemediation = topRemediation.length > 50
|
|
1410
|
-
? `${topRemediation.slice(0, 47)}...`
|
|
1411
|
-
: topRemediation;
|
|
1487
|
+
const truncatedRemediation = topRemediation.length > 50 ? `${topRemediation.slice(0, 47)}...` : topRemediation;
|
|
1412
1488
|
lines.push(`| \`${escapeTableCell(name)}\` | ${summary.totalErrors} | ${summary.transientErrors} | ${topCategoryLabel} | ${escapeTableCell(truncatedRemediation)} |`);
|
|
1413
1489
|
}
|
|
1414
1490
|
lines.push('');
|
|
1415
1491
|
// Detailed remediation suggestions (collapsed)
|
|
1416
|
-
const toolsWithRemediations = sortedTools.filter(t => t.summary.topRemediations.length > 0);
|
|
1492
|
+
const toolsWithRemediations = sortedTools.filter((t) => t.summary.topRemediations.length > 0);
|
|
1417
1493
|
if (toolsWithRemediations.length > 0) {
|
|
1418
1494
|
lines.push('<details>');
|
|
1419
1495
|
lines.push('<summary>Remediation Suggestions</summary>');
|
|
@@ -1463,13 +1539,20 @@ function generateErrorAnalysisSection(summaries) {
|
|
|
1463
1539
|
*/
|
|
1464
1540
|
function getCategoryEmoji(category) {
|
|
1465
1541
|
switch (category) {
|
|
1466
|
-
case 'client_error_validation':
|
|
1467
|
-
|
|
1468
|
-
case '
|
|
1469
|
-
|
|
1470
|
-
case '
|
|
1471
|
-
|
|
1472
|
-
|
|
1542
|
+
case 'client_error_validation':
|
|
1543
|
+
return '⚠️';
|
|
1544
|
+
case 'client_error_auth':
|
|
1545
|
+
return '🔐';
|
|
1546
|
+
case 'client_error_not_found':
|
|
1547
|
+
return '🔍';
|
|
1548
|
+
case 'client_error_conflict':
|
|
1549
|
+
return '💥';
|
|
1550
|
+
case 'client_error_rate_limit':
|
|
1551
|
+
return '⏱️';
|
|
1552
|
+
case 'server_error':
|
|
1553
|
+
return '🔥';
|
|
1554
|
+
default:
|
|
1555
|
+
return '❓';
|
|
1473
1556
|
}
|
|
1474
1557
|
}
|
|
1475
1558
|
/**
|
|
@@ -1493,13 +1576,30 @@ function formatCategoryDescription(category) {
|
|
|
1493
1576
|
return 'Unknown error category';
|
|
1494
1577
|
}
|
|
1495
1578
|
}
|
|
1579
|
+
/**
|
|
1580
|
+
* Normalize category counts (a Map or a plain object) into a Map, keeping only numeric counts.
|
|
1581
|
+
*/
|
|
1582
|
+
function normalizeCategoryCounts(counts) {
|
|
1583
|
+
if (!counts) {
|
|
1584
|
+
return new Map();
|
|
1585
|
+
}
|
|
1586
|
+
if (counts instanceof Map) {
|
|
1587
|
+
return counts;
|
|
1588
|
+
}
|
|
1589
|
+
if (typeof counts !== 'object') {
|
|
1590
|
+
return new Map();
|
|
1591
|
+
}
|
|
1592
|
+
const entries = Object.entries(counts).filter((entry) => typeof entry[1] === 'number');
|
|
1593
|
+
return new Map(entries);
|
|
1594
|
+
}
|
|
1496
1595
|
/**
|
|
1497
1596
|
* Get the top category from a category counts map.
|
|
1498
1597
|
*/
|
|
1499
1598
|
function getTopCategory(counts) {
|
|
1599
|
+
const normalized = normalizeCategoryCounts(counts);
|
|
1500
1600
|
let topCategory;
|
|
1501
1601
|
let topCount = 0;
|
|
1502
|
-
for (const [category, count] of
|
|
1602
|
+
for (const [category, count] of normalized) {
|
|
1503
1603
|
if (count > topCount) {
|
|
1504
1604
|
topCount = count;
|
|
1505
1605
|
topCategory = category;
|
|
@@ -1601,7 +1701,10 @@ function formatIssueTypeLabel(type) {
|
|
|
1601
1701
|
case 'no_examples':
|
|
1602
1702
|
return 'No Examples';
|
|
1603
1703
|
default:
|
|
1604
|
-
return type
|
|
1704
|
+
return type
|
|
1705
|
+
.split('_')
|
|
1706
|
+
.map((w) => w.charAt(0).toUpperCase() + w.slice(1))
|
|
1707
|
+
.join(' ');
|
|
1605
1708
|
}
|
|
1606
1709
|
}
|
|
1607
1710
|
/**
|
|
@@ -1617,10 +1720,10 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
|
|
|
1617
1720
|
return [];
|
|
1618
1721
|
}
|
|
1619
1722
|
// Find successful interactions
|
|
1620
|
-
const successful = profile.interactions.filter(i => {
|
|
1723
|
+
const successful = profile.interactions.filter((i) => {
|
|
1621
1724
|
if (i.error || i.response?.isError)
|
|
1622
1725
|
return false;
|
|
1623
|
-
const textContent = i.response?.content?.find(c => c.type === 'text');
|
|
1726
|
+
const textContent = i.response?.content?.find((c) => c.type === 'text');
|
|
1624
1727
|
if (textContent && 'text' in textContent) {
|
|
1625
1728
|
if (looksLikeError(String(textContent.text)))
|
|
1626
1729
|
return false;
|
|
@@ -1640,7 +1743,7 @@ function generateToolExamples(profile, maxExamples, maxExampleLength = EXAMPLE_O
|
|
|
1640
1743
|
if (seenArgsHashes.has(argsHash))
|
|
1641
1744
|
continue;
|
|
1642
1745
|
seenArgsHashes.add(argsHash);
|
|
1643
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1746
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1644
1747
|
if (!textContent || !('text' in textContent))
|
|
1645
1748
|
continue;
|
|
1646
1749
|
const responseText = String(textContent.text);
|
|
@@ -1696,7 +1799,7 @@ function generateToolErrorPatterns(profile) {
|
|
|
1696
1799
|
continue;
|
|
1697
1800
|
}
|
|
1698
1801
|
const errorText = interaction.error || '';
|
|
1699
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1802
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1700
1803
|
const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
|
|
1701
1804
|
const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
|
|
1702
1805
|
if (!isError)
|
|
@@ -1706,7 +1809,8 @@ function generateToolErrorPatterns(profile) {
|
|
|
1706
1809
|
continue;
|
|
1707
1810
|
const category = categorizeError(errorContent);
|
|
1708
1811
|
const existing = errorCategories.get(category) || [];
|
|
1709
|
-
if (existing.length < 2) {
|
|
1812
|
+
if (existing.length < 2) {
|
|
1813
|
+
// Max 2 examples per category
|
|
1710
1814
|
const truncated = errorContent.length > 100 ? `${errorContent.slice(0, 97)}...` : errorContent;
|
|
1711
1815
|
existing.push(truncated);
|
|
1712
1816
|
}
|
|
@@ -1758,7 +1862,7 @@ function generateErrorSummarySection(profiles) {
|
|
|
1758
1862
|
continue;
|
|
1759
1863
|
}
|
|
1760
1864
|
const errorText = interaction.error || '';
|
|
1761
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1865
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1762
1866
|
const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
|
|
1763
1867
|
const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
|
|
1764
1868
|
if (!isError)
|
|
@@ -1771,7 +1875,8 @@ function generateErrorSummarySection(profiles) {
|
|
|
1771
1875
|
existing.count++;
|
|
1772
1876
|
existing.tools.add(profile.name);
|
|
1773
1877
|
if (!existing.example) {
|
|
1774
|
-
existing.example =
|
|
1878
|
+
existing.example =
|
|
1879
|
+
errorContent.length > 80 ? `${errorContent.slice(0, 77)}...` : errorContent;
|
|
1775
1880
|
}
|
|
1776
1881
|
categoryCounts.set(category, existing);
|
|
1777
1882
|
}
|
|
@@ -1786,7 +1891,10 @@ function generateErrorSummarySection(profiles) {
|
|
|
1786
1891
|
lines.push('| Category | Count | Affected Tools |');
|
|
1787
1892
|
lines.push('|----------|-------|----------------|');
|
|
1788
1893
|
for (const [category, data] of categoryCounts) {
|
|
1789
|
-
const toolList = Array.from(data.tools)
|
|
1894
|
+
const toolList = Array.from(data.tools)
|
|
1895
|
+
.slice(0, 3)
|
|
1896
|
+
.map((t) => `\`${t}\``)
|
|
1897
|
+
.join(', ');
|
|
1790
1898
|
const more = data.tools.size > 3 ? ` +${data.tools.size - 3} more` : '';
|
|
1791
1899
|
lines.push(`| ${category} | ${data.count} | ${toolList}${more} |`);
|
|
1792
1900
|
}
|
|
@@ -1813,7 +1921,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
|
|
|
1813
1921
|
continue;
|
|
1814
1922
|
}
|
|
1815
1923
|
const errorText = interaction.error || '';
|
|
1816
|
-
const textContent = interaction.response?.content?.find(c => c.type === 'text');
|
|
1924
|
+
const textContent = interaction.response?.content?.find((c) => c.type === 'text');
|
|
1817
1925
|
const responseText = textContent && 'text' in textContent ? String(textContent.text) : '';
|
|
1818
1926
|
const isError = interaction.error || interaction.response?.isError || looksLikeError(responseText);
|
|
1819
1927
|
if (!isError)
|
|
@@ -1843,7 +1951,7 @@ function analyzeToolsForExternalDependencies(profiles, tools) {
|
|
|
1843
1951
|
});
|
|
1844
1952
|
}
|
|
1845
1953
|
if (patterns.length > 0) {
|
|
1846
|
-
const tool = tools.find(t => t.name === profile.name);
|
|
1954
|
+
const tool = tools.find((t) => t.name === profile.name);
|
|
1847
1955
|
errorInputs.push({
|
|
1848
1956
|
toolName: profile.name,
|
|
1849
1957
|
toolDescription: tool?.description,
|
|
@@ -1962,7 +2070,9 @@ function collectAssertionFailures(profile) {
|
|
|
1962
2070
|
for (const result of interaction.assertionResults ?? []) {
|
|
1963
2071
|
if (result.passed)
|
|
1964
2072
|
continue;
|
|
1965
|
-
const message = result.message
|
|
2073
|
+
const message = result.message
|
|
2074
|
+
? `${result.type}: ${result.message}`
|
|
2075
|
+
: `${result.type} failed`;
|
|
1966
2076
|
failures.add(message);
|
|
1967
2077
|
}
|
|
1968
2078
|
}
|