@dotsetlabs/bellwether 1.0.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +74 -0
- package/README.md +8 -2
- package/dist/baseline/accessors.d.ts +1 -1
- package/dist/baseline/accessors.js +1 -3
- package/dist/baseline/baseline-format.d.ts +287 -0
- package/dist/baseline/baseline-format.js +12 -0
- package/dist/baseline/comparator.js +249 -11
- package/dist/baseline/converter.d.ts +15 -15
- package/dist/baseline/converter.js +46 -34
- package/dist/baseline/diff.d.ts +1 -1
- package/dist/baseline/diff.js +45 -28
- package/dist/baseline/error-analyzer.d.ts +1 -1
- package/dist/baseline/error-analyzer.js +90 -17
- package/dist/baseline/incremental-checker.js +8 -5
- package/dist/baseline/index.d.ts +2 -12
- package/dist/baseline/index.js +3 -23
- package/dist/baseline/performance-tracker.d.ts +0 -1
- package/dist/baseline/performance-tracker.js +13 -20
- package/dist/baseline/response-fingerprint.js +39 -2
- package/dist/baseline/saver.js +41 -10
- package/dist/baseline/schema-compare.d.ts +22 -0
- package/dist/baseline/schema-compare.js +259 -16
- package/dist/baseline/types.d.ts +10 -7
- package/dist/cache/response-cache.d.ts +8 -0
- package/dist/cache/response-cache.js +110 -0
- package/dist/cli/commands/check.js +23 -6
- package/dist/cli/commands/explore.js +34 -14
- package/dist/cli/index.js +8 -0
- package/dist/config/template.js +8 -7
- package/dist/config/validator.d.ts +59 -59
- package/dist/config/validator.js +245 -90
- package/dist/constants/core.d.ts +4 -0
- package/dist/constants/core.js +8 -19
- package/dist/constants/registry.d.ts +17 -0
- package/dist/constants/registry.js +18 -0
- package/dist/constants/testing.d.ts +0 -369
- package/dist/constants/testing.js +18 -456
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/docs/contract.js +131 -83
- package/dist/docs/report.js +8 -5
- package/dist/interview/insights.d.ts +17 -0
- package/dist/interview/insights.js +52 -0
- package/dist/interview/interviewer.js +52 -10
- package/dist/interview/prompt-test-generator.d.ts +12 -0
- package/dist/interview/prompt-test-generator.js +77 -0
- package/dist/interview/resource-test-generator.d.ts +12 -0
- package/dist/interview/resource-test-generator.js +20 -0
- package/dist/interview/schema-inferrer.js +26 -4
- package/dist/interview/schema-test-generator.js +278 -31
- package/dist/interview/stateful-test-runner.d.ts +3 -0
- package/dist/interview/stateful-test-runner.js +80 -0
- package/dist/interview/types.d.ts +12 -0
- package/dist/transport/mcp-client.js +1 -1
- package/dist/transport/sse-transport.d.ts +7 -3
- package/dist/transport/sse-transport.js +157 -67
- package/dist/version.js +1 -1
- package/man/bellwether.1 +1 -1
- package/man/bellwether.1.md +2 -2
- package/package.json +1 -1
- package/schemas/bellwether-check.schema.json +185 -0
- package/schemas/bellwether-explore.schema.json +837 -0
- package/scripts/completions/bellwether.bash +10 -4
- package/scripts/completions/bellwether.zsh +55 -2
package/dist/baseline/diff.js
CHANGED
|
@@ -73,16 +73,15 @@ export function formatDiffText(diff, useColors = true) {
|
|
|
73
73
|
lines.push(red('─── Performance Regressions ───'));
|
|
74
74
|
for (const regression of diff.performanceReport.regressions) {
|
|
75
75
|
const percentStr = (regression.regressionPercent * 100).toFixed(1);
|
|
76
|
-
const confidenceNote = regression.isReliable
|
|
77
|
-
? ''
|
|
78
|
-
: ` ${yellow('(low confidence)')}`;
|
|
76
|
+
const confidenceNote = regression.isReliable ? '' : ` ${yellow('(low confidence)')}`;
|
|
79
77
|
lines.push(` ${red('!')} ${regression.toolName}: ` +
|
|
80
78
|
`${regression.previousP50Ms.toFixed(0)}ms → ` +
|
|
81
79
|
`${regression.currentP50Ms.toFixed(0)}ms (+${percentStr}%)${confidenceNote}`);
|
|
82
80
|
}
|
|
83
81
|
lines.push('');
|
|
84
82
|
// Show low confidence tools warning
|
|
85
|
-
if (diff.performanceReport.lowConfidenceTools &&
|
|
83
|
+
if (diff.performanceReport.lowConfidenceTools &&
|
|
84
|
+
diff.performanceReport.lowConfidenceTools.length > 0) {
|
|
86
85
|
lines.push(yellow(' Note: Some tools have low confidence metrics.'));
|
|
87
86
|
lines.push(yellow(` Run with more samples for reliable baselines: ${diff.performanceReport.lowConfidenceTools.join(', ')}`));
|
|
88
87
|
lines.push('');
|
|
@@ -94,7 +93,8 @@ export function formatDiffText(diff, useColors = true) {
|
|
|
94
93
|
lines.push('');
|
|
95
94
|
}
|
|
96
95
|
// Performance confidence changes
|
|
97
|
-
if (diff.performanceReport?.confidenceChanges &&
|
|
96
|
+
if (diff.performanceReport?.confidenceChanges &&
|
|
97
|
+
diff.performanceReport.confidenceChanges.length > 0) {
|
|
98
98
|
lines.push(cyan('─── Confidence Changes ───'));
|
|
99
99
|
for (const change of diff.performanceReport.confidenceChanges) {
|
|
100
100
|
const icon = change.improved ? green('↑') : change.degraded ? yellow('↓') : '→';
|
|
@@ -212,7 +212,8 @@ export function formatDiffText(diff, useColors = true) {
|
|
|
212
212
|
lines.push(` Info: ${diff.infoCount}`);
|
|
213
213
|
if (diff.performanceReport) {
|
|
214
214
|
lines.push(` Performance regressions: ${diff.performanceReport.regressionCount}`);
|
|
215
|
-
if (diff.performanceReport.lowConfidenceTools &&
|
|
215
|
+
if (diff.performanceReport.lowConfidenceTools &&
|
|
216
|
+
diff.performanceReport.lowConfidenceTools.length > 0) {
|
|
216
217
|
lines.push(` Low confidence tools: ${diff.performanceReport.lowConfidenceTools.length}`);
|
|
217
218
|
}
|
|
218
219
|
}
|
|
@@ -276,7 +277,8 @@ export function formatDiffCompact(diff) {
|
|
|
276
277
|
if (diff.performanceReport?.regressionCount ?? 0 > 0) {
|
|
277
278
|
parts.push(`perf_regressions=${diff.performanceReport?.regressionCount}`);
|
|
278
279
|
}
|
|
279
|
-
if (diff.performanceReport?.lowConfidenceTools &&
|
|
280
|
+
if (diff.performanceReport?.lowConfidenceTools &&
|
|
281
|
+
diff.performanceReport.lowConfidenceTools.length > 0) {
|
|
280
282
|
parts.push(`low_confidence_tools=${diff.performanceReport.lowConfidenceTools.length}`);
|
|
281
283
|
}
|
|
282
284
|
if (diff.securityReport) {
|
|
@@ -336,8 +338,11 @@ export function formatDiffGitHubActions(diff) {
|
|
|
336
338
|
lines.push(`::notice::Minor changes: ${diff.summary}`);
|
|
337
339
|
}
|
|
338
340
|
for (const change of diff.behaviorChanges) {
|
|
339
|
-
const level = change.severity === 'breaking'
|
|
340
|
-
|
|
341
|
+
const level = change.severity === 'breaking'
|
|
342
|
+
? 'error'
|
|
343
|
+
: change.severity === 'warning'
|
|
344
|
+
? 'warning'
|
|
345
|
+
: 'notice';
|
|
341
346
|
lines.push(`::${level}::${change.tool} - ${change.description}`);
|
|
342
347
|
}
|
|
343
348
|
for (const tool of diff.toolsRemoved) {
|
|
@@ -355,7 +360,8 @@ export function formatDiffGitHubActions(diff) {
|
|
|
355
360
|
}
|
|
356
361
|
}
|
|
357
362
|
// Low confidence warning
|
|
358
|
-
if (diff.performanceReport?.lowConfidenceTools &&
|
|
363
|
+
if (diff.performanceReport?.lowConfidenceTools &&
|
|
364
|
+
diff.performanceReport.lowConfidenceTools.length > 0) {
|
|
359
365
|
lines.push(`::notice::Low confidence metrics for ${diff.performanceReport.lowConfidenceTools.length} tool(s): ${diff.performanceReport.lowConfidenceTools.join(', ')}`);
|
|
360
366
|
}
|
|
361
367
|
// Security findings
|
|
@@ -440,7 +446,9 @@ export function formatDiffMarkdown(diff) {
|
|
|
440
446
|
toolDiff.schemaChanged ? 'Schema changed' : '',
|
|
441
447
|
toolDiff.descriptionChanged ? 'Description changed' : '',
|
|
442
448
|
`${toolDiff.changes.length} change(s)`,
|
|
443
|
-
]
|
|
449
|
+
]
|
|
450
|
+
.filter(Boolean)
|
|
451
|
+
.join(', ');
|
|
444
452
|
lines.push(`| ${toolDiff.tool} | ⚠️ Modified | ${details} |`);
|
|
445
453
|
}
|
|
446
454
|
lines.push('');
|
|
@@ -451,8 +459,7 @@ export function formatDiffMarkdown(diff) {
|
|
|
451
459
|
lines.push('| Tool | Aspect | Severity | Description |');
|
|
452
460
|
lines.push('|------|--------|----------|-------------|');
|
|
453
461
|
for (const change of diff.behaviorChanges) {
|
|
454
|
-
const sevEmoji = change.severity === 'breaking' ? '🔴' :
|
|
455
|
-
change.severity === 'warning' ? '🟡' : '🟢';
|
|
462
|
+
const sevEmoji = change.severity === 'breaking' ? '🔴' : change.severity === 'warning' ? '🟡' : '🟢';
|
|
456
463
|
lines.push(`| ${change.tool} | ${change.aspect} | ${sevEmoji} ${change.severity} | ${change.description} |`);
|
|
457
464
|
}
|
|
458
465
|
lines.push('');
|
|
@@ -503,7 +510,11 @@ export function formatDiffMarkdown(diff) {
|
|
|
503
510
|
lines.push('|------|--------|---------|');
|
|
504
511
|
for (const issue of schemaReport.toolsWithIssues) {
|
|
505
512
|
const statusIcon = issue.isBreaking ? '🔴' : issue.becameUnstable ? '🟡' : '🔵';
|
|
506
|
-
const status = issue.isBreaking
|
|
513
|
+
const status = issue.isBreaking
|
|
514
|
+
? 'Breaking'
|
|
515
|
+
: issue.becameUnstable
|
|
516
|
+
? 'Unstable'
|
|
517
|
+
: 'Changed';
|
|
507
518
|
lines.push(`| ${issue.toolName} | ${statusIcon} ${status} | ${issue.summary} |`);
|
|
508
519
|
}
|
|
509
520
|
lines.push('');
|
|
@@ -525,11 +536,13 @@ export function formatDiffMarkdown(diff) {
|
|
|
525
536
|
lines.push(`⚠️ **Error behavior changed**: ${et.summary}`);
|
|
526
537
|
lines.push('');
|
|
527
538
|
}
|
|
528
|
-
if (et.newCategories.length > 0 ||
|
|
529
|
-
et.
|
|
539
|
+
if (et.newCategories.length > 0 ||
|
|
540
|
+
et.resolvedCategories.length > 0 ||
|
|
541
|
+
et.increasingCategories.length > 0 ||
|
|
542
|
+
et.decreasingCategories.length > 0) {
|
|
530
543
|
lines.push('| Category | Trend | Previous | Current | Change |');
|
|
531
544
|
lines.push('|----------|-------|----------|---------|--------|');
|
|
532
|
-
for (const trend of et.trends.filter(t => t.trend !== 'stable')) {
|
|
545
|
+
for (const trend of et.trends.filter((t) => t.trend !== 'stable')) {
|
|
533
546
|
const trendEmoji = getTrendEmoji(trend.trend);
|
|
534
547
|
const changeStr = trend.changePercent !== 0
|
|
535
548
|
? `${trend.changePercent > 0 ? '+' : ''}${trend.changePercent}%`
|
|
@@ -543,7 +556,8 @@ export function formatDiffMarkdown(diff) {
|
|
|
543
556
|
// Performance section
|
|
544
557
|
if (diff.performanceReport) {
|
|
545
558
|
const perfReport = diff.performanceReport;
|
|
546
|
-
if (perfReport.hasRegressions ||
|
|
559
|
+
if (perfReport.hasRegressions ||
|
|
560
|
+
perfReport.improvementCount > 0 ||
|
|
547
561
|
(perfReport.lowConfidenceTools && perfReport.lowConfidenceTools.length > 0)) {
|
|
548
562
|
lines.push('### Performance');
|
|
549
563
|
lines.push('');
|
|
@@ -605,7 +619,8 @@ export function formatDiffMarkdown(diff) {
|
|
|
605
619
|
lines.push(`- Info: **${diff.infoCount}**`);
|
|
606
620
|
if (diff.performanceReport) {
|
|
607
621
|
lines.push(`- Performance regressions: **${diff.performanceReport.regressionCount}**`);
|
|
608
|
-
if (diff.performanceReport.lowConfidenceTools &&
|
|
622
|
+
if (diff.performanceReport.lowConfidenceTools &&
|
|
623
|
+
diff.performanceReport.lowConfidenceTools.length > 0) {
|
|
609
624
|
lines.push(`- Low confidence tools: **${diff.performanceReport.lowConfidenceTools.length}**`);
|
|
610
625
|
}
|
|
611
626
|
}
|
|
@@ -674,9 +689,7 @@ function getTrendEmoji(trend) {
|
|
|
674
689
|
*/
|
|
675
690
|
export function formatDiffJUnit(diff, suiteName = 'bellwether') {
|
|
676
691
|
const timestamp = new Date().toISOString();
|
|
677
|
-
const totalTests = diff.toolsAdded.length +
|
|
678
|
-
diff.toolsRemoved.length +
|
|
679
|
-
diff.behaviorChanges.length;
|
|
692
|
+
const totalTests = diff.toolsAdded.length + diff.toolsRemoved.length + diff.behaviorChanges.length;
|
|
680
693
|
const failures = diff.breakingCount;
|
|
681
694
|
const errors = 0;
|
|
682
695
|
const skipped = 0;
|
|
@@ -743,7 +756,8 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
|
|
|
743
756
|
lines.push(' </testcase>');
|
|
744
757
|
}
|
|
745
758
|
// Low confidence tools
|
|
746
|
-
if (diff.performanceReport.lowConfidenceTools &&
|
|
759
|
+
if (diff.performanceReport.lowConfidenceTools &&
|
|
760
|
+
diff.performanceReport.lowConfidenceTools.length > 0) {
|
|
747
761
|
for (const tool of diff.performanceReport.lowConfidenceTools) {
|
|
748
762
|
const name = escapeXml(`confidence-${tool}`);
|
|
749
763
|
lines.push(` <testcase name="${name}" classname="drift.confidence">`);
|
|
@@ -805,7 +819,8 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
|
|
|
805
819
|
lines.push(' </testcase>');
|
|
806
820
|
}
|
|
807
821
|
// Show stable schemas as passing tests
|
|
808
|
-
if (diff.schemaEvolutionReport.stableCount > 0 &&
|
|
822
|
+
if (diff.schemaEvolutionReport.stableCount > 0 &&
|
|
823
|
+
diff.schemaEvolutionReport.toolsWithIssues.length === 0) {
|
|
809
824
|
lines.push(` <testcase name="schema-stability-check" classname="drift.schema">`);
|
|
810
825
|
lines.push(` <system-out>${diff.schemaEvolutionReport.stableCount} tool(s) have stable response schemas</system-out>`);
|
|
811
826
|
lines.push(' </testcase>');
|
|
@@ -830,7 +845,7 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
|
|
|
830
845
|
}
|
|
831
846
|
// Increasing error types (warnings)
|
|
832
847
|
for (const category of et.increasingCategories) {
|
|
833
|
-
const trend = et.trends.find(t => t.category === category);
|
|
848
|
+
const trend = et.trends.find((t) => t.category === category);
|
|
834
849
|
const name = escapeXml(`error-trend-increasing-${category}`);
|
|
835
850
|
lines.push(` <testcase name="${name}" classname="drift.errors">`);
|
|
836
851
|
lines.push(` <system-err>[WARNING] Error frequency increasing: ${escapeXml(category)}${trend ? ` (+${trend.changePercent}%)` : ''}</system-err>`);
|
|
@@ -878,7 +893,7 @@ export function formatDiffJUnit(diff, suiteName = 'bellwether') {
|
|
|
878
893
|
* Format diff as SARIF (Static Analysis Results Interchange Format) for GitHub Code Scanning.
|
|
879
894
|
*
|
|
880
895
|
* SARIF is the standard format for GitHub's code scanning feature and can be
|
|
881
|
-
*
|
|
896
|
+
* used to show drift detection results in pull request reviews.
|
|
882
897
|
*
|
|
883
898
|
* @see https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html
|
|
884
899
|
*
|
|
@@ -1313,11 +1328,13 @@ export function formatDiffSarif(diff, baselinePath = 'bellwether-baseline.json')
|
|
|
1313
1328
|
}
|
|
1314
1329
|
// Increasing error types
|
|
1315
1330
|
for (const category of et.increasingCategories) {
|
|
1316
|
-
const trend = et.trends.find(t => t.category === category);
|
|
1331
|
+
const trend = et.trends.find((t) => t.category === category);
|
|
1317
1332
|
results.push({
|
|
1318
1333
|
ruleId: 'BWH011',
|
|
1319
1334
|
level: 'warning',
|
|
1320
|
-
message: {
|
|
1335
|
+
message: {
|
|
1336
|
+
text: `Error frequency increasing: ${category}${trend ? ` (+${trend.changePercent}%)` : ''}`,
|
|
1337
|
+
},
|
|
1321
1338
|
locations: [
|
|
1322
1339
|
{
|
|
1323
1340
|
physicalLocation: {
|
|
@@ -69,7 +69,7 @@ export interface ErrorAnalysisSummary {
|
|
|
69
69
|
/** Unique remediations suggested */
|
|
70
70
|
remediations: string[];
|
|
71
71
|
/** Counts by error category */
|
|
72
|
-
categoryCounts: Map<string, number>;
|
|
72
|
+
categoryCounts: Map<string, number> | Record<string, number>;
|
|
73
73
|
/** Top root causes (most common) */
|
|
74
74
|
topRootCauses: string[];
|
|
75
75
|
/** Top remediations (most actionable) */
|
|
@@ -22,7 +22,8 @@ export function analyzeError(errorMessage, context) {
|
|
|
22
22
|
let statusCategory = categorizeHttpStatus(httpStatus);
|
|
23
23
|
const wasExpected = context?.wasExpected ?? context?.expectedOutcome === 'error';
|
|
24
24
|
// If the error was expected (validation test), recategorize it
|
|
25
|
-
if (wasExpected &&
|
|
25
|
+
if (wasExpected &&
|
|
26
|
+
(statusCategory === 'client_error_validation' || statusCategory === 'unknown')) {
|
|
26
27
|
statusCategory = 'validation_expected';
|
|
27
28
|
}
|
|
28
29
|
const rootCause = wasExpected
|
|
@@ -96,7 +97,9 @@ export function generateErrorSummary(toolName, patterns) {
|
|
|
96
97
|
}
|
|
97
98
|
}
|
|
98
99
|
// Count transient and actionable errors
|
|
99
|
-
const transientErrors = analyses
|
|
100
|
+
const transientErrors = analyses
|
|
101
|
+
.filter((a) => a.transient)
|
|
102
|
+
.reduce((sum, a) => sum + a.pattern.count, 0);
|
|
100
103
|
const actionableCount = analyses.filter((a) => a.remediation && !a.remediation.includes('Review')).length;
|
|
101
104
|
// Collect unique remediations with frequency
|
|
102
105
|
const remediationCounts = new Map();
|
|
@@ -324,7 +327,9 @@ export function inferRootCause(message, category) {
|
|
|
324
327
|
if (lower.includes('invalid') || lower.includes('malformed')) {
|
|
325
328
|
return 'Invalid input format or value';
|
|
326
329
|
}
|
|
327
|
-
if (lower.includes('not found') ||
|
|
330
|
+
if (lower.includes('not found') ||
|
|
331
|
+
lower.includes('does not exist') ||
|
|
332
|
+
lower.includes("doesn't exist")) {
|
|
328
333
|
return 'Referenced resource does not exist';
|
|
329
334
|
}
|
|
330
335
|
if (lower.includes('already exists') || lower.includes('duplicate')) {
|
|
@@ -333,7 +338,9 @@ export function inferRootCause(message, category) {
|
|
|
333
338
|
if (lower.includes('unauthorized') || lower.includes('authentication')) {
|
|
334
339
|
return 'Authentication credentials missing or invalid';
|
|
335
340
|
}
|
|
336
|
-
if (lower.includes('forbidden') ||
|
|
341
|
+
if (lower.includes('forbidden') ||
|
|
342
|
+
lower.includes('permission') ||
|
|
343
|
+
lower.includes('access denied')) {
|
|
337
344
|
return 'Insufficient permissions for this operation';
|
|
338
345
|
}
|
|
339
346
|
if (lower.includes('rate') || lower.includes('throttl') || lower.includes('too many')) {
|
|
@@ -480,15 +487,69 @@ export function extractRelatedParameters(message) {
|
|
|
480
487
|
*/
|
|
481
488
|
function isCommonWord(word) {
|
|
482
489
|
const commonWords = new Set([
|
|
483
|
-
'the',
|
|
484
|
-
'
|
|
485
|
-
'
|
|
486
|
-
'
|
|
487
|
-
'
|
|
488
|
-
'
|
|
489
|
-
'
|
|
490
|
-
'
|
|
491
|
-
'
|
|
490
|
+
'the',
|
|
491
|
+
'is',
|
|
492
|
+
'are',
|
|
493
|
+
'was',
|
|
494
|
+
'were',
|
|
495
|
+
'be',
|
|
496
|
+
'been',
|
|
497
|
+
'being',
|
|
498
|
+
'have',
|
|
499
|
+
'has',
|
|
500
|
+
'had',
|
|
501
|
+
'do',
|
|
502
|
+
'does',
|
|
503
|
+
'did',
|
|
504
|
+
'will',
|
|
505
|
+
'would',
|
|
506
|
+
'could',
|
|
507
|
+
'should',
|
|
508
|
+
'may',
|
|
509
|
+
'might',
|
|
510
|
+
'must',
|
|
511
|
+
'shall',
|
|
512
|
+
'can',
|
|
513
|
+
'need',
|
|
514
|
+
'not',
|
|
515
|
+
'and',
|
|
516
|
+
'but',
|
|
517
|
+
'or',
|
|
518
|
+
'if',
|
|
519
|
+
'then',
|
|
520
|
+
'else',
|
|
521
|
+
'for',
|
|
522
|
+
'with',
|
|
523
|
+
'from',
|
|
524
|
+
'this',
|
|
525
|
+
'that',
|
|
526
|
+
'these',
|
|
527
|
+
'those',
|
|
528
|
+
'error',
|
|
529
|
+
'message',
|
|
530
|
+
'failed',
|
|
531
|
+
'invalid',
|
|
532
|
+
'missing',
|
|
533
|
+
'required',
|
|
534
|
+
'found',
|
|
535
|
+
'exist',
|
|
536
|
+
'exists',
|
|
537
|
+
'value',
|
|
538
|
+
'input',
|
|
539
|
+
'output',
|
|
540
|
+
'type',
|
|
541
|
+
'string',
|
|
542
|
+
'number',
|
|
543
|
+
'boolean',
|
|
544
|
+
'object',
|
|
545
|
+
'array',
|
|
546
|
+
'null',
|
|
547
|
+
'undefined',
|
|
548
|
+
'field',
|
|
549
|
+
'parameter',
|
|
550
|
+
'property',
|
|
551
|
+
'argument',
|
|
552
|
+
'key',
|
|
492
553
|
]);
|
|
493
554
|
return commonWords.has(word.toLowerCase());
|
|
494
555
|
}
|
|
@@ -510,9 +571,19 @@ export function isTransientError(category, message) {
|
|
|
510
571
|
return true;
|
|
511
572
|
// Check for transient keywords
|
|
512
573
|
const transientKeywords = [
|
|
513
|
-
'timeout',
|
|
514
|
-
'
|
|
515
|
-
'
|
|
574
|
+
'timeout',
|
|
575
|
+
'timed out',
|
|
576
|
+
'temporarily',
|
|
577
|
+
'retry',
|
|
578
|
+
'unavailable',
|
|
579
|
+
'connection',
|
|
580
|
+
'network',
|
|
581
|
+
'service unavailable',
|
|
582
|
+
'too many requests',
|
|
583
|
+
'try again',
|
|
584
|
+
'overloaded',
|
|
585
|
+
'busy',
|
|
586
|
+
'maintenance',
|
|
516
587
|
];
|
|
517
588
|
return transientKeywords.some((keyword) => lower.includes(keyword));
|
|
518
589
|
}
|
|
@@ -661,7 +732,9 @@ export function formatErrorTrendReport(report, useColors = false) {
|
|
|
661
732
|
lines.push(' Trend details:');
|
|
662
733
|
for (const trend of report.trends.filter((t) => t.trend !== 'stable')) {
|
|
663
734
|
const arrow = getTrendArrow(trend.trend);
|
|
664
|
-
const changeText = trend.changePercent !== 0
|
|
735
|
+
const changeText = trend.changePercent !== 0
|
|
736
|
+
? ` (${trend.changePercent > 0 ? '+' : ''}${trend.changePercent}%)`
|
|
737
|
+
: '';
|
|
665
738
|
lines.push(` ${arrow} ${trend.category}: ${trend.previousCount} → ${trend.currentCount}${changeText}`);
|
|
666
739
|
}
|
|
667
740
|
return lines.join('\n');
|
|
@@ -26,7 +26,7 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
|
|
|
26
26
|
// If no baseline or force retest, test everything
|
|
27
27
|
if (!baseline || forceRetest) {
|
|
28
28
|
return {
|
|
29
|
-
toolsToTest: currentTools.map(t => t.name),
|
|
29
|
+
toolsToTest: currentTools.map((t) => t.name),
|
|
30
30
|
toolsToSkip: [],
|
|
31
31
|
cachedFingerprints: [],
|
|
32
32
|
changeSummary: {
|
|
@@ -34,7 +34,7 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
|
|
|
34
34
|
changedTools: 0,
|
|
35
35
|
unchangedTools: 0,
|
|
36
36
|
removedTools: 0,
|
|
37
|
-
newToolNames: baseline ? [] : currentTools.map(t => t.name),
|
|
37
|
+
newToolNames: baseline ? [] : currentTools.map((t) => t.name),
|
|
38
38
|
changedToolNames: [],
|
|
39
39
|
removedToolNames: [],
|
|
40
40
|
},
|
|
@@ -47,8 +47,8 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
|
|
|
47
47
|
const changedToolNames = [];
|
|
48
48
|
const removedToolNames = [];
|
|
49
49
|
// Build maps for comparison
|
|
50
|
-
const baselineToolMap = new Map(getToolFingerprints(baseline).map(t => [t.name, t]));
|
|
51
|
-
const currentToolSet = new Set(currentTools.map(t => t.name));
|
|
50
|
+
const baselineToolMap = new Map(getToolFingerprints(baseline).map((t) => [t.name, t]));
|
|
51
|
+
const currentToolSet = new Set(currentTools.map((t) => t.name));
|
|
52
52
|
// Check current tools against baseline
|
|
53
53
|
for (const tool of currentTools) {
|
|
54
54
|
const baselineTool = baselineToolMap.get(tool.name);
|
|
@@ -65,7 +65,10 @@ export function analyzeForIncremental(currentTools, baseline, options = {}) {
|
|
|
65
65
|
}
|
|
66
66
|
// Check if schema changed
|
|
67
67
|
const currentSchemaHash = computeSchemaHash(tool.inputSchema);
|
|
68
|
-
const baselineSchemaHash = baselineTool.
|
|
68
|
+
const baselineSchemaHash = baselineTool.inputSchemaHashAtTest ??
|
|
69
|
+
(baselineTool.inputSchema
|
|
70
|
+
? computeSchemaHash(baselineTool.inputSchema)
|
|
71
|
+
: baselineTool.schemaHash);
|
|
69
72
|
if (currentSchemaHash !== baselineSchemaHash) {
|
|
70
73
|
// Schema changed - needs retesting
|
|
71
74
|
toolsToTest.push(tool.name);
|
package/dist/baseline/index.d.ts
CHANGED
|
@@ -6,26 +6,16 @@ export { getBaselineGeneratedAt, getBaselineHash, getBaselineServerCommand, getB
|
|
|
6
6
|
export { createBaseline, saveBaseline, loadBaseline, verifyBaselineHash, baselineExists, recalculateBaselineHash, acceptDrift, hasAcceptance, clearAcceptance, type LoadBaselineOptions, type AcceptDriftOptions, } from './saver.js';
|
|
7
7
|
export { compareWithBaseline, compareBaselines, hasBreakingChanges, hasSecurityChanges, filterByMinimumSeverity, checkBaselineVersionCompatibility, compareSeverity, severityMeetsThreshold, applyAspectOverride, applySeverityConfig, shouldFailOnDiff, } from './comparator.js';
|
|
8
8
|
export { formatDiffText, formatDiffJson, formatDiffCompact, formatDiffGitHubActions, formatDiffMarkdown, formatDiffJUnit, formatDiffSarif, formatSecurityReport, } from './diff.js';
|
|
9
|
-
export {
|
|
9
|
+
export { createBaselineFromInterview } from './converter.js';
|
|
10
10
|
export { computeSchemaHash, compareSchemas, computeConsensusSchemaHash, type SchemaChangeType, type SchemaChange, type SchemaComparisonResult, } from './schema-compare.js';
|
|
11
11
|
export { getBaselineVersion, parseVersion, areVersionsCompatible, compareVersions, getCompatibilityWarning, checkVersionCompatibility, assertVersionCompatibility, formatVersion, isCurrentVersion, isOlderVersion, isNewerVersion, requiresMigration, BaselineVersionError, type FormatVersion, type VersionCompatibility, } from './version.js';
|
|
12
12
|
export { analyzeForIncremental, mergeFingerprints, formatIncrementalSummary, isIncrementalWorthwhile, addIncrementalMetadata, type IncrementalCheckResult, type IncrementalChangeSummary, type IncrementalCheckOptions, } from './incremental-checker.js';
|
|
13
13
|
export { analyzeResponses, inferSchemaFromValue, compareFingerprints, compareErrorPatterns, computeInferredSchemaHash, type ResponseFingerprint, type ResponseContentType, type ResponseSize, type InferredSchema, type ErrorPattern, type ResponseAnalysis, type FingerprintDiff, type FingerprintChange, type ErrorPatternDiff, } from './response-fingerprint.js';
|
|
14
14
|
export { compareInferredSchemas, buildSchemaEvolution, compareSchemaEvolution, formatSchemaEvolution, formatSchemaEvolutionDiff, hasSchemaEvolutionIssues, getSchemaStabilityGrade, type ResponseSchemaEvolution, type SchemaVersion as SchemaEvolutionVersion, type SchemaEvolutionDiff, type SchemaTypeChange, } from './response-schema-tracker.js';
|
|
15
|
-
export {
|
|
16
|
-
export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence, PERFORMANCE, type LatencyTrend, type ToolPerformanceMetrics, type PerformanceBaseline, type PerformanceComparison, type PerformanceReport, type LatencySample, } from './performance-tracker.js';
|
|
17
|
-
export { checkDeprecations, checkToolDeprecation, markAsDeprecated, clearDeprecation, getDeprecatedTools, getExpiredTools, getUpcomingRemovals, formatDeprecationWarning, formatDeprecationReport, shouldFailOnDeprecation, DEPRECATION, DEPRECATION_DEFAULTS, DEPRECATION_THRESHOLDS, type DeprecationStatus, type DeprecationWarning, type DeprecationReport, type DeprecationConfig, } from './deprecation-tracker.js';
|
|
18
|
-
export { calculateHealthScore, formatHealthScore, meetsHealthThreshold, getHealthBadgeColor, createHealthHistoryEntry, HEALTH_SCORING, HEALTH_WEIGHTS, GRADE_THRESHOLDS, SEVERITY_THRESHOLDS, HEALTH_PENALTIES, type HealthTrend, type ActionPriority, type HealthActionItem, type HealthComponents, type HealthScore, type HealthHistory, type HealthInput, } from './health-scorer.js';
|
|
19
|
-
export { buildServerTimeline, buildToolTimeline, formatTimeline, formatServerTimelineSummary, generateVisualTimeline, serializeTimeline, deserializeTimeline, serializeServerTimeline, deserializeServerTimeline, getMostActiveTools, getMostBreakingTools, getBreakingChanges, getVersionAtTime, getChangesBetween, hadBreakingChanges, type SchemaEventType, type SchemaVersion, type SchemaTimeline, type ServerTimeline, type DeprecationEvent, type TimelineStats, type TimelineBuildOptions, } from './schema-evolution.js';
|
|
20
|
-
export { generateMigrationGuide, formatMigrationGuideMarkdown, formatMigrationGuideText, hasBreakingMigrationChanges, getBreakingTools, type MigrationEffort, type MigrationStepType, type CodeExample, type BreakingChange, type MigrationStep, type MigrationGuide, type MigrationStats, } from './migration-generator.js';
|
|
21
|
-
export { generateToolScenarios, generateBaselineScenarios, formatScenariosAsYaml, formatScenariosReport, getScenariosByPriority, getScenariosByCategory, getCriticalScenarios, getSecurityScenarios, type ScenarioCategory, type ScenarioPriority, type TestScenario, type AutoGeneratedScenarios, type ScenarioGenerationSummary, type ScenarioGenerationResult, type ScenarioGenerationConfig, } from './scenario-generator.js';
|
|
22
|
-
export { generatePRComment, generateCompactPRComment, generateCIStatusSummary, generateDiffTable, generateBadgeUrl, generateBadgeMarkdown, getBadgeColor, shouldBlockMerge, getSeverityEmoji, type BadgeColor, type CommentSection, type AffectedWorkflow, type PRComment, type PRCommentConfig, } from './pr-comment-generator.js';
|
|
15
|
+
export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence, type LatencyTrend, type ToolPerformanceMetrics, type PerformanceBaseline, type PerformanceComparison, type PerformanceReport, type LatencySample, } from './performance-tracker.js';
|
|
23
16
|
export type { SecurityCategory, RiskLevel, SecurityPayload, SecurityTestResult, SecurityFinding, SecurityFingerprint, SecurityDiff, SecurityTestOptions, SecurityTestContext, SecurityToolCallResult, SecurityReport, } from '../security/types.js';
|
|
24
17
|
export { runSecurityTests, compareSecurityFingerprints, getRiskLevelFromScore, parseSecurityCategories, getPayloadsForCategory, getAllSecurityPayloads, getAllSecurityCategories, } from '../security/index.js';
|
|
25
18
|
export type { HttpStatusCategory, ErrorSeverity, EnhancedErrorAnalysis, ErrorAnalysisSummary, ErrorTrend, ErrorTrendReport, } from './error-analyzer.js';
|
|
26
19
|
export { analyzeError, analyzeErrorPatterns, generateErrorSummary, analyzeErrorTrends, extractHttpStatus, categorizeHttpStatus, inferRootCause, generateRemediation, extractRelatedParameters, isTransientError, assessErrorSeverity, mapStatusToErrorCategory, formatEnhancedError, formatErrorTrendReport, formatCategoryName, } from './error-analyzer.js';
|
|
27
20
|
export { scoreDocumentation, scoreToolDocumentation, calculateDescriptionCoverage, calculateDescriptionQuality, calculateParameterDocumentation, calculateExampleCoverage, hasExamples, scoreToGrade, generateSuggestions, compareDocumentationScores, formatDocumentationScore, formatDocumentationScoreCompact, formatDocumentationScoreChange, toDocumentationScoreSummary, getGradeIndicator, getGradeBadgeColor, meetsDocumentationThreshold, meetsDocumentationGrade, type DocumentationIssueSeverity, type DocumentationIssueType, } from './documentation-scorer.js';
|
|
28
|
-
export { calculateAICompatibilityScore, generateAICompatibilityMarkdown, type AICompatibilityScore, type ScoreComponent, type AICompatibilityRecommendation, type ToolAIScore, type AICompatibilityInput, } from './ai-compatibility-scorer.js';
|
|
29
|
-
export { calculateRiskScore, generateRiskScoreMarkdown, type RegressionRiskScore, type RiskFactor, } from './risk-scorer.js';
|
|
30
|
-
export { calculatePruningDecisions, calculateToolPruning, prioritizeTools, generatePruningSummary, generatePruningMarkdown, type TestCategory, type TestCategoryDecision, type ToolPruningDecision, type ToolCharacteristics, type PruningInput, type PruningSummary, } from './test-pruner.js';
|
|
31
21
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/baseline/index.js
CHANGED
|
@@ -5,39 +5,19 @@ export { getBaselineGeneratedAt, getBaselineHash, getBaselineServerCommand, getB
|
|
|
5
5
|
export { createBaseline, saveBaseline, loadBaseline, verifyBaselineHash, baselineExists, recalculateBaselineHash, acceptDrift, hasAcceptance, clearAcceptance, } from './saver.js';
|
|
6
6
|
export { compareWithBaseline, compareBaselines, hasBreakingChanges, hasSecurityChanges, filterByMinimumSeverity, checkBaselineVersionCompatibility, compareSeverity, severityMeetsThreshold, applyAspectOverride, applySeverityConfig, shouldFailOnDiff, } from './comparator.js';
|
|
7
7
|
export { formatDiffText, formatDiffJson, formatDiffCompact, formatDiffGitHubActions, formatDiffMarkdown, formatDiffJUnit, formatDiffSarif, formatSecurityReport, } from './diff.js';
|
|
8
|
-
export {
|
|
8
|
+
export { createBaselineFromInterview } from './converter.js';
|
|
9
9
|
export { computeSchemaHash, compareSchemas, computeConsensusSchemaHash, } from './schema-compare.js';
|
|
10
10
|
export { getBaselineVersion, parseVersion, areVersionsCompatible, compareVersions, getCompatibilityWarning, checkVersionCompatibility, assertVersionCompatibility, formatVersion, isCurrentVersion, isOlderVersion, isNewerVersion, requiresMigration, BaselineVersionError, } from './version.js';
|
|
11
|
-
// Legacy baseline migrations removed;
|
|
11
|
+
// Legacy baseline migrations removed; current baseline format is canonical.
|
|
12
12
|
// Incremental checking
|
|
13
13
|
export { analyzeForIncremental, mergeFingerprints, formatIncrementalSummary, isIncrementalWorthwhile, addIncrementalMetadata, } from './incremental-checker.js';
|
|
14
14
|
export { analyzeResponses, inferSchemaFromValue, compareFingerprints, compareErrorPatterns, computeInferredSchemaHash, } from './response-fingerprint.js';
|
|
15
15
|
// Response schema evolution tracking
|
|
16
16
|
export { compareInferredSchemas, buildSchemaEvolution, compareSchemaEvolution, formatSchemaEvolution, formatSchemaEvolutionDiff, hasSchemaEvolutionIssues, getSchemaStabilityGrade, } from './response-schema-tracker.js';
|
|
17
|
-
// Change impact analysis
|
|
18
|
-
export { analyzeToolChangeImpact, analyzeDiffImpact, analyzeSchemaChanges, isBreakingChange, getBreakingChangeSummary, CHANGE_IMPACT, } from './change-impact-analyzer.js';
|
|
19
17
|
// Performance tracking
|
|
20
|
-
export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence,
|
|
21
|
-
// Deprecation tracking
|
|
22
|
-
export { checkDeprecations, checkToolDeprecation, markAsDeprecated, clearDeprecation, getDeprecatedTools, getExpiredTools, getUpcomingRemovals, formatDeprecationWarning, formatDeprecationReport, shouldFailOnDeprecation, DEPRECATION, DEPRECATION_DEFAULTS, DEPRECATION_THRESHOLDS, } from './deprecation-tracker.js';
|
|
23
|
-
// Health scoring
|
|
24
|
-
export { calculateHealthScore, formatHealthScore, meetsHealthThreshold, getHealthBadgeColor, createHealthHistoryEntry, HEALTH_SCORING, HEALTH_WEIGHTS, GRADE_THRESHOLDS, SEVERITY_THRESHOLDS, HEALTH_PENALTIES, } from './health-scorer.js';
|
|
25
|
-
// Schema evolution timeline
|
|
26
|
-
export { buildServerTimeline, buildToolTimeline, formatTimeline, formatServerTimelineSummary, generateVisualTimeline, serializeTimeline, deserializeTimeline, serializeServerTimeline, deserializeServerTimeline, getMostActiveTools, getMostBreakingTools, getBreakingChanges, getVersionAtTime, getChangesBetween, hadBreakingChanges, } from './schema-evolution.js';
|
|
27
|
-
// Migration guide generation
|
|
28
|
-
export { generateMigrationGuide, formatMigrationGuideMarkdown, formatMigrationGuideText, hasBreakingMigrationChanges, getBreakingTools, } from './migration-generator.js';
|
|
29
|
-
// Auto-generated test scenarios
|
|
30
|
-
export { generateToolScenarios, generateBaselineScenarios, formatScenariosAsYaml, formatScenariosReport, getScenariosByPriority, getScenariosByCategory, getCriticalScenarios, getSecurityScenarios, } from './scenario-generator.js';
|
|
31
|
-
// Enhanced PR comments
|
|
32
|
-
export { generatePRComment, generateCompactPRComment, generateCIStatusSummary, generateDiffTable, generateBadgeUrl, generateBadgeMarkdown, getBadgeColor, shouldBlockMerge, getSeverityEmoji, } from './pr-comment-generator.js';
|
|
18
|
+
export { calculateMetrics, createPerformanceBaseline, extractPerformanceBaselines, comparePerformance, generatePerformanceReport, formatMetrics, formatComparison, isPerformanceAcceptable, aggregateSamplesByTool, calculatePerformanceConfidence, calculateConfidenceFromMetrics, formatConfidenceLevel, hasReliableConfidence, } from './performance-tracker.js';
|
|
33
19
|
export { runSecurityTests, compareSecurityFingerprints, getRiskLevelFromScore, parseSecurityCategories, getPayloadsForCategory, getAllSecurityPayloads, getAllSecurityCategories, } from '../security/index.js';
|
|
34
20
|
export { analyzeError, analyzeErrorPatterns, generateErrorSummary, analyzeErrorTrends, extractHttpStatus, categorizeHttpStatus, inferRootCause, generateRemediation, extractRelatedParameters, isTransientError, assessErrorSeverity, mapStatusToErrorCategory, formatEnhancedError, formatErrorTrendReport, formatCategoryName, } from './error-analyzer.js';
|
|
35
21
|
// Documentation quality scoring
|
|
36
22
|
export { scoreDocumentation, scoreToolDocumentation, calculateDescriptionCoverage, calculateDescriptionQuality, calculateParameterDocumentation, calculateExampleCoverage, hasExamples, scoreToGrade, generateSuggestions, compareDocumentationScores, formatDocumentationScore, formatDocumentationScoreCompact, formatDocumentationScoreChange, toDocumentationScoreSummary, getGradeIndicator, getGradeBadgeColor, meetsDocumentationThreshold, meetsDocumentationGrade, } from './documentation-scorer.js';
|
|
37
|
-
// AI Agent Compatibility Scoring
|
|
38
|
-
export { calculateAICompatibilityScore, generateAICompatibilityMarkdown, } from './ai-compatibility-scorer.js';
|
|
39
|
-
// Regression Risk Scoring
|
|
40
|
-
export { calculateRiskScore, generateRiskScoreMarkdown, } from './risk-scorer.js';
|
|
41
|
-
// Intelligent Test Pruning
|
|
42
|
-
export { calculatePruningDecisions, calculateToolPruning, prioritizeTools, generatePruningSummary, generatePruningMarkdown, } from './test-pruner.js';
|
|
43
23
|
//# sourceMappingURL=index.js.map
|
|
@@ -6,8 +6,6 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { getBaselineGeneratedAt, getToolFingerprints } from './accessors.js';
|
|
8
8
|
import { PERFORMANCE_TRACKING, PERFORMANCE_CONFIDENCE } from '../constants.js';
|
|
9
|
-
// Re-export centralized constant for backwards compatibility
|
|
10
|
-
export { PERFORMANCE_TRACKING as PERFORMANCE } from '../constants.js';
|
|
11
9
|
/**
|
|
12
10
|
* Calculate statistical confidence for performance metrics.
|
|
13
11
|
*
|
|
@@ -45,14 +43,14 @@ export function calculatePerformanceConfidence(samples, options = {}) {
|
|
|
45
43
|
}
|
|
46
44
|
// Categorize samples by expected outcome
|
|
47
45
|
// Happy path tests: expectedOutcome === 'success' or undefined (backward compat)
|
|
48
|
-
const happyPathSamples = samples.filter(s => s.expectedOutcome === 'success' || s.expectedOutcome === undefined);
|
|
46
|
+
const happyPathSamples = samples.filter((s) => s.expectedOutcome === 'success' || s.expectedOutcome === undefined);
|
|
49
47
|
// Validation tests: expectedOutcome === 'error'
|
|
50
|
-
const validationTestSamples = samples.filter(s => s.expectedOutcome === 'error');
|
|
48
|
+
const validationTestSamples = samples.filter((s) => s.expectedOutcome === 'error');
|
|
51
49
|
// Count validation samples that correctly rejected (error as expected = success)
|
|
52
|
-
const validationSuccesses = validationTestSamples.filter(s => !s.success && (s.outcomeCorrect === undefined || s.outcomeCorrect === true)).length;
|
|
50
|
+
const validationSuccesses = validationTestSamples.filter((s) => !s.success && (s.outcomeCorrect === undefined || s.outcomeCorrect === true)).length;
|
|
53
51
|
// For confidence, only use happy path samples that succeeded
|
|
54
|
-
const successfulHappyPath = happyPathSamples.filter(s => s.success);
|
|
55
|
-
const allDurations = successfulHappyPath.map(s => s.durationMs);
|
|
52
|
+
const successfulHappyPath = happyPathSamples.filter((s) => s.success);
|
|
53
|
+
const allDurations = successfulHappyPath.map((s) => s.durationMs);
|
|
56
54
|
// Handle all failures case
|
|
57
55
|
if (allDurations.length === 0) {
|
|
58
56
|
return {
|
|
@@ -69,12 +67,10 @@ export function calculatePerformanceConfidence(samples, options = {}) {
|
|
|
69
67
|
// For variance calculation, exclude the first sample (cold start warmup)
|
|
70
68
|
// This prevents JIT compilation, connection setup, and cache warming from
|
|
71
69
|
// inflating the coefficient of variation and lowering confidence scores.
|
|
72
|
-
const durationsForVariance = excludeWarmup && allDurations.length > 1
|
|
73
|
-
? allDurations.slice(1)
|
|
74
|
-
: allDurations;
|
|
70
|
+
const durationsForVariance = excludeWarmup && allDurations.length > 1 ? allDurations.slice(1) : allDurations;
|
|
75
71
|
// Calculate variance using post-warmup samples only
|
|
76
72
|
const meanForVariance = durationsForVariance.reduce((sum, d) => sum + d, 0) / durationsForVariance.length;
|
|
77
|
-
const squaredDiffs = durationsForVariance.map(d => Math.pow(d - meanForVariance, 2));
|
|
73
|
+
const squaredDiffs = durationsForVariance.map((d) => Math.pow(d - meanForVariance, 2));
|
|
78
74
|
const variance = squaredDiffs.reduce((sum, d) => sum + d, 0) / durationsForVariance.length;
|
|
79
75
|
const standardDeviation = Math.sqrt(variance);
|
|
80
76
|
// Calculate coefficient of variation (CV = stdDev / mean)
|
|
@@ -186,8 +182,8 @@ export function calculateMetrics(samples) {
|
|
|
186
182
|
return null;
|
|
187
183
|
}
|
|
188
184
|
const toolName = samples[0].toolName;
|
|
189
|
-
const successfulSamples = samples.filter(s => s.success);
|
|
190
|
-
const durations = successfulSamples.map(s => s.durationMs).sort((a, b) => a - b);
|
|
185
|
+
const successfulSamples = samples.filter((s) => s.success);
|
|
186
|
+
const durations = successfulSamples.map((s) => s.durationMs).sort((a, b) => a - b);
|
|
191
187
|
if (durations.length === 0) {
|
|
192
188
|
// All calls failed
|
|
193
189
|
const confidence = calculatePerformanceConfidence(samples);
|
|
@@ -213,7 +209,7 @@ export function calculateMetrics(samples) {
|
|
|
213
209
|
const minMs = durations[0];
|
|
214
210
|
const maxMs = durations[durations.length - 1];
|
|
215
211
|
// Calculate standard deviation
|
|
216
|
-
const squaredDiffs = durations.map(d => Math.pow(d - avgMs, 2));
|
|
212
|
+
const squaredDiffs = durations.map((d) => Math.pow(d - avgMs, 2));
|
|
217
213
|
const avgSquaredDiff = squaredDiffs.reduce((sum, d) => sum + d, 0) / squaredDiffs.length;
|
|
218
214
|
const stdDevMs = Math.sqrt(avgSquaredDiff);
|
|
219
215
|
// Calculate confidence from samples
|
|
@@ -323,8 +319,8 @@ export function comparePerformance(current, baseline, regressionThreshold = PERF
|
|
|
323
319
|
const trend = determineTrend(p50Regression);
|
|
324
320
|
// Check for regression
|
|
325
321
|
const maxRegression = baseline.maxAllowedRegression ?? regressionThreshold;
|
|
326
|
-
const hasRegression = p50Regression !== null && p50Regression > maxRegression ||
|
|
327
|
-
p95Regression !== null && p95Regression > maxRegression;
|
|
322
|
+
const hasRegression = (p50Regression !== null && p50Regression > maxRegression) ||
|
|
323
|
+
(p95Regression !== null && p95Regression > maxRegression);
|
|
328
324
|
// Determine severity
|
|
329
325
|
const severity = determinePerformanceSeverity(p50Regression, p95Regression, maxRegression);
|
|
330
326
|
// Generate summary (include confidence note if low)
|
|
@@ -520,10 +516,7 @@ export function formatMetrics(metrics) {
|
|
|
520
516
|
* Format performance comparison for display.
|
|
521
517
|
*/
|
|
522
518
|
export function formatComparison(comparison) {
|
|
523
|
-
const lines = [
|
|
524
|
-
`Tool: ${comparison.toolName}`,
|
|
525
|
-
` Trend: ${comparison.trend.toUpperCase()}`,
|
|
526
|
-
];
|
|
519
|
+
const lines = [`Tool: ${comparison.toolName}`, ` Trend: ${comparison.trend.toUpperCase()}`];
|
|
527
520
|
if (comparison.p50RegressionPercent !== null) {
|
|
528
521
|
const sign = comparison.p50RegressionPercent >= 0 ? '+' : '';
|
|
529
522
|
lines.push(` p50 change: ${sign}${(comparison.p50RegressionPercent * 100).toFixed(1)}%`);
|