@dotsetlabs/bellwether 1.0.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +74 -0
- package/README.md +8 -2
- package/dist/baseline/accessors.d.ts +1 -1
- package/dist/baseline/accessors.js +1 -3
- package/dist/baseline/baseline-format.d.ts +287 -0
- package/dist/baseline/baseline-format.js +12 -0
- package/dist/baseline/comparator.js +249 -11
- package/dist/baseline/converter.d.ts +15 -15
- package/dist/baseline/converter.js +46 -34
- package/dist/baseline/diff.d.ts +1 -1
- package/dist/baseline/diff.js +45 -28
- package/dist/baseline/error-analyzer.d.ts +1 -1
- package/dist/baseline/error-analyzer.js +90 -17
- package/dist/baseline/incremental-checker.js +8 -5
- package/dist/baseline/index.d.ts +2 -12
- package/dist/baseline/index.js +3 -23
- package/dist/baseline/performance-tracker.d.ts +0 -1
- package/dist/baseline/performance-tracker.js +13 -20
- package/dist/baseline/response-fingerprint.js +39 -2
- package/dist/baseline/saver.js +41 -10
- package/dist/baseline/schema-compare.d.ts +22 -0
- package/dist/baseline/schema-compare.js +259 -16
- package/dist/baseline/types.d.ts +10 -7
- package/dist/cache/response-cache.d.ts +8 -0
- package/dist/cache/response-cache.js +110 -0
- package/dist/cli/commands/check.js +23 -6
- package/dist/cli/commands/explore.js +34 -14
- package/dist/cli/index.js +8 -0
- package/dist/config/template.js +8 -7
- package/dist/config/validator.d.ts +59 -59
- package/dist/config/validator.js +245 -90
- package/dist/constants/core.d.ts +4 -0
- package/dist/constants/core.js +8 -19
- package/dist/constants/registry.d.ts +17 -0
- package/dist/constants/registry.js +18 -0
- package/dist/constants/testing.d.ts +0 -369
- package/dist/constants/testing.js +18 -456
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/docs/contract.js +131 -83
- package/dist/docs/report.js +8 -5
- package/dist/interview/insights.d.ts +17 -0
- package/dist/interview/insights.js +52 -0
- package/dist/interview/interviewer.js +52 -10
- package/dist/interview/prompt-test-generator.d.ts +12 -0
- package/dist/interview/prompt-test-generator.js +77 -0
- package/dist/interview/resource-test-generator.d.ts +12 -0
- package/dist/interview/resource-test-generator.js +20 -0
- package/dist/interview/schema-inferrer.js +26 -4
- package/dist/interview/schema-test-generator.js +278 -31
- package/dist/interview/stateful-test-runner.d.ts +3 -0
- package/dist/interview/stateful-test-runner.js +80 -0
- package/dist/interview/types.d.ts +12 -0
- package/dist/transport/mcp-client.js +1 -1
- package/dist/transport/sse-transport.d.ts +7 -3
- package/dist/transport/sse-transport.js +157 -67
- package/dist/version.js +1 -1
- package/man/bellwether.1 +1 -1
- package/man/bellwether.1.md +2 -2
- package/package.json +1 -1
- package/schemas/bellwether-check.schema.json +185 -0
- package/schemas/bellwether-explore.schema.json +837 -0
- package/scripts/completions/bellwether.bash +10 -4
- package/scripts/completions/bellwether.zsh +55 -2
|
@@ -11,12 +11,12 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import { createBaseline } from './saver.js';
|
|
13
13
|
import { getToolFingerprints } from './accessors.js';
|
|
14
|
-
import { compareFingerprints, compareErrorPatterns
|
|
14
|
+
import { compareFingerprints, compareErrorPatterns } from './response-fingerprint.js';
|
|
15
15
|
import { analyzeErrorTrends } from './error-analyzer.js';
|
|
16
16
|
import { compareSecurityFingerprints } from '../security/security-tester.js';
|
|
17
17
|
import { compareSchemaEvolution } from './response-schema-tracker.js';
|
|
18
18
|
import { checkVersionCompatibility, BaselineVersionError, parseVersion, areVersionsCompatible, getCompatibilityWarning, } from './version.js';
|
|
19
|
-
import { compareSchemas } from './schema-compare.js';
|
|
19
|
+
import { compareSchemas, computeSchemaHash } from './schema-compare.js';
|
|
20
20
|
import { PERFORMANCE_TRACKING } from '../constants.js';
|
|
21
21
|
import { hasReliableConfidence } from './performance-tracker.js';
|
|
22
22
|
import { compareDocumentationScores, scoreDocumentation } from './documentation-scorer.js';
|
|
@@ -89,6 +89,11 @@ export function compareBaselines(previous, current, options = {}) {
|
|
|
89
89
|
behaviorChanges.push(...toolDiff.changes);
|
|
90
90
|
}
|
|
91
91
|
}
|
|
92
|
+
// Compare prompts and resources
|
|
93
|
+
behaviorChanges.push(...comparePrompts(previous.capabilities.prompts, current.capabilities.prompts));
|
|
94
|
+
behaviorChanges.push(...compareResources(previous.capabilities.resources, current.capabilities.resources));
|
|
95
|
+
// Compare server metadata and capabilities
|
|
96
|
+
behaviorChanges.push(...compareServerInfo(previous.server, current.server));
|
|
92
97
|
// Compare workflows
|
|
93
98
|
const workflowChanges = compareWorkflows(previous.workflows || [], current.workflows || []);
|
|
94
99
|
behaviorChanges.push(...workflowChanges);
|
|
@@ -131,8 +136,10 @@ function compareTool(previous, current, options) {
|
|
|
131
136
|
let responseSchemaEvolutionChanged = false;
|
|
132
137
|
let securityChanged = false;
|
|
133
138
|
let schemaEvolutionDiff;
|
|
134
|
-
// Compare input schema with detailed diff
|
|
135
|
-
|
|
139
|
+
// Compare input schema with detailed diff (declared schema hash)
|
|
140
|
+
const previousDeclaredHash = getDeclaredSchemaHash(previous);
|
|
141
|
+
const currentDeclaredHash = getDeclaredSchemaHash(current);
|
|
142
|
+
if (previousDeclaredHash !== currentDeclaredHash && !options.ignoreSchemaChanges) {
|
|
136
143
|
schemaChanged = true;
|
|
137
144
|
// Get detailed schema comparison if inputSchema is available on both
|
|
138
145
|
const schemaComparison = compareSchemas(previous.inputSchema, current.inputSchema);
|
|
@@ -155,8 +162,8 @@ function compareTool(previous, current, options) {
|
|
|
155
162
|
changes.push({
|
|
156
163
|
tool: current.name,
|
|
157
164
|
aspect: 'schema',
|
|
158
|
-
before: `Schema hash: ${
|
|
159
|
-
after: `Schema hash: ${
|
|
165
|
+
before: `Schema hash: ${previousDeclaredHash}`,
|
|
166
|
+
after: `Schema hash: ${currentDeclaredHash}`,
|
|
160
167
|
severity: 'breaking',
|
|
161
168
|
description: `Schema for ${current.name} has changed`,
|
|
162
169
|
});
|
|
@@ -314,6 +321,235 @@ function compareTool(previous, current, options) {
|
|
|
314
321
|
schemaEvolutionDiff,
|
|
315
322
|
};
|
|
316
323
|
}
|
|
324
|
+
function comparePrompts(previous, current) {
|
|
325
|
+
const changes = [];
|
|
326
|
+
const prevMap = new Map((previous ?? []).map((p) => [p.name, p]));
|
|
327
|
+
const currMap = new Map((current ?? []).map((p) => [p.name, p]));
|
|
328
|
+
for (const [name, currPrompt] of currMap) {
|
|
329
|
+
const prevPrompt = prevMap.get(name);
|
|
330
|
+
if (!prevPrompt) {
|
|
331
|
+
changes.push({
|
|
332
|
+
tool: `prompt:${name}`,
|
|
333
|
+
aspect: 'prompt',
|
|
334
|
+
before: 'absent',
|
|
335
|
+
after: 'present',
|
|
336
|
+
severity: 'info',
|
|
337
|
+
description: `Prompt "${name}" added`,
|
|
338
|
+
});
|
|
339
|
+
continue;
|
|
340
|
+
}
|
|
341
|
+
if (prevPrompt.description !== currPrompt.description) {
|
|
342
|
+
changes.push({
|
|
343
|
+
tool: `prompt:${name}`,
|
|
344
|
+
aspect: 'prompt',
|
|
345
|
+
before: prevPrompt.description ?? 'none',
|
|
346
|
+
after: currPrompt.description ?? 'none',
|
|
347
|
+
severity: 'info',
|
|
348
|
+
description: `Prompt "${name}" description changed`,
|
|
349
|
+
});
|
|
350
|
+
}
|
|
351
|
+
const prevArgs = prevPrompt.arguments ?? [];
|
|
352
|
+
const currArgs = currPrompt.arguments ?? [];
|
|
353
|
+
const prevArgMap = new Map(prevArgs.map((a) => [a.name, a]));
|
|
354
|
+
const currArgMap = new Map(currArgs.map((a) => [a.name, a]));
|
|
355
|
+
for (const [argName, currArg] of currArgMap) {
|
|
356
|
+
const prevArg = prevArgMap.get(argName);
|
|
357
|
+
if (!prevArg) {
|
|
358
|
+
changes.push({
|
|
359
|
+
tool: `prompt:${name}`,
|
|
360
|
+
aspect: 'prompt',
|
|
361
|
+
before: 'absent',
|
|
362
|
+
after: 'present',
|
|
363
|
+
severity: currArg.required ? 'breaking' : 'info',
|
|
364
|
+
description: `Prompt "${name}" argument "${argName}" added`,
|
|
365
|
+
});
|
|
366
|
+
continue;
|
|
367
|
+
}
|
|
368
|
+
if (prevArg.required !== currArg.required) {
|
|
369
|
+
changes.push({
|
|
370
|
+
tool: `prompt:${name}`,
|
|
371
|
+
aspect: 'prompt',
|
|
372
|
+
before: String(prevArg.required ?? false),
|
|
373
|
+
after: String(currArg.required ?? false),
|
|
374
|
+
severity: currArg.required ? 'breaking' : 'warning',
|
|
375
|
+
description: `Prompt "${name}" argument "${argName}" requirement changed`,
|
|
376
|
+
});
|
|
377
|
+
}
|
|
378
|
+
if (prevArg.description !== currArg.description) {
|
|
379
|
+
changes.push({
|
|
380
|
+
tool: `prompt:${name}`,
|
|
381
|
+
aspect: 'prompt',
|
|
382
|
+
before: prevArg.description ?? 'none',
|
|
383
|
+
after: currArg.description ?? 'none',
|
|
384
|
+
severity: 'info',
|
|
385
|
+
description: `Prompt "${name}" argument "${argName}" description changed`,
|
|
386
|
+
});
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
for (const [argName] of prevArgMap) {
|
|
390
|
+
if (!currArgMap.has(argName)) {
|
|
391
|
+
changes.push({
|
|
392
|
+
tool: `prompt:${name}`,
|
|
393
|
+
aspect: 'prompt',
|
|
394
|
+
before: 'present',
|
|
395
|
+
after: 'absent',
|
|
396
|
+
severity: 'breaking',
|
|
397
|
+
description: `Prompt "${name}" argument "${argName}" removed`,
|
|
398
|
+
});
|
|
399
|
+
}
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
for (const [name] of prevMap) {
|
|
403
|
+
if (!currMap.has(name)) {
|
|
404
|
+
changes.push({
|
|
405
|
+
tool: `prompt:${name}`,
|
|
406
|
+
aspect: 'prompt',
|
|
407
|
+
before: 'present',
|
|
408
|
+
after: 'absent',
|
|
409
|
+
severity: 'breaking',
|
|
410
|
+
description: `Prompt "${name}" removed`,
|
|
411
|
+
});
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
return changes;
|
|
415
|
+
}
|
|
416
|
+
function compareResources(previous, current) {
|
|
417
|
+
const changes = [];
|
|
418
|
+
const prevMap = new Map((previous ?? []).map((r) => [r.uri, r]));
|
|
419
|
+
const currMap = new Map((current ?? []).map((r) => [r.uri, r]));
|
|
420
|
+
for (const [uri, currResource] of currMap) {
|
|
421
|
+
const prevResource = prevMap.get(uri);
|
|
422
|
+
if (!prevResource) {
|
|
423
|
+
changes.push({
|
|
424
|
+
tool: `resource:${currResource.name ?? uri}`,
|
|
425
|
+
aspect: 'resource',
|
|
426
|
+
before: 'absent',
|
|
427
|
+
after: 'present',
|
|
428
|
+
severity: 'info',
|
|
429
|
+
description: `Resource "${uri}" added`,
|
|
430
|
+
});
|
|
431
|
+
continue;
|
|
432
|
+
}
|
|
433
|
+
if (prevResource.name !== currResource.name) {
|
|
434
|
+
changes.push({
|
|
435
|
+
tool: `resource:${currResource.name ?? uri}`,
|
|
436
|
+
aspect: 'resource',
|
|
437
|
+
before: prevResource.name ?? 'none',
|
|
438
|
+
after: currResource.name ?? 'none',
|
|
439
|
+
severity: 'info',
|
|
440
|
+
description: `Resource "${uri}" name changed`,
|
|
441
|
+
});
|
|
442
|
+
}
|
|
443
|
+
if (prevResource.description !== currResource.description) {
|
|
444
|
+
changes.push({
|
|
445
|
+
tool: `resource:${currResource.name ?? uri}`,
|
|
446
|
+
aspect: 'resource',
|
|
447
|
+
before: prevResource.description ?? 'none',
|
|
448
|
+
after: currResource.description ?? 'none',
|
|
449
|
+
severity: 'info',
|
|
450
|
+
description: `Resource "${uri}" description changed`,
|
|
451
|
+
});
|
|
452
|
+
}
|
|
453
|
+
if (prevResource.mimeType !== currResource.mimeType) {
|
|
454
|
+
changes.push({
|
|
455
|
+
tool: `resource:${currResource.name ?? uri}`,
|
|
456
|
+
aspect: 'resource',
|
|
457
|
+
before: prevResource.mimeType ?? 'none',
|
|
458
|
+
after: currResource.mimeType ?? 'none',
|
|
459
|
+
severity: 'warning',
|
|
460
|
+
description: `Resource "${uri}" mime type changed`,
|
|
461
|
+
});
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
for (const [uri, prevResource] of prevMap) {
|
|
465
|
+
if (!currMap.has(uri)) {
|
|
466
|
+
changes.push({
|
|
467
|
+
tool: `resource:${prevResource.name ?? uri}`,
|
|
468
|
+
aspect: 'resource',
|
|
469
|
+
before: 'present',
|
|
470
|
+
after: 'absent',
|
|
471
|
+
severity: 'breaking',
|
|
472
|
+
description: `Resource "${uri}" removed`,
|
|
473
|
+
});
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
return changes;
|
|
477
|
+
}
|
|
478
|
+
function compareServerInfo(previous, current) {
|
|
479
|
+
const changes = [];
|
|
480
|
+
if (previous.name !== current.name) {
|
|
481
|
+
changes.push({
|
|
482
|
+
tool: 'server',
|
|
483
|
+
aspect: 'server',
|
|
484
|
+
before: previous.name,
|
|
485
|
+
after: current.name,
|
|
486
|
+
severity: 'info',
|
|
487
|
+
description: 'Server name changed',
|
|
488
|
+
});
|
|
489
|
+
}
|
|
490
|
+
if (previous.version !== current.version) {
|
|
491
|
+
changes.push({
|
|
492
|
+
tool: 'server',
|
|
493
|
+
aspect: 'server',
|
|
494
|
+
before: previous.version,
|
|
495
|
+
after: current.version,
|
|
496
|
+
severity: 'info',
|
|
497
|
+
description: 'Server version changed',
|
|
498
|
+
});
|
|
499
|
+
}
|
|
500
|
+
if (previous.protocolVersion !== current.protocolVersion) {
|
|
501
|
+
const breaking = isMajorVersionChange(previous.protocolVersion, current.protocolVersion);
|
|
502
|
+
changes.push({
|
|
503
|
+
tool: 'server',
|
|
504
|
+
aspect: 'server',
|
|
505
|
+
before: previous.protocolVersion,
|
|
506
|
+
after: current.protocolVersion,
|
|
507
|
+
severity: breaking ? 'breaking' : 'warning',
|
|
508
|
+
description: 'Protocol version changed',
|
|
509
|
+
});
|
|
510
|
+
}
|
|
511
|
+
const prevCaps = new Set(previous.capabilities);
|
|
512
|
+
const currCaps = new Set(current.capabilities);
|
|
513
|
+
for (const cap of prevCaps) {
|
|
514
|
+
if (!currCaps.has(cap)) {
|
|
515
|
+
changes.push({
|
|
516
|
+
tool: 'server',
|
|
517
|
+
aspect: 'capability',
|
|
518
|
+
before: cap,
|
|
519
|
+
after: 'removed',
|
|
520
|
+
severity: 'breaking',
|
|
521
|
+
description: `Capability "${cap}" removed`,
|
|
522
|
+
});
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
for (const cap of currCaps) {
|
|
526
|
+
if (!prevCaps.has(cap)) {
|
|
527
|
+
changes.push({
|
|
528
|
+
tool: 'server',
|
|
529
|
+
aspect: 'capability',
|
|
530
|
+
before: 'absent',
|
|
531
|
+
after: cap,
|
|
532
|
+
severity: 'info',
|
|
533
|
+
description: `Capability "${cap}" added`,
|
|
534
|
+
});
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
return changes;
|
|
538
|
+
}
|
|
539
|
+
function isMajorVersionChange(previous, current) {
|
|
540
|
+
const prevMajor = parseInt(previous.split('.')[0] ?? '0', 10);
|
|
541
|
+
const currMajor = parseInt(current.split('.')[0] ?? '0', 10);
|
|
542
|
+
if (Number.isNaN(prevMajor) || Number.isNaN(currMajor)) {
|
|
543
|
+
return previous !== current;
|
|
544
|
+
}
|
|
545
|
+
return prevMajor !== currMajor;
|
|
546
|
+
}
|
|
547
|
+
function getDeclaredSchemaHash(tool) {
|
|
548
|
+
if (tool.inputSchema && Object.keys(tool.inputSchema).length > 0) {
|
|
549
|
+
return computeSchemaHash(tool.inputSchema);
|
|
550
|
+
}
|
|
551
|
+
return tool.schemaHash;
|
|
552
|
+
}
|
|
317
553
|
/**
|
|
318
554
|
* Format a schema change value for display in BehaviorChange.
|
|
319
555
|
* Converts unknown values to human-readable strings.
|
|
@@ -331,8 +567,11 @@ function formatSchemaChangeValue(value) {
|
|
|
331
567
|
if (value.length === 0)
|
|
332
568
|
return '[]';
|
|
333
569
|
if (value.length <= 3)
|
|
334
|
-
return `[${value.map(v => formatSchemaChangeValue(v)).join(', ')}]`;
|
|
335
|
-
return `[${value
|
|
570
|
+
return `[${value.map((v) => formatSchemaChangeValue(v)).join(', ')}]`;
|
|
571
|
+
return `[${value
|
|
572
|
+
.slice(0, 3)
|
|
573
|
+
.map((v) => formatSchemaChangeValue(v))
|
|
574
|
+
.join(', ')}, ...]`;
|
|
336
575
|
}
|
|
337
576
|
// For objects, show a compact representation
|
|
338
577
|
try {
|
|
@@ -474,7 +713,7 @@ export function applyAspectOverride(change, aspectOverrides) {
|
|
|
474
713
|
* Returns a new diff with filtered/modified changes based on config.
|
|
475
714
|
*/
|
|
476
715
|
export function applySeverityConfig(diff, config) {
|
|
477
|
-
const { minimumSeverity = 'none', suppressWarnings = false, aspectOverrides
|
|
716
|
+
const { minimumSeverity = 'none', suppressWarnings = false, aspectOverrides } = config;
|
|
478
717
|
// Apply aspect overrides and filter by minimum severity
|
|
479
718
|
const filteredChanges = diff.behaviorChanges
|
|
480
719
|
.map((change) => {
|
|
@@ -495,8 +734,7 @@ export function applySeverityConfig(diff, config) {
|
|
|
495
734
|
// Filter toolsModified to only include those with remaining changes
|
|
496
735
|
const toolsWithChanges = new Set(filteredChanges.map((c) => c.tool));
|
|
497
736
|
const filteredToolsModified = diff.toolsModified.filter((td) => toolsWithChanges.has(td.tool) ||
|
|
498
|
-
(td.schemaChanged &&
|
|
499
|
-
(!aspectOverrides?.schema || aspectOverrides.schema !== 'none')) ||
|
|
737
|
+
(td.schemaChanged && (!aspectOverrides?.schema || aspectOverrides.schema !== 'none')) ||
|
|
500
738
|
(td.descriptionChanged &&
|
|
501
739
|
(!aspectOverrides?.description || aspectOverrides.description !== 'none')));
|
|
502
740
|
// Recalculate counts
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Baseline builder.
|
|
3
3
|
*
|
|
4
4
|
* ## Severity Type Mappings
|
|
5
5
|
*
|
|
@@ -13,19 +13,19 @@
|
|
|
13
13
|
* Used for error severity classification in error handling.
|
|
14
14
|
* Values: 'low' | 'medium' | 'high' | 'critical'
|
|
15
15
|
*
|
|
16
|
-
* ###
|
|
17
|
-
* Used for
|
|
16
|
+
* ### BaselineAssertionSeverity (baseline-format.ts)
|
|
17
|
+
* Used for baseline assertions and PersonaFinding severity levels.
|
|
18
18
|
* Values: 'info' | 'low' | 'medium' | 'high' | 'critical'
|
|
19
19
|
*
|
|
20
20
|
* ### Conversion Mappings
|
|
21
21
|
*
|
|
22
|
-
* ChangeSeverity →
|
|
22
|
+
* ChangeSeverity → BaselineAssertionSeverity:
|
|
23
23
|
* - 'none' → 'info' (no change, informational)
|
|
24
24
|
* - 'info' → 'low' (minor changes)
|
|
25
25
|
* - 'warning' → 'medium' (moderate changes)
|
|
26
26
|
* - 'breaking' → 'critical' (breaking changes)
|
|
27
27
|
*
|
|
28
|
-
*
|
|
28
|
+
* BaselineAssertionSeverity → ChangeSeverity (for display/filtering):
|
|
29
29
|
* - 'info' → 'info'
|
|
30
30
|
* - 'low' → 'info'
|
|
31
31
|
* - 'medium' → 'warning'
|
|
@@ -34,25 +34,25 @@
|
|
|
34
34
|
*/
|
|
35
35
|
import type { BehavioralAssertion, BehavioralBaseline, ChangeSeverity } from './types.js';
|
|
36
36
|
import type { InterviewResult } from '../interview/types.js';
|
|
37
|
-
import type {
|
|
37
|
+
import type { BaselineAssertion, BaselineAssertionSeverity } from './baseline-format.js';
|
|
38
38
|
/**
|
|
39
|
-
* Map ChangeSeverity to
|
|
40
|
-
* Used when mapping CLI assertions to
|
|
39
|
+
* Map ChangeSeverity to BaselineAssertionSeverity.
|
|
40
|
+
* Used when mapping CLI assertions to baseline severity levels.
|
|
41
41
|
*/
|
|
42
|
-
export declare const
|
|
42
|
+
export declare const CHANGE_TO_BASELINE_SEVERITY: Record<ChangeSeverity, BaselineAssertionSeverity>;
|
|
43
43
|
/**
|
|
44
|
-
* Map
|
|
45
|
-
* Used when filtering or displaying
|
|
44
|
+
* Map BaselineAssertionSeverity to ChangeSeverity.
|
|
45
|
+
* Used when filtering or displaying baseline data locally.
|
|
46
46
|
*/
|
|
47
|
-
export declare const
|
|
47
|
+
export declare const BASELINE_TO_CHANGE_SEVERITY: Record<BaselineAssertionSeverity, ChangeSeverity>;
|
|
48
48
|
/**
|
|
49
|
-
* Convert an array of BehavioralAssertions to
|
|
49
|
+
* Convert an array of BehavioralAssertions to baseline assertions.
|
|
50
50
|
*/
|
|
51
|
-
export declare function convertAssertions(assertions: BehavioralAssertion[]):
|
|
51
|
+
export declare function convertAssertions(assertions: BehavioralAssertion[]): BaselineAssertion[];
|
|
52
52
|
/**
|
|
53
53
|
* Create a BellwetherBaseline directly from InterviewResult.
|
|
54
54
|
*
|
|
55
55
|
* This is the preferred method when you have fresh interview results.
|
|
56
56
|
*/
|
|
57
|
-
export declare function
|
|
57
|
+
export declare function createBaselineFromInterview(result: InterviewResult, serverCommand: string): BehavioralBaseline;
|
|
58
58
|
//# sourceMappingURL=converter.d.ts.map
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Baseline builder.
|
|
3
3
|
*
|
|
4
4
|
* ## Severity Type Mappings
|
|
5
5
|
*
|
|
@@ -13,19 +13,19 @@
|
|
|
13
13
|
* Used for error severity classification in error handling.
|
|
14
14
|
* Values: 'low' | 'medium' | 'high' | 'critical'
|
|
15
15
|
*
|
|
16
|
-
* ###
|
|
17
|
-
* Used for
|
|
16
|
+
* ### BaselineAssertionSeverity (baseline-format.ts)
|
|
17
|
+
* Used for baseline assertions and PersonaFinding severity levels.
|
|
18
18
|
* Values: 'info' | 'low' | 'medium' | 'high' | 'critical'
|
|
19
19
|
*
|
|
20
20
|
* ### Conversion Mappings
|
|
21
21
|
*
|
|
22
|
-
* ChangeSeverity →
|
|
22
|
+
* ChangeSeverity → BaselineAssertionSeverity:
|
|
23
23
|
* - 'none' → 'info' (no change, informational)
|
|
24
24
|
* - 'info' → 'low' (minor changes)
|
|
25
25
|
* - 'warning' → 'medium' (moderate changes)
|
|
26
26
|
* - 'breaking' → 'critical' (breaking changes)
|
|
27
27
|
*
|
|
28
|
-
*
|
|
28
|
+
* BaselineAssertionSeverity → ChangeSeverity (for display/filtering):
|
|
29
29
|
* - 'info' → 'info'
|
|
30
30
|
* - 'low' → 'info'
|
|
31
31
|
* - 'medium' → 'warning'
|
|
@@ -35,27 +35,27 @@
|
|
|
35
35
|
import { createHash } from 'crypto';
|
|
36
36
|
import { analyzeResponses } from './response-fingerprint.js';
|
|
37
37
|
import { buildSchemaEvolution } from './response-schema-tracker.js';
|
|
38
|
-
import { calculateMetrics, calculatePerformanceConfidence } from './performance-tracker.js';
|
|
39
|
-
import { computeConsensusSchemaHash } from './schema-compare.js';
|
|
38
|
+
import { calculateMetrics, calculatePerformanceConfidence, } from './performance-tracker.js';
|
|
39
|
+
import { computeConsensusSchemaHash, computeSchemaHash } from './schema-compare.js';
|
|
40
40
|
import { calculateBaselineHash } from './baseline-hash.js';
|
|
41
41
|
import { getBaselineVersion } from './version.js';
|
|
42
42
|
import { VERSION } from '../version.js';
|
|
43
43
|
import { scoreDocumentation, toDocumentationScoreSummary } from './documentation-scorer.js';
|
|
44
44
|
/**
|
|
45
|
-
* Map ChangeSeverity to
|
|
46
|
-
* Used when mapping CLI assertions to
|
|
45
|
+
* Map ChangeSeverity to BaselineAssertionSeverity.
|
|
46
|
+
* Used when mapping CLI assertions to baseline severity levels.
|
|
47
47
|
*/
|
|
48
|
-
export const
|
|
48
|
+
export const CHANGE_TO_BASELINE_SEVERITY = {
|
|
49
49
|
none: 'info',
|
|
50
50
|
info: 'low',
|
|
51
51
|
warning: 'medium',
|
|
52
52
|
breaking: 'critical',
|
|
53
53
|
};
|
|
54
54
|
/**
|
|
55
|
-
* Map
|
|
56
|
-
* Used when filtering or displaying
|
|
55
|
+
* Map BaselineAssertionSeverity to ChangeSeverity.
|
|
56
|
+
* Used when filtering or displaying baseline data locally.
|
|
57
57
|
*/
|
|
58
|
-
export const
|
|
58
|
+
export const BASELINE_TO_CHANGE_SEVERITY = {
|
|
59
59
|
info: 'info',
|
|
60
60
|
low: 'info',
|
|
61
61
|
medium: 'warning',
|
|
@@ -69,7 +69,7 @@ function hashString(input) {
|
|
|
69
69
|
return createHash('sha256').update(input).digest('hex').slice(0, 16);
|
|
70
70
|
}
|
|
71
71
|
/**
|
|
72
|
-
* Convert a local BehavioralAssertion to
|
|
72
|
+
* Convert a local BehavioralAssertion to baseline assertion format.
|
|
73
73
|
*
|
|
74
74
|
* Mapping:
|
|
75
75
|
* - isPositive=true + security aspect → 'requires' (critical security requirement)
|
|
@@ -123,7 +123,7 @@ function convertAssertion(assertion) {
|
|
|
123
123
|
};
|
|
124
124
|
}
|
|
125
125
|
/**
|
|
126
|
-
* Convert an array of BehavioralAssertions to
|
|
126
|
+
* Convert an array of BehavioralAssertions to baseline assertions.
|
|
127
127
|
*/
|
|
128
128
|
export function convertAssertions(assertions) {
|
|
129
129
|
return assertions.map(convertAssertion);
|
|
@@ -132,17 +132,15 @@ export function convertAssertions(assertions) {
|
|
|
132
132
|
* Derive baseline mode from result metadata.
|
|
133
133
|
* Returns 'check' for check mode results, 'explore' for explore mode results.
|
|
134
134
|
* Note: Baselines should only be created from check mode results,
|
|
135
|
-
* but explore
|
|
135
|
+
* but explore mode baselines are still supported for documentation tracking.
|
|
136
136
|
*/
|
|
137
|
-
function
|
|
137
|
+
function deriveBaselineMode(resultModel) {
|
|
138
138
|
// Check mode results have model === 'check'
|
|
139
139
|
if (resultModel === 'check')
|
|
140
140
|
return 'check';
|
|
141
141
|
// LLM model names indicate explore mode
|
|
142
142
|
if (resultModel)
|
|
143
143
|
return 'explore';
|
|
144
|
-
if (baselineMode === 'check')
|
|
145
|
-
return 'check';
|
|
146
144
|
// Default to check for legacy baselines without explicit mode
|
|
147
145
|
return 'check';
|
|
148
146
|
}
|
|
@@ -296,9 +294,7 @@ function classifySeverity(note) {
|
|
|
296
294
|
lowerNote.includes('leak')) {
|
|
297
295
|
return 'medium';
|
|
298
296
|
}
|
|
299
|
-
if (lowerNote.includes('low') ||
|
|
300
|
-
lowerNote.includes('minor') ||
|
|
301
|
-
lowerNote.includes('potential')) {
|
|
297
|
+
if (lowerNote.includes('low') || lowerNote.includes('minor') || lowerNote.includes('potential')) {
|
|
302
298
|
return 'low';
|
|
303
299
|
}
|
|
304
300
|
return 'info';
|
|
@@ -308,9 +304,9 @@ function classifySeverity(note) {
|
|
|
308
304
|
*
|
|
309
305
|
* This is the preferred method when you have fresh interview results.
|
|
310
306
|
*/
|
|
311
|
-
export function
|
|
307
|
+
export function createBaselineFromInterview(result, serverCommand) {
|
|
312
308
|
// Derive mode from result metadata
|
|
313
|
-
const mode =
|
|
309
|
+
const mode = deriveBaselineMode(result.metadata.model);
|
|
314
310
|
// Build metadata
|
|
315
311
|
const metadata = {
|
|
316
312
|
mode,
|
|
@@ -338,11 +334,12 @@ export function createCloudBaseline(result, serverCommand) {
|
|
|
338
334
|
}
|
|
339
335
|
}
|
|
340
336
|
const tools = result.toolProfiles.map((profile) => {
|
|
341
|
-
const interactions = profile.interactions.map(i => ({ args: i.question.args }));
|
|
342
|
-
const
|
|
337
|
+
const interactions = profile.interactions.map((i) => ({ args: i.question.args }));
|
|
338
|
+
const observedSchema = computeConsensusSchemaHash(interactions);
|
|
339
|
+
const declaredSchemaHash = computeSchemaHash(schemaMap.get(profile.name) ?? {});
|
|
343
340
|
const responseData = profile.interactions
|
|
344
|
-
.filter(i => !i.mocked)
|
|
345
|
-
.map(i => ({
|
|
341
|
+
.filter((i) => !i.mocked)
|
|
342
|
+
.map((i) => ({
|
|
346
343
|
response: i.response,
|
|
347
344
|
error: i.error,
|
|
348
345
|
}));
|
|
@@ -351,8 +348,8 @@ export function createCloudBaseline(result, serverCommand) {
|
|
|
351
348
|
? buildSchemaEvolution(responseAnalysis.schemas)
|
|
352
349
|
: undefined;
|
|
353
350
|
const latencySamples = profile.interactions
|
|
354
|
-
.filter(i => i.toolExecutionMs !== undefined && !i.mocked)
|
|
355
|
-
.map(i => ({
|
|
351
|
+
.filter((i) => i.toolExecutionMs !== undefined && !i.mocked)
|
|
352
|
+
.map((i) => ({
|
|
356
353
|
toolName: profile.name,
|
|
357
354
|
durationMs: i.toolExecutionMs ?? 0,
|
|
358
355
|
success: !i.error && !i.response?.isError,
|
|
@@ -378,16 +375,23 @@ export function createCloudBaseline(result, serverCommand) {
|
|
|
378
375
|
name: profile.name,
|
|
379
376
|
description: profile.description ?? '',
|
|
380
377
|
inputSchema: schemaMap.get(profile.name) ?? {},
|
|
381
|
-
schemaHash,
|
|
378
|
+
schemaHash: declaredSchemaHash,
|
|
379
|
+
observedArgsSchemaHash: observedSchema.hash,
|
|
380
|
+
observedArgsSchemaConsistency: observedSchema.consistency,
|
|
381
|
+
observedArgsSchemaVariations: observedSchema.variations,
|
|
382
382
|
responseFingerprint: responseAnalysis.fingerprint,
|
|
383
383
|
inferredOutputSchema: responseAnalysis.inferredSchema,
|
|
384
384
|
responseSchemaEvolution,
|
|
385
|
-
errorPatterns: responseAnalysis.errorPatterns.length
|
|
385
|
+
errorPatterns: responseAnalysis.errorPatterns.length
|
|
386
|
+
? responseAnalysis.errorPatterns
|
|
387
|
+
: undefined,
|
|
386
388
|
baselineP50Ms,
|
|
387
389
|
baselineP95Ms,
|
|
388
390
|
baselineP99Ms,
|
|
389
391
|
baselineSuccessRate,
|
|
390
392
|
performanceConfidence,
|
|
393
|
+
lastTestedAt: metadata.generatedAt,
|
|
394
|
+
inputSchemaHashAtTest: declaredSchemaHash,
|
|
391
395
|
};
|
|
392
396
|
});
|
|
393
397
|
const prompts = result.discovery.prompts.length > 0
|
|
@@ -401,6 +405,14 @@ export function createCloudBaseline(result, serverCommand) {
|
|
|
401
405
|
})),
|
|
402
406
|
}))
|
|
403
407
|
: undefined;
|
|
408
|
+
const resources = result.discovery.resources && result.discovery.resources.length > 0
|
|
409
|
+
? result.discovery.resources.map((r) => ({
|
|
410
|
+
uri: r.uri,
|
|
411
|
+
name: r.name,
|
|
412
|
+
description: r.description,
|
|
413
|
+
mimeType: r.mimeType,
|
|
414
|
+
}))
|
|
415
|
+
: undefined;
|
|
404
416
|
// Build interviews
|
|
405
417
|
const interviews = buildInterviews(result, mode);
|
|
406
418
|
// Build tool profiles (with converted assertions)
|
|
@@ -425,13 +437,13 @@ export function createCloudBaseline(result, serverCommand) {
|
|
|
425
437
|
summary: wr.summary,
|
|
426
438
|
}));
|
|
427
439
|
const documentationScore = toDocumentationScoreSummary(scoreDocumentation(result.discovery.tools));
|
|
428
|
-
// Build assertions (convert to
|
|
440
|
+
// Build assertions (convert to baseline format)
|
|
429
441
|
const assertions = convertAssertions(extractAllAssertions(result));
|
|
430
442
|
const baselineWithoutHash = {
|
|
431
443
|
version: getBaselineVersion(),
|
|
432
444
|
metadata,
|
|
433
445
|
server,
|
|
434
|
-
capabilities: { tools, prompts },
|
|
446
|
+
capabilities: { tools, prompts, resources },
|
|
435
447
|
interviews,
|
|
436
448
|
toolProfiles,
|
|
437
449
|
workflows,
|
package/dist/baseline/diff.d.ts
CHANGED
|
@@ -38,7 +38,7 @@ export declare function formatDiffJUnit(diff: BehavioralDiff, suiteName?: string
|
|
|
38
38
|
* Format diff as SARIF (Static Analysis Results Interchange Format) for GitHub Code Scanning.
|
|
39
39
|
*
|
|
40
40
|
* SARIF is the standard format for GitHub's code scanning feature and can be
|
|
41
|
-
*
|
|
41
|
+
* used to show drift detection results in pull request reviews.
|
|
42
42
|
*
|
|
43
43
|
* @see https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html
|
|
44
44
|
*
|