@dotsetlabs/bellwether 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +74 -0
  2. package/README.md +8 -2
  3. package/dist/baseline/accessors.d.ts +1 -1
  4. package/dist/baseline/accessors.js +1 -3
  5. package/dist/baseline/baseline-format.d.ts +287 -0
  6. package/dist/baseline/baseline-format.js +12 -0
  7. package/dist/baseline/comparator.js +249 -11
  8. package/dist/baseline/converter.d.ts +15 -15
  9. package/dist/baseline/converter.js +46 -34
  10. package/dist/baseline/diff.d.ts +1 -1
  11. package/dist/baseline/diff.js +45 -28
  12. package/dist/baseline/error-analyzer.d.ts +1 -1
  13. package/dist/baseline/error-analyzer.js +90 -17
  14. package/dist/baseline/incremental-checker.js +8 -5
  15. package/dist/baseline/index.d.ts +2 -12
  16. package/dist/baseline/index.js +3 -23
  17. package/dist/baseline/performance-tracker.d.ts +0 -1
  18. package/dist/baseline/performance-tracker.js +13 -20
  19. package/dist/baseline/response-fingerprint.js +39 -2
  20. package/dist/baseline/saver.js +41 -10
  21. package/dist/baseline/schema-compare.d.ts +22 -0
  22. package/dist/baseline/schema-compare.js +259 -16
  23. package/dist/baseline/types.d.ts +10 -7
  24. package/dist/cache/response-cache.d.ts +8 -0
  25. package/dist/cache/response-cache.js +110 -0
  26. package/dist/cli/commands/check.js +23 -6
  27. package/dist/cli/commands/explore.js +34 -14
  28. package/dist/cli/index.js +8 -0
  29. package/dist/config/template.js +8 -7
  30. package/dist/config/validator.d.ts +59 -59
  31. package/dist/config/validator.js +245 -90
  32. package/dist/constants/core.d.ts +4 -0
  33. package/dist/constants/core.js +8 -19
  34. package/dist/constants/registry.d.ts +17 -0
  35. package/dist/constants/registry.js +18 -0
  36. package/dist/constants/testing.d.ts +0 -369
  37. package/dist/constants/testing.js +18 -456
  38. package/dist/constants.d.ts +1 -1
  39. package/dist/constants.js +1 -1
  40. package/dist/docs/contract.js +131 -83
  41. package/dist/docs/report.js +8 -5
  42. package/dist/interview/insights.d.ts +17 -0
  43. package/dist/interview/insights.js +52 -0
  44. package/dist/interview/interviewer.js +52 -10
  45. package/dist/interview/prompt-test-generator.d.ts +12 -0
  46. package/dist/interview/prompt-test-generator.js +77 -0
  47. package/dist/interview/resource-test-generator.d.ts +12 -0
  48. package/dist/interview/resource-test-generator.js +20 -0
  49. package/dist/interview/schema-inferrer.js +26 -4
  50. package/dist/interview/schema-test-generator.js +278 -31
  51. package/dist/interview/stateful-test-runner.d.ts +3 -0
  52. package/dist/interview/stateful-test-runner.js +80 -0
  53. package/dist/interview/types.d.ts +12 -0
  54. package/dist/transport/mcp-client.js +1 -1
  55. package/dist/transport/sse-transport.d.ts +7 -3
  56. package/dist/transport/sse-transport.js +157 -67
  57. package/dist/version.js +1 -1
  58. package/man/bellwether.1 +1 -1
  59. package/man/bellwether.1.md +2 -2
  60. package/package.json +1 -1
  61. package/schemas/bellwether-check.schema.json +185 -0
  62. package/schemas/bellwether-explore.schema.json +837 -0
  63. package/scripts/completions/bellwether.bash +10 -4
  64. package/scripts/completions/bellwether.zsh +55 -2
@@ -11,12 +11,12 @@
11
11
  */
12
12
  import { createBaseline } from './saver.js';
13
13
  import { getToolFingerprints } from './accessors.js';
14
- import { compareFingerprints, compareErrorPatterns, } from './response-fingerprint.js';
14
+ import { compareFingerprints, compareErrorPatterns } from './response-fingerprint.js';
15
15
  import { analyzeErrorTrends } from './error-analyzer.js';
16
16
  import { compareSecurityFingerprints } from '../security/security-tester.js';
17
17
  import { compareSchemaEvolution } from './response-schema-tracker.js';
18
18
  import { checkVersionCompatibility, BaselineVersionError, parseVersion, areVersionsCompatible, getCompatibilityWarning, } from './version.js';
19
- import { compareSchemas } from './schema-compare.js';
19
+ import { compareSchemas, computeSchemaHash } from './schema-compare.js';
20
20
  import { PERFORMANCE_TRACKING } from '../constants.js';
21
21
  import { hasReliableConfidence } from './performance-tracker.js';
22
22
  import { compareDocumentationScores, scoreDocumentation } from './documentation-scorer.js';
@@ -89,6 +89,11 @@ export function compareBaselines(previous, current, options = {}) {
89
89
  behaviorChanges.push(...toolDiff.changes);
90
90
  }
91
91
  }
92
+ // Compare prompts and resources
93
+ behaviorChanges.push(...comparePrompts(previous.capabilities.prompts, current.capabilities.prompts));
94
+ behaviorChanges.push(...compareResources(previous.capabilities.resources, current.capabilities.resources));
95
+ // Compare server metadata and capabilities
96
+ behaviorChanges.push(...compareServerInfo(previous.server, current.server));
92
97
  // Compare workflows
93
98
  const workflowChanges = compareWorkflows(previous.workflows || [], current.workflows || []);
94
99
  behaviorChanges.push(...workflowChanges);
@@ -131,8 +136,10 @@ function compareTool(previous, current, options) {
131
136
  let responseSchemaEvolutionChanged = false;
132
137
  let securityChanged = false;
133
138
  let schemaEvolutionDiff;
134
- // Compare input schema with detailed diff
135
- if (previous.schemaHash !== current.schemaHash && !options.ignoreSchemaChanges) {
139
+ // Compare input schema with detailed diff (declared schema hash)
140
+ const previousDeclaredHash = getDeclaredSchemaHash(previous);
141
+ const currentDeclaredHash = getDeclaredSchemaHash(current);
142
+ if (previousDeclaredHash !== currentDeclaredHash && !options.ignoreSchemaChanges) {
136
143
  schemaChanged = true;
137
144
  // Get detailed schema comparison if inputSchema is available on both
138
145
  const schemaComparison = compareSchemas(previous.inputSchema, current.inputSchema);
@@ -155,8 +162,8 @@ function compareTool(previous, current, options) {
155
162
  changes.push({
156
163
  tool: current.name,
157
164
  aspect: 'schema',
158
- before: `Schema hash: ${previous.schemaHash}`,
159
- after: `Schema hash: ${current.schemaHash}`,
165
+ before: `Schema hash: ${previousDeclaredHash}`,
166
+ after: `Schema hash: ${currentDeclaredHash}`,
160
167
  severity: 'breaking',
161
168
  description: `Schema for ${current.name} has changed`,
162
169
  });
@@ -314,6 +321,235 @@ function compareTool(previous, current, options) {
314
321
  schemaEvolutionDiff,
315
322
  };
316
323
  }
324
+ function comparePrompts(previous, current) {
325
+ const changes = [];
326
+ const prevMap = new Map((previous ?? []).map((p) => [p.name, p]));
327
+ const currMap = new Map((current ?? []).map((p) => [p.name, p]));
328
+ for (const [name, currPrompt] of currMap) {
329
+ const prevPrompt = prevMap.get(name);
330
+ if (!prevPrompt) {
331
+ changes.push({
332
+ tool: `prompt:${name}`,
333
+ aspect: 'prompt',
334
+ before: 'absent',
335
+ after: 'present',
336
+ severity: 'info',
337
+ description: `Prompt "${name}" added`,
338
+ });
339
+ continue;
340
+ }
341
+ if (prevPrompt.description !== currPrompt.description) {
342
+ changes.push({
343
+ tool: `prompt:${name}`,
344
+ aspect: 'prompt',
345
+ before: prevPrompt.description ?? 'none',
346
+ after: currPrompt.description ?? 'none',
347
+ severity: 'info',
348
+ description: `Prompt "${name}" description changed`,
349
+ });
350
+ }
351
+ const prevArgs = prevPrompt.arguments ?? [];
352
+ const currArgs = currPrompt.arguments ?? [];
353
+ const prevArgMap = new Map(prevArgs.map((a) => [a.name, a]));
354
+ const currArgMap = new Map(currArgs.map((a) => [a.name, a]));
355
+ for (const [argName, currArg] of currArgMap) {
356
+ const prevArg = prevArgMap.get(argName);
357
+ if (!prevArg) {
358
+ changes.push({
359
+ tool: `prompt:${name}`,
360
+ aspect: 'prompt',
361
+ before: 'absent',
362
+ after: 'present',
363
+ severity: currArg.required ? 'breaking' : 'info',
364
+ description: `Prompt "${name}" argument "${argName}" added`,
365
+ });
366
+ continue;
367
+ }
368
+ if (prevArg.required !== currArg.required) {
369
+ changes.push({
370
+ tool: `prompt:${name}`,
371
+ aspect: 'prompt',
372
+ before: String(prevArg.required ?? false),
373
+ after: String(currArg.required ?? false),
374
+ severity: currArg.required ? 'breaking' : 'warning',
375
+ description: `Prompt "${name}" argument "${argName}" requirement changed`,
376
+ });
377
+ }
378
+ if (prevArg.description !== currArg.description) {
379
+ changes.push({
380
+ tool: `prompt:${name}`,
381
+ aspect: 'prompt',
382
+ before: prevArg.description ?? 'none',
383
+ after: currArg.description ?? 'none',
384
+ severity: 'info',
385
+ description: `Prompt "${name}" argument "${argName}" description changed`,
386
+ });
387
+ }
388
+ }
389
+ for (const [argName] of prevArgMap) {
390
+ if (!currArgMap.has(argName)) {
391
+ changes.push({
392
+ tool: `prompt:${name}`,
393
+ aspect: 'prompt',
394
+ before: 'present',
395
+ after: 'absent',
396
+ severity: 'breaking',
397
+ description: `Prompt "${name}" argument "${argName}" removed`,
398
+ });
399
+ }
400
+ }
401
+ }
402
+ for (const [name] of prevMap) {
403
+ if (!currMap.has(name)) {
404
+ changes.push({
405
+ tool: `prompt:${name}`,
406
+ aspect: 'prompt',
407
+ before: 'present',
408
+ after: 'absent',
409
+ severity: 'breaking',
410
+ description: `Prompt "${name}" removed`,
411
+ });
412
+ }
413
+ }
414
+ return changes;
415
+ }
416
+ function compareResources(previous, current) {
417
+ const changes = [];
418
+ const prevMap = new Map((previous ?? []).map((r) => [r.uri, r]));
419
+ const currMap = new Map((current ?? []).map((r) => [r.uri, r]));
420
+ for (const [uri, currResource] of currMap) {
421
+ const prevResource = prevMap.get(uri);
422
+ if (!prevResource) {
423
+ changes.push({
424
+ tool: `resource:${currResource.name ?? uri}`,
425
+ aspect: 'resource',
426
+ before: 'absent',
427
+ after: 'present',
428
+ severity: 'info',
429
+ description: `Resource "${uri}" added`,
430
+ });
431
+ continue;
432
+ }
433
+ if (prevResource.name !== currResource.name) {
434
+ changes.push({
435
+ tool: `resource:${currResource.name ?? uri}`,
436
+ aspect: 'resource',
437
+ before: prevResource.name ?? 'none',
438
+ after: currResource.name ?? 'none',
439
+ severity: 'info',
440
+ description: `Resource "${uri}" name changed`,
441
+ });
442
+ }
443
+ if (prevResource.description !== currResource.description) {
444
+ changes.push({
445
+ tool: `resource:${currResource.name ?? uri}`,
446
+ aspect: 'resource',
447
+ before: prevResource.description ?? 'none',
448
+ after: currResource.description ?? 'none',
449
+ severity: 'info',
450
+ description: `Resource "${uri}" description changed`,
451
+ });
452
+ }
453
+ if (prevResource.mimeType !== currResource.mimeType) {
454
+ changes.push({
455
+ tool: `resource:${currResource.name ?? uri}`,
456
+ aspect: 'resource',
457
+ before: prevResource.mimeType ?? 'none',
458
+ after: currResource.mimeType ?? 'none',
459
+ severity: 'warning',
460
+ description: `Resource "${uri}" mime type changed`,
461
+ });
462
+ }
463
+ }
464
+ for (const [uri, prevResource] of prevMap) {
465
+ if (!currMap.has(uri)) {
466
+ changes.push({
467
+ tool: `resource:${prevResource.name ?? uri}`,
468
+ aspect: 'resource',
469
+ before: 'present',
470
+ after: 'absent',
471
+ severity: 'breaking',
472
+ description: `Resource "${uri}" removed`,
473
+ });
474
+ }
475
+ }
476
+ return changes;
477
+ }
478
+ function compareServerInfo(previous, current) {
479
+ const changes = [];
480
+ if (previous.name !== current.name) {
481
+ changes.push({
482
+ tool: 'server',
483
+ aspect: 'server',
484
+ before: previous.name,
485
+ after: current.name,
486
+ severity: 'info',
487
+ description: 'Server name changed',
488
+ });
489
+ }
490
+ if (previous.version !== current.version) {
491
+ changes.push({
492
+ tool: 'server',
493
+ aspect: 'server',
494
+ before: previous.version,
495
+ after: current.version,
496
+ severity: 'info',
497
+ description: 'Server version changed',
498
+ });
499
+ }
500
+ if (previous.protocolVersion !== current.protocolVersion) {
501
+ const breaking = isMajorVersionChange(previous.protocolVersion, current.protocolVersion);
502
+ changes.push({
503
+ tool: 'server',
504
+ aspect: 'server',
505
+ before: previous.protocolVersion,
506
+ after: current.protocolVersion,
507
+ severity: breaking ? 'breaking' : 'warning',
508
+ description: 'Protocol version changed',
509
+ });
510
+ }
511
+ const prevCaps = new Set(previous.capabilities);
512
+ const currCaps = new Set(current.capabilities);
513
+ for (const cap of prevCaps) {
514
+ if (!currCaps.has(cap)) {
515
+ changes.push({
516
+ tool: 'server',
517
+ aspect: 'capability',
518
+ before: cap,
519
+ after: 'removed',
520
+ severity: 'breaking',
521
+ description: `Capability "${cap}" removed`,
522
+ });
523
+ }
524
+ }
525
+ for (const cap of currCaps) {
526
+ if (!prevCaps.has(cap)) {
527
+ changes.push({
528
+ tool: 'server',
529
+ aspect: 'capability',
530
+ before: 'absent',
531
+ after: cap,
532
+ severity: 'info',
533
+ description: `Capability "${cap}" added`,
534
+ });
535
+ }
536
+ }
537
+ return changes;
538
+ }
539
+ function isMajorVersionChange(previous, current) {
540
+ const prevMajor = parseInt(previous.split('.')[0] ?? '0', 10);
541
+ const currMajor = parseInt(current.split('.')[0] ?? '0', 10);
542
+ if (Number.isNaN(prevMajor) || Number.isNaN(currMajor)) {
543
+ return previous !== current;
544
+ }
545
+ return prevMajor !== currMajor;
546
+ }
547
+ function getDeclaredSchemaHash(tool) {
548
+ if (tool.inputSchema && Object.keys(tool.inputSchema).length > 0) {
549
+ return computeSchemaHash(tool.inputSchema);
550
+ }
551
+ return tool.schemaHash;
552
+ }
317
553
  /**
318
554
  * Format a schema change value for display in BehaviorChange.
319
555
  * Converts unknown values to human-readable strings.
@@ -331,8 +567,11 @@ function formatSchemaChangeValue(value) {
331
567
  if (value.length === 0)
332
568
  return '[]';
333
569
  if (value.length <= 3)
334
- return `[${value.map(v => formatSchemaChangeValue(v)).join(', ')}]`;
335
- return `[${value.slice(0, 3).map(v => formatSchemaChangeValue(v)).join(', ')}, ...]`;
570
+ return `[${value.map((v) => formatSchemaChangeValue(v)).join(', ')}]`;
571
+ return `[${value
572
+ .slice(0, 3)
573
+ .map((v) => formatSchemaChangeValue(v))
574
+ .join(', ')}, ...]`;
336
575
  }
337
576
  // For objects, show a compact representation
338
577
  try {
@@ -474,7 +713,7 @@ export function applyAspectOverride(change, aspectOverrides) {
474
713
  * Returns a new diff with filtered/modified changes based on config.
475
714
  */
476
715
  export function applySeverityConfig(diff, config) {
477
- const { minimumSeverity = 'none', suppressWarnings = false, aspectOverrides, } = config;
716
+ const { minimumSeverity = 'none', suppressWarnings = false, aspectOverrides } = config;
478
717
  // Apply aspect overrides and filter by minimum severity
479
718
  const filteredChanges = diff.behaviorChanges
480
719
  .map((change) => {
@@ -495,8 +734,7 @@ export function applySeverityConfig(diff, config) {
495
734
  // Filter toolsModified to only include those with remaining changes
496
735
  const toolsWithChanges = new Set(filteredChanges.map((c) => c.tool));
497
736
  const filteredToolsModified = diff.toolsModified.filter((td) => toolsWithChanges.has(td.tool) ||
498
- (td.schemaChanged &&
499
- (!aspectOverrides?.schema || aspectOverrides.schema !== 'none')) ||
737
+ (td.schemaChanged && (!aspectOverrides?.schema || aspectOverrides.schema !== 'none')) ||
500
738
  (td.descriptionChanged &&
501
739
  (!aspectOverrides?.description || aspectOverrides.description !== 'none')));
502
740
  // Recalculate counts
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Cloud baseline builder.
2
+ * Baseline builder.
3
3
  *
4
4
  * ## Severity Type Mappings
5
5
  *
@@ -13,19 +13,19 @@
13
13
  * Used for error severity classification in error handling.
14
14
  * Values: 'low' | 'medium' | 'high' | 'critical'
15
15
  *
16
- * ### CloudAssertionSeverity (cloud/types.ts)
17
- * Used for cloud assertions and PersonaFinding severity levels.
16
+ * ### BaselineAssertionSeverity (baseline-format.ts)
17
+ * Used for baseline assertions and PersonaFinding severity levels.
18
18
  * Values: 'info' | 'low' | 'medium' | 'high' | 'critical'
19
19
  *
20
20
  * ### Conversion Mappings
21
21
  *
22
- * ChangeSeverity → CloudAssertionSeverity:
22
+ * ChangeSeverity → BaselineAssertionSeverity:
23
23
  * - 'none' → 'info' (no change, informational)
24
24
  * - 'info' → 'low' (minor changes)
25
25
  * - 'warning' → 'medium' (moderate changes)
26
26
  * - 'breaking' → 'critical' (breaking changes)
27
27
  *
28
- * CloudAssertionSeverity → ChangeSeverity (for display/filtering):
28
+ * BaselineAssertionSeverity → ChangeSeverity (for display/filtering):
29
29
  * - 'info' → 'info'
30
30
  * - 'low' → 'info'
31
31
  * - 'medium' → 'warning'
@@ -34,25 +34,25 @@
34
34
  */
35
35
  import type { BehavioralAssertion, BehavioralBaseline, ChangeSeverity } from './types.js';
36
36
  import type { InterviewResult } from '../interview/types.js';
37
- import type { CloudAssertion, CloudAssertionSeverity } from './cloud-types.js';
37
+ import type { BaselineAssertion, BaselineAssertionSeverity } from './baseline-format.js';
38
38
  /**
39
- * Map ChangeSeverity to CloudAssertionSeverity.
40
- * Used when mapping CLI assertions to cloud severity levels.
39
+ * Map ChangeSeverity to BaselineAssertionSeverity.
40
+ * Used when mapping CLI assertions to baseline severity levels.
41
41
  */
42
- export declare const CHANGE_TO_CLOUD_SEVERITY: Record<ChangeSeverity, CloudAssertionSeverity>;
42
+ export declare const CHANGE_TO_BASELINE_SEVERITY: Record<ChangeSeverity, BaselineAssertionSeverity>;
43
43
  /**
44
- * Map CloudAssertionSeverity to ChangeSeverity.
45
- * Used when filtering or displaying cloud data locally.
44
+ * Map BaselineAssertionSeverity to ChangeSeverity.
45
+ * Used when filtering or displaying baseline data locally.
46
46
  */
47
- export declare const CLOUD_TO_CHANGE_SEVERITY: Record<CloudAssertionSeverity, ChangeSeverity>;
47
+ export declare const BASELINE_TO_CHANGE_SEVERITY: Record<BaselineAssertionSeverity, ChangeSeverity>;
48
48
  /**
49
- * Convert an array of BehavioralAssertions to CloudAssertions.
49
+ * Convert an array of BehavioralAssertions to baseline assertions.
50
50
  */
51
- export declare function convertAssertions(assertions: BehavioralAssertion[]): CloudAssertion[];
51
+ export declare function convertAssertions(assertions: BehavioralAssertion[]): BaselineAssertion[];
52
52
  /**
53
53
  * Create a BellwetherBaseline directly from InterviewResult.
54
54
  *
55
55
  * This is the preferred method when you have fresh interview results.
56
56
  */
57
- export declare function createCloudBaseline(result: InterviewResult, serverCommand: string): BehavioralBaseline;
57
+ export declare function createBaselineFromInterview(result: InterviewResult, serverCommand: string): BehavioralBaseline;
58
58
  //# sourceMappingURL=converter.d.ts.map
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Cloud baseline builder.
2
+ * Baseline builder.
3
3
  *
4
4
  * ## Severity Type Mappings
5
5
  *
@@ -13,19 +13,19 @@
13
13
  * Used for error severity classification in error handling.
14
14
  * Values: 'low' | 'medium' | 'high' | 'critical'
15
15
  *
16
- * ### CloudAssertionSeverity (cloud/types.ts)
17
- * Used for cloud assertions and PersonaFinding severity levels.
16
+ * ### BaselineAssertionSeverity (baseline-format.ts)
17
+ * Used for baseline assertions and PersonaFinding severity levels.
18
18
  * Values: 'info' | 'low' | 'medium' | 'high' | 'critical'
19
19
  *
20
20
  * ### Conversion Mappings
21
21
  *
22
- * ChangeSeverity → CloudAssertionSeverity:
22
+ * ChangeSeverity → BaselineAssertionSeverity:
23
23
  * - 'none' → 'info' (no change, informational)
24
24
  * - 'info' → 'low' (minor changes)
25
25
  * - 'warning' → 'medium' (moderate changes)
26
26
  * - 'breaking' → 'critical' (breaking changes)
27
27
  *
28
- * CloudAssertionSeverity → ChangeSeverity (for display/filtering):
28
+ * BaselineAssertionSeverity → ChangeSeverity (for display/filtering):
29
29
  * - 'info' → 'info'
30
30
  * - 'low' → 'info'
31
31
  * - 'medium' → 'warning'
@@ -35,27 +35,27 @@
35
35
  import { createHash } from 'crypto';
36
36
  import { analyzeResponses } from './response-fingerprint.js';
37
37
  import { buildSchemaEvolution } from './response-schema-tracker.js';
38
- import { calculateMetrics, calculatePerformanceConfidence } from './performance-tracker.js';
39
- import { computeConsensusSchemaHash } from './schema-compare.js';
38
+ import { calculateMetrics, calculatePerformanceConfidence, } from './performance-tracker.js';
39
+ import { computeConsensusSchemaHash, computeSchemaHash } from './schema-compare.js';
40
40
  import { calculateBaselineHash } from './baseline-hash.js';
41
41
  import { getBaselineVersion } from './version.js';
42
42
  import { VERSION } from '../version.js';
43
43
  import { scoreDocumentation, toDocumentationScoreSummary } from './documentation-scorer.js';
44
44
  /**
45
- * Map ChangeSeverity to CloudAssertionSeverity.
46
- * Used when mapping CLI assertions to cloud severity levels.
45
+ * Map ChangeSeverity to BaselineAssertionSeverity.
46
+ * Used when mapping CLI assertions to baseline severity levels.
47
47
  */
48
- export const CHANGE_TO_CLOUD_SEVERITY = {
48
+ export const CHANGE_TO_BASELINE_SEVERITY = {
49
49
  none: 'info',
50
50
  info: 'low',
51
51
  warning: 'medium',
52
52
  breaking: 'critical',
53
53
  };
54
54
  /**
55
- * Map CloudAssertionSeverity to ChangeSeverity.
56
- * Used when filtering or displaying cloud data locally.
55
+ * Map BaselineAssertionSeverity to ChangeSeverity.
56
+ * Used when filtering or displaying baseline data locally.
57
57
  */
58
- export const CLOUD_TO_CHANGE_SEVERITY = {
58
+ export const BASELINE_TO_CHANGE_SEVERITY = {
59
59
  info: 'info',
60
60
  low: 'info',
61
61
  medium: 'warning',
@@ -69,7 +69,7 @@ function hashString(input) {
69
69
  return createHash('sha256').update(input).digest('hex').slice(0, 16);
70
70
  }
71
71
  /**
72
- * Convert a local BehavioralAssertion to cloud CloudAssertion format.
72
+ * Convert a local BehavioralAssertion to baseline assertion format.
73
73
  *
74
74
  * Mapping:
75
75
  * - isPositive=true + security aspect → 'requires' (critical security requirement)
@@ -123,7 +123,7 @@ function convertAssertion(assertion) {
123
123
  };
124
124
  }
125
125
  /**
126
- * Convert an array of BehavioralAssertions to CloudAssertions.
126
+ * Convert an array of BehavioralAssertions to baseline assertions.
127
127
  */
128
128
  export function convertAssertions(assertions) {
129
129
  return assertions.map(convertAssertion);
@@ -132,17 +132,15 @@ export function convertAssertions(assertions) {
132
132
  * Derive baseline mode from result metadata.
133
133
  * Returns 'check' for check mode results, 'explore' for explore mode results.
134
134
  * Note: Baselines should only be created from check mode results,
135
- * but explore uploads are still supported for documentation tracking.
135
+ * but explore mode baselines are still supported for documentation tracking.
136
136
  */
137
- function deriveCloudMode(resultModel, baselineMode) {
137
+ function deriveBaselineMode(resultModel) {
138
138
  // Check mode results have model === 'check'
139
139
  if (resultModel === 'check')
140
140
  return 'check';
141
141
  // LLM model names indicate explore mode
142
142
  if (resultModel)
143
143
  return 'explore';
144
- if (baselineMode === 'check')
145
- return 'check';
146
144
  // Default to check for legacy baselines without explicit mode
147
145
  return 'check';
148
146
  }
@@ -296,9 +294,7 @@ function classifySeverity(note) {
296
294
  lowerNote.includes('leak')) {
297
295
  return 'medium';
298
296
  }
299
- if (lowerNote.includes('low') ||
300
- lowerNote.includes('minor') ||
301
- lowerNote.includes('potential')) {
297
+ if (lowerNote.includes('low') || lowerNote.includes('minor') || lowerNote.includes('potential')) {
302
298
  return 'low';
303
299
  }
304
300
  return 'info';
@@ -308,9 +304,9 @@ function classifySeverity(note) {
308
304
  *
309
305
  * This is the preferred method when you have fresh interview results.
310
306
  */
311
- export function createCloudBaseline(result, serverCommand) {
307
+ export function createBaselineFromInterview(result, serverCommand) {
312
308
  // Derive mode from result metadata
313
- const mode = deriveCloudMode(result.metadata.model);
309
+ const mode = deriveBaselineMode(result.metadata.model);
314
310
  // Build metadata
315
311
  const metadata = {
316
312
  mode,
@@ -338,11 +334,12 @@ export function createCloudBaseline(result, serverCommand) {
338
334
  }
339
335
  }
340
336
  const tools = result.toolProfiles.map((profile) => {
341
- const interactions = profile.interactions.map(i => ({ args: i.question.args }));
342
- const { hash: schemaHash } = computeConsensusSchemaHash(interactions);
337
+ const interactions = profile.interactions.map((i) => ({ args: i.question.args }));
338
+ const observedSchema = computeConsensusSchemaHash(interactions);
339
+ const declaredSchemaHash = computeSchemaHash(schemaMap.get(profile.name) ?? {});
343
340
  const responseData = profile.interactions
344
- .filter(i => !i.mocked)
345
- .map(i => ({
341
+ .filter((i) => !i.mocked)
342
+ .map((i) => ({
346
343
  response: i.response,
347
344
  error: i.error,
348
345
  }));
@@ -351,8 +348,8 @@ export function createCloudBaseline(result, serverCommand) {
351
348
  ? buildSchemaEvolution(responseAnalysis.schemas)
352
349
  : undefined;
353
350
  const latencySamples = profile.interactions
354
- .filter(i => i.toolExecutionMs !== undefined && !i.mocked)
355
- .map(i => ({
351
+ .filter((i) => i.toolExecutionMs !== undefined && !i.mocked)
352
+ .map((i) => ({
356
353
  toolName: profile.name,
357
354
  durationMs: i.toolExecutionMs ?? 0,
358
355
  success: !i.error && !i.response?.isError,
@@ -378,16 +375,23 @@ export function createCloudBaseline(result, serverCommand) {
378
375
  name: profile.name,
379
376
  description: profile.description ?? '',
380
377
  inputSchema: schemaMap.get(profile.name) ?? {},
381
- schemaHash,
378
+ schemaHash: declaredSchemaHash,
379
+ observedArgsSchemaHash: observedSchema.hash,
380
+ observedArgsSchemaConsistency: observedSchema.consistency,
381
+ observedArgsSchemaVariations: observedSchema.variations,
382
382
  responseFingerprint: responseAnalysis.fingerprint,
383
383
  inferredOutputSchema: responseAnalysis.inferredSchema,
384
384
  responseSchemaEvolution,
385
- errorPatterns: responseAnalysis.errorPatterns.length ? responseAnalysis.errorPatterns : undefined,
385
+ errorPatterns: responseAnalysis.errorPatterns.length
386
+ ? responseAnalysis.errorPatterns
387
+ : undefined,
386
388
  baselineP50Ms,
387
389
  baselineP95Ms,
388
390
  baselineP99Ms,
389
391
  baselineSuccessRate,
390
392
  performanceConfidence,
393
+ lastTestedAt: metadata.generatedAt,
394
+ inputSchemaHashAtTest: declaredSchemaHash,
391
395
  };
392
396
  });
393
397
  const prompts = result.discovery.prompts.length > 0
@@ -401,6 +405,14 @@ export function createCloudBaseline(result, serverCommand) {
401
405
  })),
402
406
  }))
403
407
  : undefined;
408
+ const resources = result.discovery.resources && result.discovery.resources.length > 0
409
+ ? result.discovery.resources.map((r) => ({
410
+ uri: r.uri,
411
+ name: r.name,
412
+ description: r.description,
413
+ mimeType: r.mimeType,
414
+ }))
415
+ : undefined;
404
416
  // Build interviews
405
417
  const interviews = buildInterviews(result, mode);
406
418
  // Build tool profiles (with converted assertions)
@@ -425,13 +437,13 @@ export function createCloudBaseline(result, serverCommand) {
425
437
  summary: wr.summary,
426
438
  }));
427
439
  const documentationScore = toDocumentationScoreSummary(scoreDocumentation(result.discovery.tools));
428
- // Build assertions (convert to cloud format)
440
+ // Build assertions (convert to baseline format)
429
441
  const assertions = convertAssertions(extractAllAssertions(result));
430
442
  const baselineWithoutHash = {
431
443
  version: getBaselineVersion(),
432
444
  metadata,
433
445
  server,
434
- capabilities: { tools, prompts },
446
+ capabilities: { tools, prompts, resources },
435
447
  interviews,
436
448
  toolProfiles,
437
449
  workflows,
@@ -38,7 +38,7 @@ export declare function formatDiffJUnit(diff: BehavioralDiff, suiteName?: string
38
38
  * Format diff as SARIF (Static Analysis Results Interchange Format) for GitHub Code Scanning.
39
39
  *
40
40
  * SARIF is the standard format for GitHub's code scanning feature and can be
41
- * uploaded to show drift detection results in pull request reviews.
41
+ * used to show drift detection results in pull request reviews.
42
42
  *
43
43
  * @see https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html
44
44
  *