@dotsetlabs/bellwether 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +74 -0
  2. package/README.md +8 -2
  3. package/dist/baseline/accessors.d.ts +1 -1
  4. package/dist/baseline/accessors.js +1 -3
  5. package/dist/baseline/baseline-format.d.ts +287 -0
  6. package/dist/baseline/baseline-format.js +12 -0
  7. package/dist/baseline/comparator.js +249 -11
  8. package/dist/baseline/converter.d.ts +15 -15
  9. package/dist/baseline/converter.js +46 -34
  10. package/dist/baseline/diff.d.ts +1 -1
  11. package/dist/baseline/diff.js +45 -28
  12. package/dist/baseline/error-analyzer.d.ts +1 -1
  13. package/dist/baseline/error-analyzer.js +90 -17
  14. package/dist/baseline/incremental-checker.js +8 -5
  15. package/dist/baseline/index.d.ts +2 -12
  16. package/dist/baseline/index.js +3 -23
  17. package/dist/baseline/performance-tracker.d.ts +0 -1
  18. package/dist/baseline/performance-tracker.js +13 -20
  19. package/dist/baseline/response-fingerprint.js +39 -2
  20. package/dist/baseline/saver.js +41 -10
  21. package/dist/baseline/schema-compare.d.ts +22 -0
  22. package/dist/baseline/schema-compare.js +259 -16
  23. package/dist/baseline/types.d.ts +10 -7
  24. package/dist/cache/response-cache.d.ts +8 -0
  25. package/dist/cache/response-cache.js +110 -0
  26. package/dist/cli/commands/check.js +23 -6
  27. package/dist/cli/commands/explore.js +34 -14
  28. package/dist/cli/index.js +8 -0
  29. package/dist/config/template.js +8 -7
  30. package/dist/config/validator.d.ts +59 -59
  31. package/dist/config/validator.js +245 -90
  32. package/dist/constants/core.d.ts +4 -0
  33. package/dist/constants/core.js +8 -19
  34. package/dist/constants/registry.d.ts +17 -0
  35. package/dist/constants/registry.js +18 -0
  36. package/dist/constants/testing.d.ts +0 -369
  37. package/dist/constants/testing.js +18 -456
  38. package/dist/constants.d.ts +1 -1
  39. package/dist/constants.js +1 -1
  40. package/dist/docs/contract.js +131 -83
  41. package/dist/docs/report.js +8 -5
  42. package/dist/interview/insights.d.ts +17 -0
  43. package/dist/interview/insights.js +52 -0
  44. package/dist/interview/interviewer.js +52 -10
  45. package/dist/interview/prompt-test-generator.d.ts +12 -0
  46. package/dist/interview/prompt-test-generator.js +77 -0
  47. package/dist/interview/resource-test-generator.d.ts +12 -0
  48. package/dist/interview/resource-test-generator.js +20 -0
  49. package/dist/interview/schema-inferrer.js +26 -4
  50. package/dist/interview/schema-test-generator.js +278 -31
  51. package/dist/interview/stateful-test-runner.d.ts +3 -0
  52. package/dist/interview/stateful-test-runner.js +80 -0
  53. package/dist/interview/types.d.ts +12 -0
  54. package/dist/transport/mcp-client.js +1 -1
  55. package/dist/transport/sse-transport.d.ts +7 -3
  56. package/dist/transport/sse-transport.js +157 -67
  57. package/dist/version.js +1 -1
  58. package/man/bellwether.1 +1 -1
  59. package/man/bellwether.1.md +2 -2
  60. package/package.json +1 -1
  61. package/schemas/bellwether-check.schema.json +185 -0
  62. package/schemas/bellwether-explore.schema.json +837 -0
  63. package/scripts/completions/bellwether.bash +10 -4
  64. package/scripts/completions/bellwether.zsh +55 -2
@@ -0,0 +1,52 @@
1
+ import { generateSemanticTests } from '../validation/semantic-test-generator.js';
2
+ import { SEMANTIC_VALIDATION } from '../constants.js';
3
+ import { analyzeResponses } from '../baseline/response-fingerprint.js';
4
+ import { buildSchemaEvolution } from '../baseline/response-schema-tracker.js';
5
+ import { generateErrorSummary } from '../baseline/error-analyzer.js';
6
+ import { scoreDocumentation } from '../baseline/documentation-scorer.js';
7
/**
 * Derive secondary insights from a completed interview result.
 *
 * Produces four pieces of data: per-tool semantic inferences, per-tool
 * response schema evolution, per-tool error summaries, and a documentation
 * score for the discovered tools. The three per-tool maps are reported as
 * `undefined` when they end up empty.
 *
 * @param result - Interview result; `discovery.tools` and `toolProfiles` are read.
 * @returns Object with `semanticInferences`, `schemaEvolution`,
 *   `errorAnalysisSummaries` and `documentationScore`.
 */
export function buildInterviewInsights(result) {
    // Collapse an empty record to undefined so it is omitted from reports.
    const orUndefined = (record) => (Object.keys(record).length > 0 ? record : undefined);
    const { tools } = result.discovery;
    const inferencesByTool = {};
    for (const tool of tools) {
        const { inferences } = generateSemanticTests(tool, {
            minConfidence: SEMANTIC_VALIDATION.MIN_CONFIDENCE_THRESHOLD,
            maxInvalidValuesPerParam: SEMANTIC_VALIDATION.MAX_INVALID_VALUES_PER_PARAM,
            skipSemanticTests: false,
        });
        if (inferences.length > 0) {
            inferencesByTool[tool.name] = inferences;
        }
    }
    const evolutionByTool = {};
    const errorSummaryByTool = {};
    for (const profile of result.toolProfiles) {
        // Mocked interactions are excluded from the response analysis.
        const observed = profile.interactions.flatMap((interaction) => interaction.mocked ? [] : [{ response: interaction.response, error: interaction.error }]);
        const analysis = analyzeResponses(observed);
        if (analysis.schemas.length > 0) {
            evolutionByTool[profile.name] = buildSchemaEvolution(analysis.schemas);
        }
        if (analysis.errorPatterns.length === 0) {
            continue;
        }
        const summary = generateErrorSummary(profile.name, analysis.errorPatterns);
        // A Map of category counts is flattened to a plain object.
        let categoryCounts = summary.categoryCounts;
        if (categoryCounts instanceof Map) {
            categoryCounts = Object.fromEntries(categoryCounts.entries());
        }
        errorSummaryByTool[profile.name] = { ...summary, categoryCounts };
    }
    const documentationScore = scoreDocumentation(tools);
    return {
        semanticInferences: orUndefined(inferencesByTool),
        schemaEvolution: orUndefined(evolutionByTool),
        errorAnalysisSummaries: orUndefined(errorSummaryByTool),
        documentationScore,
    };
}
52
+ //# sourceMappingURL=insights.js.map
@@ -6,6 +6,8 @@ import { evaluateAssertions } from '../scenarios/evaluator.js';
6
6
  import { withTimeout, DEFAULT_TIMEOUTS, parallelLimit, createMutex } from '../utils/index.js';
7
7
  import { INTERVIEW, WORKFLOW, DISPLAY_LIMITS, SCHEMA_TESTING, OUTCOME_ASSESSMENT, } from '../constants.js';
8
8
  import { generateSchemaTests } from './schema-test-generator.js';
9
+ import { generatePromptTests } from './prompt-test-generator.js';
10
+ import { generateResourceTests } from './resource-test-generator.js';
9
11
  import { WorkflowDiscoverer } from '../workflow/discovery.js';
10
12
  import { WorkflowExecutor } from '../workflow/executor.js';
11
13
  import { RateLimiter, calculateBackoffMs, isRateLimitError } from './rate-limiter.js';
@@ -682,6 +684,9 @@ export class Interviewer {
682
684
  const promptInteractions = [];
683
685
  // Check for custom scenarios for this prompt
684
686
  const customScenarios = this.getScenariosForPrompt(prompt.name);
687
+ const deterministicQuestions = this.config.customScenariosOnly
688
+ ? []
689
+ : generatePromptTests(prompt, { maxTests: this.config.checkMode ? 3 : 2 });
685
690
  // Build questions list - custom scenarios + LLM-generated (unless customScenariosOnly)
686
691
  let questions = [];
687
692
  if (customScenarios.length > 0) {
@@ -693,21 +698,30 @@ export class Interviewer {
693
698
  description: s.description,
694
699
  args: s.args,
695
700
  }));
701
+ // Add deterministic prompt tests
702
+ if (deterministicQuestions.length > 0) {
703
+ questions = mergePromptQuestions(questions, deterministicQuestions);
704
+ }
696
705
  // If not custom-only mode and not fast CI mode, also generate LLM questions
697
706
  if (!this.config.customScenariosOnly && !this.config.checkMode && primaryOrchestrator) {
698
707
  const llmQuestions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
699
- questions = [...questions, ...llmQuestions];
708
+ questions = mergePromptQuestions(questions, llmQuestions);
700
709
  }
701
710
  }
702
711
  else if (!this.config.customScenariosOnly &&
703
712
  !this.config.checkMode &&
704
713
  primaryOrchestrator) {
705
- // No custom scenarios - generate LLM questions as usual
706
- questions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
714
+ // No custom scenarios - deterministic tests + LLM questions
715
+ questions = mergePromptQuestions(questions, deterministicQuestions);
716
+ const llmQuestions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
717
+ questions = mergePromptQuestions(questions, llmQuestions);
707
718
  }
708
719
  else if (this.config.checkMode) {
709
- // Fast CI mode: use simple fallback question for prompt
710
- questions = [{ description: 'Basic prompt test', args: {} }];
720
+ // Fast CI mode: use deterministic prompt tests
721
+ questions =
722
+ deterministicQuestions.length > 0
723
+ ? deterministicQuestions
724
+ : [{ description: 'Basic prompt test', args: {} }];
711
725
  }
712
726
  // If customScenariosOnly and no scenarios for this prompt, skip it
713
727
  for (const question of questions) {
@@ -791,13 +805,17 @@ export class Interviewer {
791
805
  // Generate resource questions (skip LLM in fast CI mode)
792
806
  let questions;
793
807
  if (this.config.checkMode || !primaryOrchestrator) {
794
- // Fast CI mode: use simple fallback question
795
- questions = [
796
- { description: 'Basic resource read test', category: 'happy_path' },
797
- ];
808
+ // Fast CI mode: use deterministic resource tests
809
+ const deterministic = generateResourceTests(resource, { maxTests: 2 });
810
+ questions =
811
+ deterministic.length > 0
812
+ ? deterministic
813
+ : [{ description: 'Basic resource read test', category: 'happy_path' }];
798
814
  }
799
815
  else {
800
- questions = await primaryOrchestrator.generateResourceQuestions(resource, 2);
816
+ const deterministic = generateResourceTests(resource, { maxTests: 2 });
817
+ const llmQuestions = await primaryOrchestrator.generateResourceQuestions(resource, 2);
818
+ questions = mergeResourceQuestions(deterministic, llmQuestions);
801
819
  }
802
820
  for (const question of questions) {
803
821
  const interactionStart = Date.now();
@@ -1944,6 +1962,30 @@ export class Interviewer {
1944
1962
  return { results, summary };
1945
1963
  }
1946
1964
  }
1965
/**
 * Build a dedup key for a prompt question from its description and args.
 *
 * Object keys inside `args` are sorted before serialisation so that two
 * questions carrying the same arguments in a different insertion order
 * produce the same key.
 */
function promptQuestionSignature(question) {
    const sortObjectKeys = (_key, value) => value !== null && typeof value === 'object' && !Array.isArray(value)
        ? Object.fromEntries(Object.keys(value)
            .sort()
            .map((k) => [k, value[k]]))
        : value;
    return JSON.stringify([question.description, question.args ?? null], sortObjectKeys);
}
/**
 * Append prompt questions from `additions` to `base`, skipping any whose
 * description and args already appear earlier in the merged list.
 *
 * `base` is never mutated and its entries (including duplicates already in
 * it) are kept as-is, in order.
 *
 * @param base - Questions that take precedence.
 * @param additions - Candidate questions to append when not already present.
 * @returns A new array: `base` followed by the unseen additions.
 */
function mergePromptQuestions(base, additions) {
    const merged = [...base];
    const seen = new Set(base.map((q) => promptQuestionSignature(q)));
    for (const candidate of additions) {
        const signature = promptQuestionSignature(candidate);
        if (!seen.has(signature)) {
            merged.push(candidate);
            seen.add(signature);
        }
    }
    return merged;
}
1977
/**
 * Combine two lists of resource questions, keeping everything in `base`
 * and appending only those `additions` whose description/category pair
 * has not been seen yet.
 *
 * @param base - Questions kept unconditionally, in order.
 * @param additions - Candidates appended when their pair is new.
 * @returns A new array; `base` itself is not modified.
 */
function mergeResourceQuestions(base, additions) {
    const keyOf = (question) => `${question.description}|${question.category}`;
    const seen = new Set(base.map((question) => keyOf(question)));
    const unseen = [...additions].filter((question) => {
        const key = keyOf(question);
        if (seen.has(key)) {
            return false;
        }
        seen.add(key);
        return true;
    });
    return [...base, ...unseen];
}
1947
1989
  function summarizeAssertions(interactions) {
1948
1990
  const allResults = interactions.filter((i) => !i.mocked).flatMap((i) => i.assertionResults ?? []);
1949
1991
  if (allResults.length === 0)
@@ -0,0 +1,12 @@
1
+ import type { MCPPrompt } from '../transport/types.js';
2
+ import type { PromptQuestion } from './types.js';
3
+ export interface PromptTestOptions {
4
+ /** Maximum number of tests to generate (generatePromptTests uses 3 when omitted) */
5
+ maxTests?: number;
6
+ }
7
+ /**
8
+ * Generate deterministic prompt tests based on prompt arguments.
9
+ * Focuses on valid inputs to avoid false negatives; the result holds at most `options.maxTests` questions.
10
+ */
11
+ export declare function generatePromptTests(prompt: MCPPrompt, options?: PromptTestOptions): PromptQuestion[];
12
+ //# sourceMappingURL=prompt-test-generator.d.ts.map
@@ -0,0 +1,77 @@
1
/**
 * Produce a small, deterministic list of prompt invocations derived from the
 * prompt's declared arguments.
 *
 * Up to three questions are built, in this order:
 *   1. a basic invocation carrying a sample value for every required argument;
 *   2. the same invocation plus the first two optional arguments (only when
 *      the prompt declares optional arguments);
 *   3. an invocation with alternate values for the required arguments (only
 *      when there is at least one required argument).
 *
 * @param prompt - Prompt definition; `prompt.arguments` may be absent.
 * @param options - `maxTests` caps the number of questions (defaults to 3).
 * @returns The generated questions, truncated to `maxTests`.
 */
export function generatePromptTests(prompt, options = {}) {
    const limit = options.maxTests ?? 3;
    const declared = prompt.arguments ?? [];
    const required = declared.filter((spec) => spec.required);
    const optional = declared.filter((spec) => !spec.required);
    const requiredArgs = {};
    required.forEach((spec) => {
        requiredArgs[spec.name] = generatePromptArgValue(spec.name, spec.description);
    });
    const tests = [
        {
            description: required.length === 0 ? 'Prompt invocation (no args)' : 'Basic prompt invocation',
            args: requiredArgs,
        },
    ];
    if (tests.length < limit && optional.length > 0) {
        const withOptional = { ...requiredArgs };
        optional.slice(0, 2).forEach((spec) => {
            withOptional[spec.name] = generatePromptArgValue(spec.name, spec.description);
        });
        tests.push({
            description: 'Prompt invocation with optional arguments',
            args: withOptional,
        });
    }
    if (tests.length < limit && Object.keys(requiredArgs).length > 0) {
        const alternates = {};
        required.forEach((spec) => {
            alternates[spec.name] = generateAlternateValue(requiredArgs[spec.name], spec.name);
        });
        tests.push({
            description: 'Prompt invocation with alternate values',
            args: alternates,
        });
    }
    return tests.slice(0, limit);
}
42
/**
 * Pick a plausible sample string for a prompt argument from its name and
 * description.
 *
 * The name and description are split into lowercase words (camelCase,
 * snake_case, kebab-case and punctuation are all treated as boundaries) and
 * compared against keyword lists as WHOLE words, with a simple plural `s`
 * allowed. Whole-word matching avoids the false positives of substring
 * matching, e.g. `width`/`video` being treated as identifiers, `security`
 * as a URI, or a description containing "update" as a date.
 *
 * Trade-off: an all-lowercase compound that is not listed below (for
 * example `starttime`) no longer matches and receives the generic value.
 *
 * Rules are evaluated in order; the first match wins.
 *
 * @param name - Argument name.
 * @param description - Optional argument description.
 * @returns A sample value; `'example'` when no rule matches.
 */
function generatePromptArgValue(name, description) {
    const toWords = (text) => new Set(String(text ?? '')
        .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
        .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
        .toLowerCase()
        .split(/[^a-z0-9]+/)
        .filter(Boolean));
    const mentions = (words, keywords) => keywords.some((kw) => words.has(kw) || words.has(`${kw}s`));
    const nameWords = toWords(name);
    const descWords = toWords(description);
    const rules = [
        { value: '/tmp/example.txt', inName: ['path', 'filepath', 'pathname'], inDesc: ['path', 'filepath', 'pathname'] },
        { value: 'https://example.com', inName: ['url', 'uri'], inDesc: ['url'] },
        { value: 'test@example.com', inName: ['email'], inDesc: ['email'] },
        { value: '2024-01-15', inName: ['date'], inDesc: ['date'] },
        { value: '2024-01-15T14:30:00Z', inName: ['time', 'timestamp', 'datetime'], inDesc: ['time', 'timestamp', 'datetime'] },
        { value: 'id_123', inName: ['id'], inDesc: ['identifier'] },
        { value: 'example query', inName: ['query'], inDesc: ['search'] },
    ];
    for (const rule of rules) {
        if (mentions(nameWords, rule.inName) || mentions(descWords, rule.inDesc)) {
            return rule.value;
        }
    }
    return 'example';
}
62
/**
 * Produce a second, still-valid sample value for an argument, given the
 * first sample and the argument name.
 *
 * The shape of the current value is inspected first (URL, absolute path,
 * e-mail, ISO timestamp, ISO date) so the alternate keeps the same format.
 * Date and timestamp values get a different date rather than an `-alt`
 * suffix, which would make them unparseable. Only when no shape matches is
 * the argument name consulted for the identifier rule, so a date-valued
 * argument whose name happens to contain "id" (e.g. `valid_date`) stays a
 * date.
 *
 * @param value - Current sample value; non-strings are coerced to a string.
 * @param name - Argument name.
 * @returns An alternate sample value.
 */
function generateAlternateValue(value, name) {
    const text = typeof value === 'string' ? value : String(value ?? '');
    if (text.startsWith('http')) {
        return 'https://example.org';
    }
    if (text.startsWith('/')) {
        return '/tmp/alternate.txt';
    }
    if (text.includes('@')) {
        return 'user@example.org';
    }
    // Keep temporal samples parseable: swap the date instead of appending "-alt".
    if (/^\d{4}-\d{2}-\d{2}T/.test(text)) {
        return '2024-02-20T09:45:00Z';
    }
    if (/^\d{4}-\d{2}-\d{2}$/.test(text)) {
        return '2024-02-20';
    }
    if (String(name ?? '').toLowerCase().includes('id')) {
        return 'id_456';
    }
    return `${text}-alt`;
}
77
+ //# sourceMappingURL=prompt-test-generator.js.map
@@ -0,0 +1,12 @@
1
+ import type { MCPResource } from '../transport/types.js';
2
+ import type { ResourceQuestion } from './types.js';
3
+ export interface ResourceTestOptions {
4
+ /** Maximum number of tests to generate (generateResourceTests uses 2 when omitted) */
5
+ maxTests?: number;
6
+ }
7
+ /**
8
+ * Generate deterministic resource tests.
9
+ * Since resource reads are URI-based with no args, tests focus on consistency: a basic read and, when `maxTests` allows, a repeated read.
10
+ */
11
+ export declare function generateResourceTests(resource: MCPResource, options?: ResourceTestOptions): ResourceQuestion[];
12
+ //# sourceMappingURL=resource-test-generator.d.ts.map
@@ -0,0 +1,20 @@
1
/**
 * Build the deterministic question list for a resource.
 *
 * A resource read takes no arguments, so the list is fixed: one basic read
 * and, when the limit permits, one repeated read. Both descriptions embed
 * `resource.name`.
 *
 * @param resource - Resource definition; only `name` is read.
 * @param options - `maxTests` caps the number of questions (defaults to 2).
 * @returns The generated questions, truncated to `maxTests`.
 */
export function generateResourceTests(resource, options = {}) {
    const limit = options.maxTests ?? 2;
    const candidates = [
        {
            description: `Basic resource read (${resource.name})`,
            category: 'happy_path',
        },
    ];
    if (limit > candidates.length) {
        candidates.push({
            description: `Repeated resource read (${resource.name})`,
            category: 'edge_case',
        });
    }
    return candidates.slice(0, limit);
}
20
+ //# sourceMappingURL=resource-test-generator.js.map
@@ -1,5 +1,5 @@
1
1
  import { createHash } from 'crypto';
2
- import { inferSchemaFromValue, computeInferredSchemaHash } from '../baseline/response-fingerprint.js';
2
+ import { inferSchemaFromValue, computeInferredSchemaHash, } from '../baseline/response-fingerprint.js';
3
3
  /**
4
4
  * Infer a response schema from an MCP tool response.
5
5
  */
@@ -26,7 +26,9 @@ export function inferResponseSchema(response) {
26
26
  };
27
27
  }
28
28
  const markdownStructure = detectMarkdownStructure(textContent);
29
- if (markdownStructure.hasHeaders || markdownStructure.hasTables || markdownStructure.hasCodeBlocks) {
29
+ if (markdownStructure.hasHeaders ||
30
+ markdownStructure.hasTables ||
31
+ markdownStructure.hasCodeBlocks) {
30
32
  return {
31
33
  inferredType: 'markdown',
32
34
  markdownStructure,
@@ -43,10 +45,16 @@ export function extractTextContent(response) {
43
45
  return null;
44
46
  }
45
47
  const textBlocks = response.content
46
- .filter((c) => c.type === 'text' && typeof c.text === 'string')
48
+ .filter((c) => typeof c.text === 'string')
47
49
  .map((c) => c.text);
48
50
  if (textBlocks.length === 0) {
49
- return null;
51
+ const decodedBlocks = response.content
52
+ .map((c) => decodeDataBlock(c.data, c.mimeType))
53
+ .filter((v) => typeof v === 'string');
54
+ if (decodedBlocks.length === 0) {
55
+ return null;
56
+ }
57
+ return decodedBlocks.join('\n');
50
58
  }
51
59
  return textBlocks.join('\n');
52
60
  }
@@ -68,4 +76,18 @@ function detectMarkdownStructure(text) {
68
76
  function hashString(value) {
69
77
  return createHash('sha256').update(value).digest('hex');
70
78
  }
79
/**
 * Decode a base64 `data` payload into text when its MIME type says it is
 * textual (any type containing "json", or any `text/*` type).
 *
 * Node's base64 decoder and UTF-8 stringifier never throw: characters
 * outside the base64 alphabet are ignored and invalid byte sequences become
 * U+FFFD. The payload is therefore validated explicitly so that data which
 * is not base64, or which does not decode to well-formed UTF-8, yields
 * `null` instead of garbage text.
 *
 * @param data - Candidate base64 string (standard or URL-safe alphabet;
 *   embedded whitespace is tolerated).
 * @param mimeType - MIME type reported alongside the data.
 * @returns The decoded text, or `null` when the block is missing, is not a
 *   textual MIME type, is not base64, or is not valid UTF-8.
 */
function decodeDataBlock(data, mimeType) {
    if (!data || typeof data !== 'string')
        return null;
    const mime = typeof mimeType === 'string' ? mimeType.toLowerCase() : '';
    if (!mime.includes('json') && !mime.startsWith('text/')) {
        return null;
    }
    const compact = data.replace(/\s+/g, '');
    if (!/^[A-Za-z0-9+\/_-]*={0,2}$/.test(compact)) {
        return null;
    }
    const bytes = Buffer.from(compact, 'base64');
    const text = bytes.toString('utf8');
    // Re-encoding reproduces the original bytes only when they were valid UTF-8.
    if (!Buffer.from(text, 'utf8').equals(bytes)) {
        return null;
    }
    return text;
}
71
93
  //# sourceMappingURL=schema-inferrer.js.map