@dotsetlabs/bellwether 1.0.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +74 -0
- package/README.md +8 -2
- package/dist/baseline/accessors.d.ts +1 -1
- package/dist/baseline/accessors.js +1 -3
- package/dist/baseline/baseline-format.d.ts +287 -0
- package/dist/baseline/baseline-format.js +12 -0
- package/dist/baseline/comparator.js +249 -11
- package/dist/baseline/converter.d.ts +15 -15
- package/dist/baseline/converter.js +46 -34
- package/dist/baseline/diff.d.ts +1 -1
- package/dist/baseline/diff.js +45 -28
- package/dist/baseline/error-analyzer.d.ts +1 -1
- package/dist/baseline/error-analyzer.js +90 -17
- package/dist/baseline/incremental-checker.js +8 -5
- package/dist/baseline/index.d.ts +2 -12
- package/dist/baseline/index.js +3 -23
- package/dist/baseline/performance-tracker.d.ts +0 -1
- package/dist/baseline/performance-tracker.js +13 -20
- package/dist/baseline/response-fingerprint.js +39 -2
- package/dist/baseline/saver.js +41 -10
- package/dist/baseline/schema-compare.d.ts +22 -0
- package/dist/baseline/schema-compare.js +259 -16
- package/dist/baseline/types.d.ts +10 -7
- package/dist/cache/response-cache.d.ts +8 -0
- package/dist/cache/response-cache.js +110 -0
- package/dist/cli/commands/check.js +23 -6
- package/dist/cli/commands/explore.js +34 -14
- package/dist/cli/index.js +8 -0
- package/dist/config/template.js +8 -7
- package/dist/config/validator.d.ts +59 -59
- package/dist/config/validator.js +245 -90
- package/dist/constants/core.d.ts +4 -0
- package/dist/constants/core.js +8 -19
- package/dist/constants/registry.d.ts +17 -0
- package/dist/constants/registry.js +18 -0
- package/dist/constants/testing.d.ts +0 -369
- package/dist/constants/testing.js +18 -456
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/docs/contract.js +131 -83
- package/dist/docs/report.js +8 -5
- package/dist/interview/insights.d.ts +17 -0
- package/dist/interview/insights.js +52 -0
- package/dist/interview/interviewer.js +52 -10
- package/dist/interview/prompt-test-generator.d.ts +12 -0
- package/dist/interview/prompt-test-generator.js +77 -0
- package/dist/interview/resource-test-generator.d.ts +12 -0
- package/dist/interview/resource-test-generator.js +20 -0
- package/dist/interview/schema-inferrer.js +26 -4
- package/dist/interview/schema-test-generator.js +278 -31
- package/dist/interview/stateful-test-runner.d.ts +3 -0
- package/dist/interview/stateful-test-runner.js +80 -0
- package/dist/interview/types.d.ts +12 -0
- package/dist/transport/mcp-client.js +1 -1
- package/dist/transport/sse-transport.d.ts +7 -3
- package/dist/transport/sse-transport.js +157 -67
- package/dist/version.js +1 -1
- package/man/bellwether.1 +1 -1
- package/man/bellwether.1.md +2 -2
- package/package.json +1 -1
- package/schemas/bellwether-check.schema.json +185 -0
- package/schemas/bellwether-explore.schema.json +837 -0
- package/scripts/completions/bellwether.bash +10 -4
- package/scripts/completions/bellwether.zsh +55 -2
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { generateSemanticTests } from '../validation/semantic-test-generator.js';
|
|
2
|
+
import { SEMANTIC_VALIDATION } from '../constants.js';
|
|
3
|
+
import { analyzeResponses } from '../baseline/response-fingerprint.js';
|
|
4
|
+
import { buildSchemaEvolution } from '../baseline/response-schema-tracker.js';
|
|
5
|
+
import { generateErrorSummary } from '../baseline/error-analyzer.js';
|
|
6
|
+
import { scoreDocumentation } from '../baseline/documentation-scorer.js';
|
|
7
|
+
/**
 * Derive secondary insights from a completed interview result.
 *
 * The output is meant for documentation and JSON report enrichment. Three of
 * the four returned fields are keyed by tool/profile name and are reported as
 * `undefined` (rather than an empty object) when nothing was collected.
 *
 * @param result Interview result; reads `result.discovery.tools` and
 *   `result.toolProfiles` (each profile's `name` and `interactions`).
 * @returns Object with `semanticInferences`, `schemaEvolution`,
 *   `errorAnalysisSummaries` and `documentationScore`.
 */
export function buildInterviewInsights(result) {
    // Collapse an empty lookup table to `undefined` so it is omitted downstream.
    const orUndefined = (table) => (Object.keys(table).length > 0 ? table : undefined);
    // Per-tool semantic inferences; tools without any inference are left out.
    const inferencesByTool = {};
    for (const tool of result.discovery.tools) {
        const { inferences } = generateSemanticTests(tool, {
            minConfidence: SEMANTIC_VALIDATION.MIN_CONFIDENCE_THRESHOLD,
            maxInvalidValuesPerParam: SEMANTIC_VALIDATION.MAX_INVALID_VALUES_PER_PARAM,
            skipSemanticTests: false,
        });
        if (inferences.length > 0) {
            inferencesByTool[tool.name] = inferences;
        }
    }
    const evolutionByTool = {};
    const errorSummaryByTool = {};
    for (const profile of result.toolProfiles) {
        // Mocked interactions are excluded from the response analysis.
        const observed = profile.interactions.flatMap((interaction) => interaction.mocked
            ? []
            : [{ response: interaction.response, error: interaction.error }]);
        const { schemas, errorPatterns } = analyzeResponses(observed);
        if (schemas.length > 0) {
            evolutionByTool[profile.name] = buildSchemaEvolution(schemas);
        }
        if (errorPatterns.length > 0) {
            const summary = generateErrorSummary(profile.name, errorPatterns);
            // A Map would not survive JSON serialization, so flatten it to a plain object.
            let categoryCounts = summary.categoryCounts;
            if (categoryCounts instanceof Map) {
                categoryCounts = Object.fromEntries(categoryCounts.entries());
            }
            errorSummaryByTool[profile.name] = { ...summary, categoryCounts };
        }
    }
    const documentationScore = scoreDocumentation(result.discovery.tools);
    return {
        semanticInferences: orUndefined(inferencesByTool),
        schemaEvolution: orUndefined(evolutionByTool),
        errorAnalysisSummaries: orUndefined(errorSummaryByTool),
        documentationScore,
    };
}
|
|
52
|
+
//# sourceMappingURL=insights.js.map
|
|
@@ -6,6 +6,8 @@ import { evaluateAssertions } from '../scenarios/evaluator.js';
|
|
|
6
6
|
import { withTimeout, DEFAULT_TIMEOUTS, parallelLimit, createMutex } from '../utils/index.js';
|
|
7
7
|
import { INTERVIEW, WORKFLOW, DISPLAY_LIMITS, SCHEMA_TESTING, OUTCOME_ASSESSMENT, } from '../constants.js';
|
|
8
8
|
import { generateSchemaTests } from './schema-test-generator.js';
|
|
9
|
+
import { generatePromptTests } from './prompt-test-generator.js';
|
|
10
|
+
import { generateResourceTests } from './resource-test-generator.js';
|
|
9
11
|
import { WorkflowDiscoverer } from '../workflow/discovery.js';
|
|
10
12
|
import { WorkflowExecutor } from '../workflow/executor.js';
|
|
11
13
|
import { RateLimiter, calculateBackoffMs, isRateLimitError } from './rate-limiter.js';
|
|
@@ -682,6 +684,9 @@ export class Interviewer {
|
|
|
682
684
|
const promptInteractions = [];
|
|
683
685
|
// Check for custom scenarios for this prompt
|
|
684
686
|
const customScenarios = this.getScenariosForPrompt(prompt.name);
|
|
687
|
+
const deterministicQuestions = this.config.customScenariosOnly
|
|
688
|
+
? []
|
|
689
|
+
: generatePromptTests(prompt, { maxTests: this.config.checkMode ? 3 : 2 });
|
|
685
690
|
// Build questions list - custom scenarios + LLM-generated (unless customScenariosOnly)
|
|
686
691
|
let questions = [];
|
|
687
692
|
if (customScenarios.length > 0) {
|
|
@@ -693,21 +698,30 @@ export class Interviewer {
|
|
|
693
698
|
description: s.description,
|
|
694
699
|
args: s.args,
|
|
695
700
|
}));
|
|
701
|
+
// Add deterministic prompt tests
|
|
702
|
+
if (deterministicQuestions.length > 0) {
|
|
703
|
+
questions = mergePromptQuestions(questions, deterministicQuestions);
|
|
704
|
+
}
|
|
696
705
|
// If not custom-only mode and not fast CI mode, also generate LLM questions
|
|
697
706
|
if (!this.config.customScenariosOnly && !this.config.checkMode && primaryOrchestrator) {
|
|
698
707
|
const llmQuestions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
|
|
699
|
-
questions =
|
|
708
|
+
questions = mergePromptQuestions(questions, llmQuestions);
|
|
700
709
|
}
|
|
701
710
|
}
|
|
702
711
|
else if (!this.config.customScenariosOnly &&
|
|
703
712
|
!this.config.checkMode &&
|
|
704
713
|
primaryOrchestrator) {
|
|
705
|
-
// No custom scenarios -
|
|
706
|
-
questions =
|
|
714
|
+
// No custom scenarios - deterministic tests + LLM questions
|
|
715
|
+
questions = mergePromptQuestions(questions, deterministicQuestions);
|
|
716
|
+
const llmQuestions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
|
|
717
|
+
questions = mergePromptQuestions(questions, llmQuestions);
|
|
707
718
|
}
|
|
708
719
|
else if (this.config.checkMode) {
|
|
709
|
-
// Fast CI mode: use
|
|
710
|
-
questions =
|
|
720
|
+
// Fast CI mode: use deterministic prompt tests
|
|
721
|
+
questions =
|
|
722
|
+
deterministicQuestions.length > 0
|
|
723
|
+
? deterministicQuestions
|
|
724
|
+
: [{ description: 'Basic prompt test', args: {} }];
|
|
711
725
|
}
|
|
712
726
|
// If customScenariosOnly and no scenarios for this prompt, skip it
|
|
713
727
|
for (const question of questions) {
|
|
@@ -791,13 +805,17 @@ export class Interviewer {
|
|
|
791
805
|
// Generate resource questions (skip LLM in fast CI mode)
|
|
792
806
|
let questions;
|
|
793
807
|
if (this.config.checkMode || !primaryOrchestrator) {
|
|
794
|
-
// Fast CI mode: use
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
808
|
+
// Fast CI mode: use deterministic resource tests
|
|
809
|
+
const deterministic = generateResourceTests(resource, { maxTests: 2 });
|
|
810
|
+
questions =
|
|
811
|
+
deterministic.length > 0
|
|
812
|
+
? deterministic
|
|
813
|
+
: [{ description: 'Basic resource read test', category: 'happy_path' }];
|
|
798
814
|
}
|
|
799
815
|
else {
|
|
800
|
-
|
|
816
|
+
const deterministic = generateResourceTests(resource, { maxTests: 2 });
|
|
817
|
+
const llmQuestions = await primaryOrchestrator.generateResourceQuestions(resource, 2);
|
|
818
|
+
questions = mergeResourceQuestions(deterministic, llmQuestions);
|
|
801
819
|
}
|
|
802
820
|
for (const question of questions) {
|
|
803
821
|
const interactionStart = Date.now();
|
|
@@ -1944,6 +1962,30 @@ export class Interviewer {
|
|
|
1944
1962
|
return { results, summary };
|
|
1945
1963
|
}
|
|
1946
1964
|
}
|
|
1965
|
+
/**
 * Append prompt questions from `additions` to a copy of `base`, skipping any
 * question that is already present.
 *
 * Two questions are considered the same when they have the same description
 * and structurally equal `args`. Object key order inside `args` is ignored
 * (array order is still significant), so `{a: 1, b: 2}` and `{b: 2, a: 1}`
 * are treated as one test instead of being executed twice.
 *
 * @param base Questions to keep as-is; never mutated. Duplicates already
 *   inside `base` are not removed.
 * @param additions Candidate questions appended in order when not yet seen.
 * @returns A new array: `base` followed by the unseen additions.
 */
function mergePromptQuestions(base, additions) {
    // Rebuild plain objects with sorted keys so serialization is order-independent.
    // Values that define their own JSON form (e.g. Date) are left untouched.
    const canonicalize = (value) => {
        if (Array.isArray(value)) {
            return value.map(canonicalize);
        }
        if (value !== null && typeof value === 'object' && typeof value.toJSON !== 'function') {
            return Object.fromEntries(Object.keys(value)
                .sort()
                .map((key) => [key, canonicalize(value[key])]));
        }
        return value;
    };
    // Serializing the pair as a JSON array avoids the ambiguity of joining
    // description and args with a separator that may occur in either part.
    const signatureOf = (q) => JSON.stringify([q.description, canonicalize(q.args)]);
    const merged = [...base];
    const signatures = new Set(base.map((q) => signatureOf(q)));
    for (const q of additions) {
        const sig = signatureOf(q);
        if (!signatures.has(sig)) {
            merged.push(q);
            signatures.add(sig);
        }
    }
    return merged;
}
|
|
1977
|
+
/**
 * Combine two lists of resource questions without repeating a question.
 *
 * A question is identified by its description together with its category.
 * Everything in `base` is kept (even internal duplicates); entries from
 * `additions` are appended in order unless an identical question was
 * already seen.
 *
 * @param base Questions that always appear first in the result; not mutated.
 * @param additions Candidate questions to append.
 * @returns A new array with `base` followed by the unseen additions.
 */
function mergeResourceQuestions(base, additions) {
    const keyOf = (question) => `${question.description}|${question.category}`;
    const seen = new Set(base.map((question) => keyOf(question)));
    const fresh = [...additions].filter((question) => {
        const key = keyOf(question);
        if (seen.has(key)) {
            return false;
        }
        seen.add(key);
        return true;
    });
    return [...base, ...fresh];
}
|
|
1947
1989
|
function summarizeAssertions(interactions) {
|
|
1948
1990
|
const allResults = interactions.filter((i) => !i.mocked).flatMap((i) => i.assertionResults ?? []);
|
|
1949
1991
|
if (allResults.length === 0)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { MCPPrompt } from '../transport/types.js';
|
|
2
|
+
import type { PromptQuestion } from './types.js';
|
|
3
|
+
export interface PromptTestOptions {
|
|
4
|
+
/** Maximum number of tests to generate; the generator falls back to 3 when this is omitted. */
|
|
5
|
+
maxTests?: number;
|
|
6
|
+
}
|
|
7
|
+
/**
|
|
8
|
+
* Generate deterministic prompt tests based on prompt arguments.
|
|
9
|
+
* Focuses on valid inputs to avoid false negatives.
|
|
10
|
+
*/
|
|
11
|
+
export declare function generatePromptTests(prompt: MCPPrompt, options?: PromptTestOptions): PromptQuestion[];
|
|
12
|
+
//# sourceMappingURL=prompt-test-generator.d.ts.map
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
 * Build a small, deterministic set of prompt invocations from the prompt's
 * declared arguments. Only plausible, valid-looking values are produced so
 * that a failing invocation is not caused by the test input itself.
 *
 * Up to three invocations are produced, in this order:
 *   1. required arguments only,
 *   2. required arguments plus the first two optional ones (if any exist),
 *   3. required arguments with alternate values (if any are required).
 *
 * @param prompt Prompt definition; `prompt.arguments` may be absent.
 * @param options `maxTests` caps the number of invocations (3 when omitted).
 * @returns Array of `{ description, args }` questions.
 */
export function generatePromptTests(prompt, options = {}) {
    const limit = options.maxTests ?? 3;
    const declared = prompt.arguments ?? [];
    const required = declared.filter((spec) => spec.required);
    const optional = declared.filter((spec) => !spec.required);
    const requiredValues = {};
    required.forEach((spec) => {
        requiredValues[spec.name] = generatePromptArgValue(spec.name, spec.description);
    });
    // The baseline invocation is always emitted first.
    const tests = [
        {
            description: required.length > 0 ? 'Basic prompt invocation' : 'Prompt invocation (no args)',
            args: requiredValues,
        },
    ];
    const hasRoom = () => tests.length < limit;
    if (hasRoom() && optional.length > 0) {
        const withOptional = { ...requiredValues };
        // Only the first two optional arguments are exercised.
        optional.slice(0, 2).forEach((spec) => {
            withOptional[spec.name] = generatePromptArgValue(spec.name, spec.description);
        });
        tests.push({
            description: 'Prompt invocation with optional arguments',
            args: withOptional,
        });
    }
    if (hasRoom() && Object.keys(requiredValues).length > 0) {
        const alternates = {};
        required.forEach((spec) => {
            alternates[spec.name] = generateAlternateValue(requiredValues[spec.name], spec.name);
        });
        tests.push({
            description: 'Prompt invocation with alternate values',
            args: alternates,
        });
    }
    return tests.slice(0, limit);
}
|
|
42
|
+
/**
 * Pick a plausible sample value for a prompt argument from its name and
 * description.
 *
 * Matching is done on whole words: the name and description are split on
 * camelCase boundaries and on any non-letter character before keywords are
 * looked up. This keeps `userId`, `file_path` or `startDate` matching while
 * avoiding substring accidents such as "update"/"candidate" (date),
 * "video"/"provider" (id), "timeout" (time) or "security" (uri).
 *
 * Rules are evaluated in order and the first match wins:
 * path, url/uri, email, date, time, id, query; otherwise `'example'`.
 *
 * @param name Argument name.
 * @param description Optional argument description.
 * @returns A sample string value.
 */
function generatePromptArgValue(name, description) {
    // Lower-cased set of the words found in an identifier or sentence.
    const wordsOf = (text) => new Set(String(text ?? '')
        .replace(/([a-z0-9])([A-Z])/g, '$1 $2') // fooBar -> foo Bar
        .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') // URLPath -> URL Path
        .toLowerCase()
        .split(/[^a-z]+/)
        .filter(Boolean));
    const nameWords = wordsOf(name);
    const descWords = wordsOf(description);
    const pathWords = ['path', 'paths', 'filepath', 'pathname'];
    const urlWords = ['url', 'urls', 'uri', 'uris'];
    const emailWords = ['email', 'emails'];
    const dateWords = ['date', 'dates'];
    const timeWords = ['time', 'timestamp', 'datetime'];
    const rules = [
        { inName: pathWords, inDesc: pathWords, value: '/tmp/example.txt' },
        { inName: urlWords, inDesc: urlWords, value: 'https://example.com' },
        { inName: emailWords, inDesc: emailWords, value: 'test@example.com' },
        { inName: dateWords, inDesc: dateWords, value: '2024-01-15' },
        { inName: timeWords, inDesc: timeWords, value: '2024-01-15T14:30:00Z' },
        {
            inName: ['id', 'ids', 'uuid', 'guid', 'identifier'],
            inDesc: ['identifier', 'identifiers'],
            value: 'id_123',
        },
        {
            inName: ['query', 'queries'],
            inDesc: ['search', 'searches', 'searching'],
            value: 'example query',
        },
    ];
    for (const rule of rules) {
        const matchesName = rule.inName.some((word) => nameWords.has(word));
        const matchesDesc = rule.inDesc.some((word) => descWords.has(word));
        if (matchesName || matchesDesc) {
            return rule.value;
        }
    }
    return 'example';
}
|
|
62
|
+
/**
 * Produce a second, different sample value of the same kind as `value`.
 *
 * The kind is recognised from the value's shape first (URL, absolute path,
 * email, ISO datetime, ISO date). ISO dates and datetimes get a real
 * alternate date instead of a `-alt` suffix, which would make them invalid,
 * and they are checked before the name-based id fallback so that an argument
 * such as `valid_date` is not answered with an id.
 *
 * @param value Sample value previously generated for the argument.
 * @param name Argument name; used only for the id fallback.
 * @returns An alternate sample string.
 */
function generateAlternateValue(value, name) {
    const text = String(value ?? '');
    if (text.startsWith('http')) {
        return 'https://example.org';
    }
    if (text.startsWith('/')) {
        return '/tmp/alternate.txt';
    }
    if (text.includes('@')) {
        return 'user@example.org';
    }
    // Datetime must be tested before the plain date, which is its prefix.
    if (/^\d{4}-\d{2}-\d{2}T/.test(text)) {
        return '2024-02-20T09:45:00Z';
    }
    if (/^\d{4}-\d{2}-\d{2}$/.test(text)) {
        return '2024-02-20';
    }
    if (String(name ?? '').toLowerCase().includes('id')) {
        return 'id_456';
    }
    return `${text}-alt`;
}
|
|
77
|
+
//# sourceMappingURL=prompt-test-generator.js.map
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { MCPResource } from '../transport/types.js';
|
|
2
|
+
import type { ResourceQuestion } from './types.js';
|
|
3
|
+
export interface ResourceTestOptions {
|
|
4
|
+
/** Maximum number of tests to generate; the generator falls back to 2 when this is omitted. */
|
|
5
|
+
maxTests?: number;
|
|
6
|
+
}
|
|
7
|
+
/**
|
|
8
|
+
* Generate deterministic resource tests.
|
|
9
|
+
* Since resource reads are URI-based with no args, tests focus on consistency.
|
|
10
|
+
*/
|
|
11
|
+
export declare function generateResourceTests(resource: MCPResource, options?: ResourceTestOptions): ResourceQuestion[];
|
|
12
|
+
//# sourceMappingURL=resource-test-generator.d.ts.map
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Produce deterministic read tests for a resource.
 *
 * Resource reads take no arguments beyond the URI, so the generated tests
 * are a plain read followed by a repeated read; the second one is only
 * included when the limit leaves room for it.
 *
 * @param resource Resource definition; only `resource.name` is read.
 * @param options `maxTests` caps the number of tests (2 when omitted).
 * @returns Array of `{ description, category }` questions.
 */
export function generateResourceTests(resource, options = {}) {
    const limit = options.maxTests ?? 2;
    const basicRead = {
        description: `Basic resource read (${resource.name})`,
        category: 'happy_path',
    };
    const tests = [
        basicRead,
        // The repeated read is only built when more than one test is allowed.
        ...(1 < limit
            ? [
                {
                    description: `Repeated resource read (${resource.name})`,
                    category: 'edge_case',
                },
            ]
            : []),
    ];
    return tests.slice(0, limit);
}
|
|
20
|
+
//# sourceMappingURL=resource-test-generator.js.map
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { createHash } from 'crypto';
|
|
2
|
-
import { inferSchemaFromValue, computeInferredSchemaHash } from '../baseline/response-fingerprint.js';
|
|
2
|
+
import { inferSchemaFromValue, computeInferredSchemaHash, } from '../baseline/response-fingerprint.js';
|
|
3
3
|
/**
|
|
4
4
|
* Infer a response schema from an MCP tool response.
|
|
5
5
|
*/
|
|
@@ -26,7 +26,9 @@ export function inferResponseSchema(response) {
|
|
|
26
26
|
};
|
|
27
27
|
}
|
|
28
28
|
const markdownStructure = detectMarkdownStructure(textContent);
|
|
29
|
-
if (markdownStructure.hasHeaders ||
|
|
29
|
+
if (markdownStructure.hasHeaders ||
|
|
30
|
+
markdownStructure.hasTables ||
|
|
31
|
+
markdownStructure.hasCodeBlocks) {
|
|
30
32
|
return {
|
|
31
33
|
inferredType: 'markdown',
|
|
32
34
|
markdownStructure,
|
|
@@ -43,10 +45,16 @@ export function extractTextContent(response) {
|
|
|
43
45
|
return null;
|
|
44
46
|
}
|
|
45
47
|
const textBlocks = response.content
|
|
46
|
-
.filter((c) =>
|
|
48
|
+
.filter((c) => typeof c.text === 'string')
|
|
47
49
|
.map((c) => c.text);
|
|
48
50
|
if (textBlocks.length === 0) {
|
|
49
|
-
|
|
51
|
+
const decodedBlocks = response.content
|
|
52
|
+
.map((c) => decodeDataBlock(c.data, c.mimeType))
|
|
53
|
+
.filter((v) => typeof v === 'string');
|
|
54
|
+
if (decodedBlocks.length === 0) {
|
|
55
|
+
return null;
|
|
56
|
+
}
|
|
57
|
+
return decodedBlocks.join('\n');
|
|
50
58
|
}
|
|
51
59
|
return textBlocks.join('\n');
|
|
52
60
|
}
|
|
@@ -68,4 +76,18 @@ function detectMarkdownStructure(text) {
|
|
|
68
76
|
function hashString(value) {
|
|
69
77
|
return createHash('sha256').update(value).digest('hex');
|
|
70
78
|
}
|
|
79
|
+
/**
 * Decode the base64 `data` of a content block into text, when the block's
 * MIME type indicates textual content (anything containing "json" or
 * starting with "text/").
 *
 * `Buffer.from(data, 'base64')` never throws: it silently skips characters
 * outside the base64 alphabet, and `toString('utf8')` replaces invalid bytes
 * with U+FFFD. The input is therefore validated explicitly and decoded with
 * a fatal UTF-8 decoder, so malformed base64 or binary payloads yield `null`
 * instead of garbage text being fed into schema inference.
 *
 * @param data Base64 (or base64url) encoded payload; whitespace is ignored.
 * @param mimeType MIME type of the block; matched case-insensitively.
 * @returns The decoded text, or `null` when the block is not textual, is
 *   empty, is not valid base64, or does not decode to valid UTF-8.
 */
function decodeDataBlock(data, mimeType) {
    if (typeof data !== 'string' || data.length === 0) {
        return null;
    }
    const mime = String(mimeType ?? '').toLowerCase();
    if (!mime.includes('json') && !mime.startsWith('text/')) {
        return null;
    }
    // Line-wrapped base64 is legal; strip whitespace before validating.
    const compact = data.replace(/\s+/g, '');
    if (!/^[A-Za-z0-9+/_-]+={0,2}$/.test(compact)) {
        return null;
    }
    // A single leftover character can never encode a whole byte.
    if (compact.replace(/=+$/, '').length % 4 === 1) {
        return null;
    }
    try {
        // ignoreBOM keeps a leading BOM in the output, like Buffer#toString does.
        const decoder = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true });
        return decoder.decode(Buffer.from(compact, 'base64'));
    }
    catch {
        // Bytes are not valid UTF-8: treat the block as non-textual.
        return null;
    }
}
|
|
71
93
|
//# sourceMappingURL=schema-inferrer.js.map
|