@dotsetlabs/bellwether 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +3 -2
- package/dist/cache/response-cache.d.ts +4 -2
- package/dist/cache/response-cache.js +68 -30
- package/dist/cli/commands/check.js +78 -49
- package/dist/cli/index.js +5 -3
- package/dist/interview/interviewer.js +70 -50
- package/dist/interview/orchestrator.js +49 -22
- package/dist/llm/anthropic.js +49 -16
- package/dist/llm/client.d.ts +2 -0
- package/dist/llm/client.js +61 -0
- package/dist/llm/ollama.js +9 -4
- package/dist/llm/openai.js +34 -23
- package/dist/transport/base-transport.d.ts +1 -1
- package/dist/transport/http-transport.d.ts +2 -2
- package/dist/transport/http-transport.js +26 -6
- package/dist/transport/mcp-client.d.ts +18 -6
- package/dist/transport/mcp-client.js +49 -19
- package/dist/transport/sse-transport.d.ts +1 -1
- package/dist/transport/sse-transport.js +4 -2
- package/dist/transport/stdio-transport.d.ts +1 -1
- package/dist/transport/stdio-transport.js +1 -1
- package/dist/utils/timeout.d.ts +10 -2
- package/dist/utils/timeout.js +9 -5
- package/dist/version.js +1 -1
- package/dist/workflow/executor.js +18 -13
- package/dist/workflow/loader.js +4 -1
- package/dist/workflow/state-tracker.js +22 -18
- package/man/bellwether.1 +204 -0
- package/man/bellwether.1.md +148 -0
- package/package.json +6 -7
|
@@ -4,7 +4,7 @@ import { DEFAULT_PERSONA } from '../persona/builtins.js';
|
|
|
4
4
|
import { getLogger, startTiming } from '../logging/logger.js';
|
|
5
5
|
import { evaluateAssertions } from '../scenarios/evaluator.js';
|
|
6
6
|
import { withTimeout, DEFAULT_TIMEOUTS, parallelLimit, createMutex } from '../utils/index.js';
|
|
7
|
-
import { INTERVIEW, WORKFLOW, DISPLAY_LIMITS, SCHEMA_TESTING, OUTCOME_ASSESSMENT } from '../constants.js';
|
|
7
|
+
import { INTERVIEW, WORKFLOW, DISPLAY_LIMITS, SCHEMA_TESTING, OUTCOME_ASSESSMENT, } from '../constants.js';
|
|
8
8
|
import { generateSchemaTests } from './schema-test-generator.js';
|
|
9
9
|
import { WorkflowDiscoverer } from '../workflow/discovery.js';
|
|
10
10
|
import { WorkflowExecutor } from '../workflow/executor.js';
|
|
@@ -62,7 +62,8 @@ export class Interviewer {
|
|
|
62
62
|
// Use multiple personas by default for better coverage
|
|
63
63
|
// Fall back to DEFAULT_PERSONAS if no personas provided or empty array
|
|
64
64
|
const providedPersonas = config?.personas;
|
|
65
|
-
this.personas =
|
|
65
|
+
this.personas =
|
|
66
|
+
providedPersonas && providedPersonas.length > 0 ? providedPersonas : DEFAULT_PERSONAS;
|
|
66
67
|
// Store cache reference for tool response and analysis caching
|
|
67
68
|
this.cache = config?.cache;
|
|
68
69
|
if (this.config.rateLimit?.enabled) {
|
|
@@ -270,12 +271,12 @@ export class Interviewer {
|
|
|
270
271
|
};
|
|
271
272
|
// Look for tools that reveal server constraints
|
|
272
273
|
for (const toolName of INTERVIEW.CONSTRAINT_DISCOVERY_TOOLS) {
|
|
273
|
-
const tool = discovery.tools.find(t => t.name === toolName);
|
|
274
|
+
const tool = discovery.tools.find((t) => t.name === toolName);
|
|
274
275
|
if (tool) {
|
|
275
276
|
try {
|
|
276
277
|
const result = await client.callTool(toolName, {});
|
|
277
278
|
if (result?.content) {
|
|
278
|
-
const textContent = result.content.find(c => c.type === 'text');
|
|
279
|
+
const textContent = result.content.find((c) => c.type === 'text');
|
|
279
280
|
if (textContent && 'text' in textContent) {
|
|
280
281
|
const text = String(textContent.text);
|
|
281
282
|
// Parse allowed directories from response
|
|
@@ -344,7 +345,7 @@ export class Interviewer {
|
|
|
344
345
|
try {
|
|
345
346
|
const parsed = JSON.parse(text);
|
|
346
347
|
if (Array.isArray(parsed)) {
|
|
347
|
-
return parsed.filter(d => typeof d === 'string' && d.startsWith('/'));
|
|
348
|
+
return parsed.filter((d) => typeof d === 'string' && d.startsWith('/'));
|
|
348
349
|
}
|
|
349
350
|
}
|
|
350
351
|
catch (error) {
|
|
@@ -443,7 +444,7 @@ export class Interviewer {
|
|
|
443
444
|
concurrency,
|
|
444
445
|
}, 'Running persona interviews in parallel');
|
|
445
446
|
// Create tasks for each persona
|
|
446
|
-
const personaTasks = this.personas.map(persona => async () => {
|
|
447
|
+
const personaTasks = this.personas.map((persona) => async () => {
|
|
447
448
|
progress.currentPersona = persona.name;
|
|
448
449
|
onProgress?.(progress);
|
|
449
450
|
const result = await this.interviewPersona(client, discovery, persona, toolCallMutex);
|
|
@@ -501,7 +502,10 @@ export class Interviewer {
|
|
|
501
502
|
if (statefulEnabled) {
|
|
502
503
|
this.logger.info({ toolCount: orderedTools.length }, 'Stateful testing enabled');
|
|
503
504
|
}
|
|
504
|
-
this.logger.info({
|
|
505
|
+
this.logger.info({
|
|
506
|
+
parallel: this.config.parallelTools && !statefulEnabled,
|
|
507
|
+
concurrency: effectiveConcurrency,
|
|
508
|
+
}, 'Using check mode tool testing');
|
|
505
509
|
const statefulRunner = statefulEnabled
|
|
506
510
|
? new StatefulTestRunner({ shareOutputs: statefulConfig?.shareOutputsBetweenTools ?? true })
|
|
507
511
|
: undefined;
|
|
@@ -516,13 +520,15 @@ export class Interviewer {
|
|
|
516
520
|
const toolData = toolInteractionsMap.get(profile.name);
|
|
517
521
|
if (toolData) {
|
|
518
522
|
toolData.interactions = profile.interactions;
|
|
519
|
-
toolData.findingsByPersona = [
|
|
523
|
+
toolData.findingsByPersona = [
|
|
524
|
+
{
|
|
520
525
|
personaId: 'check_mode',
|
|
521
526
|
personaName: 'Check Mode',
|
|
522
527
|
behavioralNotes: [],
|
|
523
528
|
limitations: [],
|
|
524
529
|
securityNotes: [],
|
|
525
|
-
}
|
|
530
|
+
},
|
|
531
|
+
];
|
|
526
532
|
}
|
|
527
533
|
}
|
|
528
534
|
// Update persona stats with aggregated counts
|
|
@@ -557,7 +563,7 @@ export class Interviewer {
|
|
|
557
563
|
const scenarioResults = await this.executeToolScenarios(client, tool.name, customScenarios);
|
|
558
564
|
allScenarioResults.push(...scenarioResults);
|
|
559
565
|
// Convert scenarios to interview questions for integration with profiling
|
|
560
|
-
questions = customScenarios.map(s => this.scenarioToQuestion(s));
|
|
566
|
+
questions = customScenarios.map((s) => this.scenarioToQuestion(s));
|
|
561
567
|
// If not custom-only mode, also generate LLM questions (skip in fast CI mode)
|
|
562
568
|
if (!this.config.customScenariosOnly && !this.config.checkMode) {
|
|
563
569
|
const llmQuestions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
|
|
@@ -568,7 +574,8 @@ export class Interviewer {
|
|
|
568
574
|
// No custom scenarios - generate questions
|
|
569
575
|
if (this.config.checkMode) {
|
|
570
576
|
// Fast CI mode: use fallback questions (no LLM call)
|
|
571
|
-
questions = orchestrator
|
|
577
|
+
questions = orchestrator
|
|
578
|
+
.getFallbackQuestions(tool, this.config.skipErrorTests)
|
|
572
579
|
.slice(0, this.config.maxQuestionsPerTool);
|
|
573
580
|
}
|
|
574
581
|
else {
|
|
@@ -589,8 +596,10 @@ export class Interviewer {
|
|
|
589
596
|
});
|
|
590
597
|
// If we have multiple failures, regenerate remaining questions with error context
|
|
591
598
|
// Skip in scenarios-only mode and fast CI mode
|
|
592
|
-
if (!this.config.customScenariosOnly &&
|
|
593
|
-
|
|
599
|
+
if (!this.config.customScenariosOnly &&
|
|
600
|
+
!this.config.checkMode &&
|
|
601
|
+
previousErrors.length >= 2 &&
|
|
602
|
+
personaInteractions.length < questions.length) {
|
|
594
603
|
const remaining = this.config.maxQuestionsPerTool - personaInteractions.length;
|
|
595
604
|
if (remaining > 0) {
|
|
596
605
|
this.logger.debug({ tool: tool.name, errors: previousErrors.length }, 'Regenerating questions after errors');
|
|
@@ -616,7 +625,7 @@ export class Interviewer {
|
|
|
616
625
|
};
|
|
617
626
|
}
|
|
618
627
|
else {
|
|
619
|
-
personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map(i => ({
|
|
628
|
+
personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map((i) => ({
|
|
620
629
|
question: i.question,
|
|
621
630
|
response: i.response,
|
|
622
631
|
error: i.error,
|
|
@@ -664,7 +673,9 @@ export class Interviewer {
|
|
|
664
673
|
progress.promptsCompleted = 0;
|
|
665
674
|
onProgress?.(progress);
|
|
666
675
|
// Only create orchestrator if NOT in check mode (requires LLM)
|
|
667
|
-
const primaryOrchestrator = this.isCheckMode()
|
|
676
|
+
const primaryOrchestrator = this.isCheckMode()
|
|
677
|
+
? null
|
|
678
|
+
: this.createOrchestrator(this.personas[0]);
|
|
668
679
|
for (const prompt of discovery.prompts) {
|
|
669
680
|
progress.currentTool = `prompt:${prompt.name}`;
|
|
670
681
|
onProgress?.(progress);
|
|
@@ -678,7 +689,7 @@ export class Interviewer {
|
|
|
678
689
|
const scenarioResults = await this.executePromptScenarios(client, prompt.name, customScenarios);
|
|
679
690
|
allScenarioResults.push(...scenarioResults);
|
|
680
691
|
// Convert scenarios to prompt questions for profiling
|
|
681
|
-
questions = customScenarios.map(s => ({
|
|
692
|
+
questions = customScenarios.map((s) => ({
|
|
682
693
|
description: s.description,
|
|
683
694
|
args: s.args,
|
|
684
695
|
}));
|
|
@@ -688,7 +699,9 @@ export class Interviewer {
|
|
|
688
699
|
questions = [...questions, ...llmQuestions];
|
|
689
700
|
}
|
|
690
701
|
}
|
|
691
|
-
else if (!this.config.customScenariosOnly &&
|
|
702
|
+
else if (!this.config.customScenariosOnly &&
|
|
703
|
+
!this.config.checkMode &&
|
|
704
|
+
primaryOrchestrator) {
|
|
692
705
|
// No custom scenarios - generate LLM questions as usual
|
|
693
706
|
questions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
|
|
694
707
|
}
|
|
@@ -740,7 +753,7 @@ export class Interviewer {
|
|
|
740
753
|
};
|
|
741
754
|
}
|
|
742
755
|
else {
|
|
743
|
-
profile = await primaryOrchestrator.synthesizePromptProfile(prompt, promptInteractions.map(i => ({
|
|
756
|
+
profile = await primaryOrchestrator.synthesizePromptProfile(prompt, promptInteractions.map((i) => ({
|
|
744
757
|
question: i.question,
|
|
745
758
|
response: i.response,
|
|
746
759
|
error: i.error,
|
|
@@ -768,7 +781,9 @@ export class Interviewer {
|
|
|
768
781
|
progress.resourcesCompleted = 0;
|
|
769
782
|
onProgress?.(progress);
|
|
770
783
|
// Only create orchestrator if NOT in check mode (requires LLM)
|
|
771
|
-
const primaryOrchestrator = this.isCheckMode()
|
|
784
|
+
const primaryOrchestrator = this.isCheckMode()
|
|
785
|
+
? null
|
|
786
|
+
: this.createOrchestrator(this.personas[0]);
|
|
772
787
|
for (const resource of discoveredResources) {
|
|
773
788
|
progress.currentTool = `resource:${resource.name}`;
|
|
774
789
|
onProgress?.(progress);
|
|
@@ -777,7 +792,9 @@ export class Interviewer {
|
|
|
777
792
|
let questions;
|
|
778
793
|
if (this.config.checkMode || !primaryOrchestrator) {
|
|
779
794
|
// Fast CI mode: use simple fallback question
|
|
780
|
-
questions = [
|
|
795
|
+
questions = [
|
|
796
|
+
{ description: 'Basic resource read test', category: 'happy_path' },
|
|
797
|
+
];
|
|
781
798
|
}
|
|
782
799
|
else {
|
|
783
800
|
questions = await primaryOrchestrator.generateResourceQuestions(resource, 2);
|
|
@@ -787,8 +804,9 @@ export class Interviewer {
|
|
|
787
804
|
let response = null;
|
|
788
805
|
let error = null;
|
|
789
806
|
try {
|
|
807
|
+
const abortController = new AbortController();
|
|
790
808
|
// Apply timeout to resource read to prevent indefinite hangs
|
|
791
|
-
response = await withTimeout(client.readResource(resource.uri), this.config.resourceTimeout ?? DEFAULT_TIMEOUTS.resourceRead, `Resource read: ${resource.uri}
|
|
809
|
+
response = await withTimeout(client.readResource(resource.uri, { signal: abortController.signal }), this.config.resourceTimeout ?? DEFAULT_TIMEOUTS.resourceRead, `Resource read: ${resource.uri}`, { abortController });
|
|
792
810
|
resourceReadCount++;
|
|
793
811
|
}
|
|
794
812
|
catch (e) {
|
|
@@ -829,7 +847,7 @@ export class Interviewer {
|
|
|
829
847
|
};
|
|
830
848
|
}
|
|
831
849
|
else {
|
|
832
|
-
profile = await primaryOrchestrator.synthesizeResourceProfile(resource, resourceInteractions.map(i => ({
|
|
850
|
+
profile = await primaryOrchestrator.synthesizeResourceProfile(resource, resourceInteractions.map((i) => ({
|
|
833
851
|
question: i.question,
|
|
834
852
|
response: i.response,
|
|
835
853
|
error: i.error,
|
|
@@ -838,13 +856,14 @@ export class Interviewer {
|
|
|
838
856
|
}
|
|
839
857
|
// Extract content preview from first successful read
|
|
840
858
|
let contentPreview;
|
|
841
|
-
const successfulRead = resourceInteractions.find(i => i.response && !i.error);
|
|
859
|
+
const successfulRead = resourceInteractions.find((i) => i.response && !i.error);
|
|
842
860
|
if (successfulRead?.response?.contents?.[0]) {
|
|
843
861
|
const content = successfulRead.response.contents[0];
|
|
844
862
|
if (content.text) {
|
|
845
|
-
contentPreview =
|
|
846
|
-
|
|
847
|
-
|
|
863
|
+
contentPreview =
|
|
864
|
+
content.text.length > DISPLAY_LIMITS.CONTENT_TEXT_PREVIEW
|
|
865
|
+
? `${content.text.substring(0, DISPLAY_LIMITS.CONTENT_TEXT_PREVIEW)}...`
|
|
866
|
+
: content.text;
|
|
848
867
|
}
|
|
849
868
|
else if (content.blob) {
|
|
850
869
|
contentPreview = `[Binary data: ${content.blob.length} bytes base64]`;
|
|
@@ -1058,7 +1077,7 @@ export class Interviewer {
|
|
|
1058
1077
|
if (response.isError) {
|
|
1059
1078
|
stats.errorCount++;
|
|
1060
1079
|
hadError = true;
|
|
1061
|
-
const errorContent = response.content?.find(c => c.type === 'text');
|
|
1080
|
+
const errorContent = response.content?.find((c) => c.type === 'text');
|
|
1062
1081
|
if (errorContent && 'text' in errorContent) {
|
|
1063
1082
|
error = String(errorContent.text);
|
|
1064
1083
|
}
|
|
@@ -1143,7 +1162,7 @@ export class Interviewer {
|
|
|
1143
1162
|
// Extract allowed directories explicitly mentioned
|
|
1144
1163
|
const allowedMatch = error.match(/allowed director(?:y|ies)[:\s]+([^\n]+)/i);
|
|
1145
1164
|
if (allowedMatch) {
|
|
1146
|
-
const dirs = allowedMatch[1].split(/[,\s]+/).filter(d => d.startsWith('/'));
|
|
1165
|
+
const dirs = allowedMatch[1].split(/[,\s]+/).filter((d) => d.startsWith('/'));
|
|
1147
1166
|
if (dirs.length > 0) {
|
|
1148
1167
|
const currentContext = orchestrator.getServerContext() ?? { allowedDirectories: [] };
|
|
1149
1168
|
const existingDirs = currentContext.allowedDirectories ?? [];
|
|
@@ -1197,7 +1216,7 @@ export class Interviewer {
|
|
|
1197
1216
|
toolCallMutex.release();
|
|
1198
1217
|
}
|
|
1199
1218
|
// Convert scenarios to interview questions
|
|
1200
|
-
questions = customScenarios.map(s => this.scenarioToQuestion(s));
|
|
1219
|
+
questions = customScenarios.map((s) => this.scenarioToQuestion(s));
|
|
1201
1220
|
// If not custom-only mode, also generate LLM questions
|
|
1202
1221
|
if (!this.config.customScenariosOnly) {
|
|
1203
1222
|
const llmQuestions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
|
|
@@ -1231,7 +1250,8 @@ export class Interviewer {
|
|
|
1231
1250
|
});
|
|
1232
1251
|
// If we have multiple failures, regenerate remaining questions
|
|
1233
1252
|
if (!this.config.customScenariosOnly &&
|
|
1234
|
-
previousErrors.length >= 2 &&
|
|
1253
|
+
previousErrors.length >= 2 &&
|
|
1254
|
+
personaInteractions.length < questions.length) {
|
|
1235
1255
|
const remaining = this.config.maxQuestionsPerTool - personaInteractions.length;
|
|
1236
1256
|
if (remaining > 0) {
|
|
1237
1257
|
this.logger.debug({ tool: tool.name, errors: previousErrors.length }, 'Regenerating questions after errors');
|
|
@@ -1253,7 +1273,7 @@ export class Interviewer {
|
|
|
1253
1273
|
};
|
|
1254
1274
|
}
|
|
1255
1275
|
else {
|
|
1256
|
-
personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map(i => ({
|
|
1276
|
+
personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map((i) => ({
|
|
1257
1277
|
question: i.question,
|
|
1258
1278
|
response: i.response,
|
|
1259
1279
|
error: i.error,
|
|
@@ -1361,20 +1381,19 @@ export class Interviewer {
|
|
|
1361
1381
|
const results = await this.executeToolScenarios(client, tool.name, customScenarios);
|
|
1362
1382
|
scenarioResults.push(...results);
|
|
1363
1383
|
toolCallCount += results.length;
|
|
1364
|
-
errorCount += results.filter(r => !r.passed).length;
|
|
1384
|
+
errorCount += results.filter((r) => !r.passed).length;
|
|
1365
1385
|
}
|
|
1366
1386
|
finally {
|
|
1367
1387
|
toolCallMutex.release();
|
|
1368
1388
|
}
|
|
1369
1389
|
// Convert scenarios to interview questions
|
|
1370
|
-
questions = customScenarios.map(s => this.scenarioToQuestion(s));
|
|
1390
|
+
questions = customScenarios.map((s) => this.scenarioToQuestion(s));
|
|
1371
1391
|
}
|
|
1372
1392
|
else {
|
|
1373
1393
|
// No custom scenarios - use fallback questions (check mode, no LLM)
|
|
1374
1394
|
// We need an orchestrator for fallback questions, but we won't use LLM
|
|
1375
1395
|
// Get fallback questions directly
|
|
1376
|
-
questions = this.getFallbackQuestionsForTool(tool, this.config.skipErrorTests)
|
|
1377
|
-
.slice(0, this.config.maxQuestionsPerTool);
|
|
1396
|
+
questions = this.getFallbackQuestionsForTool(tool, this.config.skipErrorTests).slice(0, this.config.maxQuestionsPerTool);
|
|
1378
1397
|
}
|
|
1379
1398
|
// Execute warmup runs if configured (helps reduce cold-start timing variance)
|
|
1380
1399
|
// Warmup runs are not recorded in interactions
|
|
@@ -1444,7 +1463,10 @@ export class Interviewer {
|
|
|
1444
1463
|
// Generate simple analysis (no LLM in check mode)
|
|
1445
1464
|
const analysis = this.generateSimpleAnalysis(error, !!response, 'Tool call succeeded.');
|
|
1446
1465
|
const outcomeAssessment = this.assessOutcome(resolvedQuestion, response, error);
|
|
1447
|
-
if (this.config.assertions?.enabled &&
|
|
1466
|
+
if (this.config.assertions?.enabled &&
|
|
1467
|
+
outcomeAssessment.expected === 'success' &&
|
|
1468
|
+
response &&
|
|
1469
|
+
!response.isError) {
|
|
1448
1470
|
let schema = this.responseSchemas.get(tool.name);
|
|
1449
1471
|
if (!schema && this.config.assertions?.infer) {
|
|
1450
1472
|
const inferred = inferResponseSchema(response);
|
|
@@ -1546,7 +1568,7 @@ export class Interviewer {
|
|
|
1546
1568
|
parallel: this.config.parallelTools,
|
|
1547
1569
|
}, 'Running check mode tool testing');
|
|
1548
1570
|
// Create tasks for each tool
|
|
1549
|
-
const toolTasks = tools.map(tool => async () => {
|
|
1571
|
+
const toolTasks = tools.map((tool) => async () => {
|
|
1550
1572
|
progress.currentTool = tool.name;
|
|
1551
1573
|
onProgress?.(progress);
|
|
1552
1574
|
const result = await this.interviewToolInCheckMode(client, tool, toolCallMutex, options?.statefulRunner, options?.dependencyMap?.get(tool.name), options?.statefulConfig);
|
|
@@ -1575,7 +1597,7 @@ export class Interviewer {
|
|
|
1575
1597
|
let totalErrorCount = 0;
|
|
1576
1598
|
let totalQuestionsAsked = 0;
|
|
1577
1599
|
for (const result of successfulResults) {
|
|
1578
|
-
const tool = tools.find(t => t.name === result.toolName);
|
|
1600
|
+
const tool = tools.find((t) => t.name === result.toolName);
|
|
1579
1601
|
if (!tool)
|
|
1580
1602
|
continue;
|
|
1581
1603
|
// Classify errors to separate tool correctness from environment issues
|
|
@@ -1618,7 +1640,7 @@ export class Interviewer {
|
|
|
1618
1640
|
};
|
|
1619
1641
|
}
|
|
1620
1642
|
buildToolProgressSummary(result) {
|
|
1621
|
-
const interactions = result.interactions.filter(i => !i.mocked);
|
|
1643
|
+
const interactions = result.interactions.filter((i) => !i.mocked);
|
|
1622
1644
|
const totalTests = interactions.length;
|
|
1623
1645
|
let passedTests = 0;
|
|
1624
1646
|
let validationTotal = 0;
|
|
@@ -1674,14 +1696,14 @@ export class Interviewer {
|
|
|
1674
1696
|
*/
|
|
1675
1697
|
getScenariosForTool(toolName) {
|
|
1676
1698
|
const scenarios = this.config.customScenarios?.toolScenarios ?? [];
|
|
1677
|
-
return scenarios.filter(s => s.tool === toolName && !s.skip);
|
|
1699
|
+
return scenarios.filter((s) => s.tool === toolName && !s.skip);
|
|
1678
1700
|
}
|
|
1679
1701
|
/**
|
|
1680
1702
|
* Get custom scenarios for a specific prompt.
|
|
1681
1703
|
*/
|
|
1682
1704
|
getScenariosForPrompt(promptName) {
|
|
1683
1705
|
const scenarios = this.config.customScenarios?.promptScenarios ?? [];
|
|
1684
|
-
return scenarios.filter(s => s.prompt === promptName && !s.skip);
|
|
1706
|
+
return scenarios.filter((s) => s.prompt === promptName && !s.skip);
|
|
1685
1707
|
}
|
|
1686
1708
|
/**
|
|
1687
1709
|
* Execute custom test scenarios for a tool.
|
|
@@ -1708,7 +1730,7 @@ export class Interviewer {
|
|
|
1708
1730
|
response = result.response;
|
|
1709
1731
|
isError = response?.isError ?? false;
|
|
1710
1732
|
if (isError) {
|
|
1711
|
-
const errorContent = response?.content?.find(c => c.type === 'text');
|
|
1733
|
+
const errorContent = response?.content?.find((c) => c.type === 'text');
|
|
1712
1734
|
if (errorContent && 'text' in errorContent) {
|
|
1713
1735
|
error = String(errorContent.text);
|
|
1714
1736
|
}
|
|
@@ -1728,7 +1750,7 @@ export class Interviewer {
|
|
|
1728
1750
|
? evaluateAssertions(scenario.assertions, response, isError)
|
|
1729
1751
|
: [];
|
|
1730
1752
|
// Scenario passes if no error (or expected error) and all assertions pass
|
|
1731
|
-
const allAssertionsPassed = assertionResults.every(r => r.passed);
|
|
1753
|
+
const allAssertionsPassed = assertionResults.every((r) => r.passed);
|
|
1732
1754
|
const passed = allAssertionsPassed && (!isError || scenario.category === 'error_handling');
|
|
1733
1755
|
const result = {
|
|
1734
1756
|
scenario,
|
|
@@ -1771,9 +1793,9 @@ export class Interviewer {
|
|
|
1771
1793
|
const assertionResults = scenario.assertions
|
|
1772
1794
|
? evaluateAssertions(scenario.assertions, response, !!error)
|
|
1773
1795
|
: [];
|
|
1774
|
-
const allAssertionsPassed = assertionResults.every(r => r.passed);
|
|
1796
|
+
const allAssertionsPassed = assertionResults.every((r) => r.passed);
|
|
1775
1797
|
// Check if this scenario expects an error (has an assertion checking for 'error' to exist)
|
|
1776
|
-
const expectsError = scenario.assertions?.some(a => a.path === 'error' && a.condition === 'exists') ?? false;
|
|
1798
|
+
const expectsError = scenario.assertions?.some((a) => a.path === 'error' && a.condition === 'exists') ?? false;
|
|
1777
1799
|
// Scenario passes if assertions pass AND (no error OR scenario expects error)
|
|
1778
1800
|
const passed = allAssertionsPassed && (!error || expectsError);
|
|
1779
1801
|
const result = {
|
|
@@ -1824,7 +1846,7 @@ export class Interviewer {
|
|
|
1824
1846
|
discoveredCount = discovered.length;
|
|
1825
1847
|
this.logger.info({
|
|
1826
1848
|
count: discoveredCount,
|
|
1827
|
-
workflows: discovered.map(w => w.name),
|
|
1849
|
+
workflows: discovered.map((w) => w.name),
|
|
1828
1850
|
}, 'Discovered workflows');
|
|
1829
1851
|
}
|
|
1830
1852
|
else {
|
|
@@ -1904,7 +1926,7 @@ export class Interviewer {
|
|
|
1904
1926
|
}
|
|
1905
1927
|
}
|
|
1906
1928
|
// Build summary
|
|
1907
|
-
const successfulCount = results.filter(r => r.success).length;
|
|
1929
|
+
const successfulCount = results.filter((r) => r.success).length;
|
|
1908
1930
|
const summary = {
|
|
1909
1931
|
workflowCount: results.length,
|
|
1910
1932
|
successfulCount,
|
|
@@ -1923,9 +1945,7 @@ export class Interviewer {
|
|
|
1923
1945
|
}
|
|
1924
1946
|
}
|
|
1925
1947
|
function summarizeAssertions(interactions) {
|
|
1926
|
-
const allResults = interactions
|
|
1927
|
-
.filter((i) => !i.mocked)
|
|
1928
|
-
.flatMap((i) => i.assertionResults ?? []);
|
|
1948
|
+
const allResults = interactions.filter((i) => !i.mocked).flatMap((i) => i.assertionResults ?? []);
|
|
1929
1949
|
if (allResults.length === 0)
|
|
1930
1950
|
return undefined;
|
|
1931
1951
|
const passed = allResults.filter((r) => r.passed).length;
|
|
@@ -44,7 +44,11 @@ function categorizeLLMError(error) {
|
|
|
44
44
|
if (message.includes('empty or whitespace') ||
|
|
45
45
|
message.includes('token exhaustion') ||
|
|
46
46
|
message.includes('unexpected end of json')) {
|
|
47
|
-
return {
|
|
47
|
+
return {
|
|
48
|
+
category: 'format_error',
|
|
49
|
+
isRetryable: true,
|
|
50
|
+
message: 'LLM returned empty response (possible token exhaustion)',
|
|
51
|
+
};
|
|
48
52
|
}
|
|
49
53
|
// Check for format errors (LLM returned wrong format) - retryable once
|
|
50
54
|
if (message.includes('invalid question format') ||
|
|
@@ -53,7 +57,11 @@ function categorizeLLMError(error) {
|
|
|
53
57
|
message.includes('not valid json')) {
|
|
54
58
|
return { category: 'format_error', isRetryable: true, message: 'LLM returned invalid format' };
|
|
55
59
|
}
|
|
56
|
-
return {
|
|
60
|
+
return {
|
|
61
|
+
category: 'unknown',
|
|
62
|
+
isRetryable: false,
|
|
63
|
+
message: error instanceof Error ? error.message : String(error),
|
|
64
|
+
};
|
|
57
65
|
}
|
|
58
66
|
/**
|
|
59
67
|
* Orchestrator uses an LLM to generate interview questions and synthesize findings.
|
|
@@ -248,11 +256,13 @@ export class Orchestrator {
|
|
|
248
256
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
249
257
|
let rawResponse;
|
|
250
258
|
try {
|
|
259
|
+
const abortController = new AbortController();
|
|
251
260
|
// Apply timeout to LLM call - use streaming if enabled
|
|
252
261
|
const response = await withTimeout(this.completeWithStreaming(prompt, {
|
|
253
262
|
...COMPLETION_OPTIONS.questionGeneration,
|
|
254
263
|
systemPrompt: this.getSystemPrompt(),
|
|
255
|
-
|
|
264
|
+
signal: abortController.signal,
|
|
265
|
+
}, `generate-questions:${tool.name}`), DEFAULT_TIMEOUTS.questionGeneration, `Question generation for ${tool.name}`, { abortController });
|
|
256
266
|
rawResponse = response;
|
|
257
267
|
// Check for empty/whitespace-only responses (common with token exhaustion)
|
|
258
268
|
const trimmed = response.trim();
|
|
@@ -304,7 +314,7 @@ export class Orchestrator {
|
|
|
304
314
|
// Wait before retry with exponential backoff
|
|
305
315
|
if (attempt < maxRetries) {
|
|
306
316
|
const delay = Math.min(RETRY.INITIAL_DELAY * Math.pow(2, attempt), RETRY.MAX_DELAY);
|
|
307
|
-
await new Promise(resolve => setTimeout(resolve, delay));
|
|
317
|
+
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
308
318
|
}
|
|
309
319
|
}
|
|
310
320
|
}
|
|
@@ -361,7 +371,7 @@ export class Orchestrator {
|
|
|
361
371
|
return `Tool returned an error: ${error}`;
|
|
362
372
|
}
|
|
363
373
|
if (response?.content) {
|
|
364
|
-
const textContent = response.content.find(c => c.type === 'text');
|
|
374
|
+
const textContent = response.content.find((c) => c.type === 'text');
|
|
365
375
|
if (textContent && 'text' in textContent) {
|
|
366
376
|
return `Tool returned: ${String(textContent.text).substring(0, DISPLAY_LIMITS.TOOL_RESPONSE_PREVIEW)}`;
|
|
367
377
|
}
|
|
@@ -397,7 +407,7 @@ export class Orchestrator {
|
|
|
397
407
|
return {
|
|
398
408
|
name: tool.name,
|
|
399
409
|
description: tool.description ?? 'No description provided',
|
|
400
|
-
behavioralNotes: interactions.map(i => i.analysis).filter(a => a),
|
|
410
|
+
behavioralNotes: interactions.map((i) => i.analysis).filter((a) => a),
|
|
401
411
|
limitations: [],
|
|
402
412
|
securityNotes: [],
|
|
403
413
|
};
|
|
@@ -694,7 +704,9 @@ export class Orchestrator {
|
|
|
694
704
|
// Check name-based hints
|
|
695
705
|
if (lowerName.includes('path') || lowerName.includes('file')) {
|
|
696
706
|
const baseDir = this.serverContext?.allowedDirectories?.[0] ?? '/tmp';
|
|
697
|
-
if (lowerName.includes('dir') ||
|
|
707
|
+
if (lowerName.includes('dir') ||
|
|
708
|
+
lowerName.includes('directory') ||
|
|
709
|
+
lowerName.includes('folder')) {
|
|
698
710
|
return baseDir;
|
|
699
711
|
}
|
|
700
712
|
return `${baseDir}/test.txt`;
|
|
@@ -717,12 +729,16 @@ export class Orchestrator {
|
|
|
717
729
|
}
|
|
718
730
|
return 'test-name';
|
|
719
731
|
}
|
|
720
|
-
if (lowerName.includes('query') ||
|
|
732
|
+
if (lowerName.includes('query') ||
|
|
733
|
+
lowerName.includes('search') ||
|
|
734
|
+
lowerName.includes('filter')) {
|
|
721
735
|
// Use a more realistic search term based on description
|
|
722
736
|
if (description.includes('movie') || description.includes('film')) {
|
|
723
737
|
return 'The Matrix';
|
|
724
738
|
}
|
|
725
|
-
if (description.includes('music') ||
|
|
739
|
+
if (description.includes('music') ||
|
|
740
|
+
description.includes('song') ||
|
|
741
|
+
description.includes('artist')) {
|
|
726
742
|
return 'Beatles';
|
|
727
743
|
}
|
|
728
744
|
if (description.includes('book') || description.includes('author')) {
|
|
@@ -733,10 +749,14 @@ export class Orchestrator {
|
|
|
733
749
|
if (lowerName.includes('title')) {
|
|
734
750
|
return 'Test Title';
|
|
735
751
|
}
|
|
736
|
-
if (lowerName.includes('description') ||
|
|
752
|
+
if (lowerName.includes('description') ||
|
|
753
|
+
lowerName.includes('summary') ||
|
|
754
|
+
lowerName.includes('text')) {
|
|
737
755
|
return 'This is a test description for validation purposes.';
|
|
738
756
|
}
|
|
739
|
-
if (lowerName.includes('content') ||
|
|
757
|
+
if (lowerName.includes('content') ||
|
|
758
|
+
lowerName.includes('body') ||
|
|
759
|
+
lowerName.includes('message')) {
|
|
740
760
|
return 'Test content for the operation.';
|
|
741
761
|
}
|
|
742
762
|
if (lowerName.includes('comment')) {
|
|
@@ -745,7 +765,9 @@ export class Orchestrator {
|
|
|
745
765
|
if (lowerName.includes('code') || lowerName.includes('snippet')) {
|
|
746
766
|
return 'function example() { return "Hello"; }';
|
|
747
767
|
}
|
|
748
|
-
if (lowerName.includes('pattern') ||
|
|
768
|
+
if (lowerName.includes('pattern') ||
|
|
769
|
+
lowerName.includes('glob') ||
|
|
770
|
+
lowerName.includes('regex')) {
|
|
749
771
|
return '*.txt';
|
|
750
772
|
}
|
|
751
773
|
if (lowerName.includes('format') || lowerName.includes('type')) {
|
|
@@ -830,13 +852,17 @@ export class Orchestrator {
|
|
|
830
852
|
if (lowerName.includes('count') || lowerName.includes('limit') || lowerName.includes('num')) {
|
|
831
853
|
return 10;
|
|
832
854
|
}
|
|
833
|
-
if (lowerName.includes('enabled') ||
|
|
855
|
+
if (lowerName.includes('enabled') ||
|
|
856
|
+
lowerName.includes('active') ||
|
|
857
|
+
lowerName.includes('flag')) {
|
|
834
858
|
return true;
|
|
835
859
|
}
|
|
836
860
|
if (lowerName.includes('list') || lowerName.includes('items') || lowerName.includes('array')) {
|
|
837
861
|
return [];
|
|
838
862
|
}
|
|
839
|
-
if (lowerName.includes('config') ||
|
|
863
|
+
if (lowerName.includes('config') ||
|
|
864
|
+
lowerName.includes('options') ||
|
|
865
|
+
lowerName.includes('settings')) {
|
|
840
866
|
return {};
|
|
841
867
|
}
|
|
842
868
|
return 'test';
|
|
@@ -919,8 +945,7 @@ export class Orchestrator {
|
|
|
919
945
|
if (!schema?.properties)
|
|
920
946
|
return tests;
|
|
921
947
|
const required = new Set(schema.required ?? []);
|
|
922
|
-
const optionalParams = Object.entries(schema.properties)
|
|
923
|
-
.filter(([name]) => !required.has(name));
|
|
948
|
+
const optionalParams = Object.entries(schema.properties).filter(([name]) => !required.has(name));
|
|
924
949
|
if (optionalParams.length === 0)
|
|
925
950
|
return tests;
|
|
926
951
|
const allArgs = {};
|
|
@@ -1053,7 +1078,7 @@ export class Orchestrator {
|
|
|
1053
1078
|
const result = this.llm.parseJSON(response);
|
|
1054
1079
|
// Extract example output from first successful interaction
|
|
1055
1080
|
let exampleOutput;
|
|
1056
|
-
const successful = interactions.find(i => !i.error && i.response?.messages?.length);
|
|
1081
|
+
const successful = interactions.find((i) => !i.error && i.response?.messages?.length);
|
|
1057
1082
|
if (successful?.response) {
|
|
1058
1083
|
const firstMsg = successful.response.messages[0];
|
|
1059
1084
|
if (firstMsg?.content?.type === 'text' && firstMsg.content.text) {
|
|
@@ -1078,7 +1103,7 @@ export class Orchestrator {
|
|
|
1078
1103
|
name: prompt.name,
|
|
1079
1104
|
description: prompt.description ?? 'No description provided',
|
|
1080
1105
|
arguments: prompt.arguments ?? [],
|
|
1081
|
-
behavioralNotes: interactions.map(i => i.analysis).filter(a => a),
|
|
1106
|
+
behavioralNotes: interactions.map((i) => i.analysis).filter((a) => a),
|
|
1082
1107
|
limitations: [],
|
|
1083
1108
|
};
|
|
1084
1109
|
}
|
|
@@ -1102,7 +1127,7 @@ export class Orchestrator {
|
|
|
1102
1127
|
args,
|
|
1103
1128
|
});
|
|
1104
1129
|
// If there are optional args, add a test with all args
|
|
1105
|
-
const optionalArgs = prompt.arguments?.filter(a => !a.required) ?? [];
|
|
1130
|
+
const optionalArgs = prompt.arguments?.filter((a) => !a.required) ?? [];
|
|
1106
1131
|
if (optionalArgs.length > 0) {
|
|
1107
1132
|
const allArgs = { ...args };
|
|
1108
1133
|
for (const arg of optionalArgs) {
|
|
@@ -1210,10 +1235,12 @@ Description: ${resource.description ?? 'No description'}
|
|
|
1210
1235
|
MIME Type: ${resource.mimeType ?? 'Not specified'}
|
|
1211
1236
|
|
|
1212
1237
|
Test interactions:
|
|
1213
|
-
${interactions
|
|
1238
|
+
${interactions
|
|
1239
|
+
.map((i, idx) => `
|
|
1214
1240
|
${idx + 1}. ${i.question.description}
|
|
1215
1241
|
${i.error ? `Error: ${i.error}` : `Analysis: ${i.analysis}`}
|
|
1216
|
-
`)
|
|
1242
|
+
`)
|
|
1243
|
+
.join('')}
|
|
1217
1244
|
|
|
1218
1245
|
Generate a JSON object with:
|
|
1219
1246
|
{
|
|
@@ -1247,7 +1274,7 @@ Return ONLY valid JSON, no explanation.`;
|
|
|
1247
1274
|
name: resource.name,
|
|
1248
1275
|
description: resource.description ?? 'No description provided',
|
|
1249
1276
|
mimeType: resource.mimeType,
|
|
1250
|
-
behavioralNotes: interactions.map(i => i.analysis).filter(a => a),
|
|
1277
|
+
behavioralNotes: interactions.map((i) => i.analysis).filter((a) => a),
|
|
1251
1278
|
limitations: [],
|
|
1252
1279
|
};
|
|
1253
1280
|
}
|