@dotsetlabs/bellwether 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +291 -0
- package/LICENSE +21 -0
- package/README.md +739 -0
- package/dist/auth/credentials.d.ts +64 -0
- package/dist/auth/credentials.js +218 -0
- package/dist/auth/index.d.ts +6 -0
- package/dist/auth/index.js +6 -0
- package/dist/auth/keychain.d.ts +64 -0
- package/dist/auth/keychain.js +268 -0
- package/dist/baseline/ab-testing.d.ts +80 -0
- package/dist/baseline/ab-testing.js +236 -0
- package/dist/baseline/ai-compatibility-scorer.d.ts +95 -0
- package/dist/baseline/ai-compatibility-scorer.js +606 -0
- package/dist/baseline/calibration.d.ts +77 -0
- package/dist/baseline/calibration.js +136 -0
- package/dist/baseline/category-matching.d.ts +85 -0
- package/dist/baseline/category-matching.js +289 -0
- package/dist/baseline/change-impact-analyzer.d.ts +98 -0
- package/dist/baseline/change-impact-analyzer.js +592 -0
- package/dist/baseline/comparator.d.ts +64 -0
- package/dist/baseline/comparator.js +916 -0
- package/dist/baseline/confidence.d.ts +55 -0
- package/dist/baseline/confidence.js +122 -0
- package/dist/baseline/converter.d.ts +61 -0
- package/dist/baseline/converter.js +585 -0
- package/dist/baseline/dependency-analyzer.d.ts +89 -0
- package/dist/baseline/dependency-analyzer.js +567 -0
- package/dist/baseline/deprecation-tracker.d.ts +133 -0
- package/dist/baseline/deprecation-tracker.js +322 -0
- package/dist/baseline/diff.d.ts +55 -0
- package/dist/baseline/diff.js +1584 -0
- package/dist/baseline/documentation-scorer.d.ts +205 -0
- package/dist/baseline/documentation-scorer.js +466 -0
- package/dist/baseline/embeddings.d.ts +118 -0
- package/dist/baseline/embeddings.js +251 -0
- package/dist/baseline/error-analyzer.d.ts +198 -0
- package/dist/baseline/error-analyzer.js +721 -0
- package/dist/baseline/evaluation/evaluator.d.ts +42 -0
- package/dist/baseline/evaluation/evaluator.js +323 -0
- package/dist/baseline/evaluation/expanded-dataset.d.ts +45 -0
- package/dist/baseline/evaluation/expanded-dataset.js +1164 -0
- package/dist/baseline/evaluation/golden-dataset.d.ts +58 -0
- package/dist/baseline/evaluation/golden-dataset.js +717 -0
- package/dist/baseline/evaluation/index.d.ts +15 -0
- package/dist/baseline/evaluation/index.js +15 -0
- package/dist/baseline/evaluation/types.d.ts +186 -0
- package/dist/baseline/evaluation/types.js +8 -0
- package/dist/baseline/external-dependency-detector.d.ts +181 -0
- package/dist/baseline/external-dependency-detector.js +524 -0
- package/dist/baseline/golden-output.d.ts +162 -0
- package/dist/baseline/golden-output.js +636 -0
- package/dist/baseline/health-scorer.d.ts +174 -0
- package/dist/baseline/health-scorer.js +451 -0
- package/dist/baseline/incremental-checker.d.ts +97 -0
- package/dist/baseline/incremental-checker.js +174 -0
- package/dist/baseline/index.d.ts +31 -0
- package/dist/baseline/index.js +42 -0
- package/dist/baseline/migration-generator.d.ts +137 -0
- package/dist/baseline/migration-generator.js +554 -0
- package/dist/baseline/migrations.d.ts +60 -0
- package/dist/baseline/migrations.js +197 -0
- package/dist/baseline/performance-tracker.d.ts +214 -0
- package/dist/baseline/performance-tracker.js +577 -0
- package/dist/baseline/pr-comment-generator.d.ts +117 -0
- package/dist/baseline/pr-comment-generator.js +546 -0
- package/dist/baseline/response-fingerprint.d.ts +127 -0
- package/dist/baseline/response-fingerprint.js +728 -0
- package/dist/baseline/response-schema-tracker.d.ts +129 -0
- package/dist/baseline/response-schema-tracker.js +420 -0
- package/dist/baseline/risk-scorer.d.ts +54 -0
- package/dist/baseline/risk-scorer.js +434 -0
- package/dist/baseline/saver.d.ts +89 -0
- package/dist/baseline/saver.js +554 -0
- package/dist/baseline/scenario-generator.d.ts +151 -0
- package/dist/baseline/scenario-generator.js +905 -0
- package/dist/baseline/schema-compare.d.ts +86 -0
- package/dist/baseline/schema-compare.js +557 -0
- package/dist/baseline/schema-evolution.d.ts +189 -0
- package/dist/baseline/schema-evolution.js +467 -0
- package/dist/baseline/semantic.d.ts +203 -0
- package/dist/baseline/semantic.js +908 -0
- package/dist/baseline/synonyms.d.ts +60 -0
- package/dist/baseline/synonyms.js +386 -0
- package/dist/baseline/telemetry.d.ts +165 -0
- package/dist/baseline/telemetry.js +294 -0
- package/dist/baseline/test-pruner.d.ts +120 -0
- package/dist/baseline/test-pruner.js +387 -0
- package/dist/baseline/types.d.ts +449 -0
- package/dist/baseline/types.js +5 -0
- package/dist/baseline/version.d.ts +138 -0
- package/dist/baseline/version.js +206 -0
- package/dist/cache/index.d.ts +5 -0
- package/dist/cache/index.js +5 -0
- package/dist/cache/response-cache.d.ts +151 -0
- package/dist/cache/response-cache.js +287 -0
- package/dist/ci/index.d.ts +60 -0
- package/dist/ci/index.js +342 -0
- package/dist/cli/commands/auth.d.ts +12 -0
- package/dist/cli/commands/auth.js +352 -0
- package/dist/cli/commands/badge.d.ts +3 -0
- package/dist/cli/commands/badge.js +74 -0
- package/dist/cli/commands/baseline-accept.d.ts +15 -0
- package/dist/cli/commands/baseline-accept.js +178 -0
- package/dist/cli/commands/baseline-migrate.d.ts +12 -0
- package/dist/cli/commands/baseline-migrate.js +164 -0
- package/dist/cli/commands/baseline.d.ts +14 -0
- package/dist/cli/commands/baseline.js +449 -0
- package/dist/cli/commands/beta.d.ts +10 -0
- package/dist/cli/commands/beta.js +231 -0
- package/dist/cli/commands/check.d.ts +11 -0
- package/dist/cli/commands/check.js +820 -0
- package/dist/cli/commands/cloud/badge.d.ts +3 -0
- package/dist/cli/commands/cloud/badge.js +74 -0
- package/dist/cli/commands/cloud/diff.d.ts +6 -0
- package/dist/cli/commands/cloud/diff.js +79 -0
- package/dist/cli/commands/cloud/history.d.ts +6 -0
- package/dist/cli/commands/cloud/history.js +102 -0
- package/dist/cli/commands/cloud/link.d.ts +9 -0
- package/dist/cli/commands/cloud/link.js +119 -0
- package/dist/cli/commands/cloud/login.d.ts +7 -0
- package/dist/cli/commands/cloud/login.js +499 -0
- package/dist/cli/commands/cloud/projects.d.ts +6 -0
- package/dist/cli/commands/cloud/projects.js +44 -0
- package/dist/cli/commands/cloud/shared.d.ts +7 -0
- package/dist/cli/commands/cloud/shared.js +42 -0
- package/dist/cli/commands/cloud/teams.d.ts +8 -0
- package/dist/cli/commands/cloud/teams.js +169 -0
- package/dist/cli/commands/cloud/upload.d.ts +8 -0
- package/dist/cli/commands/cloud/upload.js +181 -0
- package/dist/cli/commands/contract.d.ts +11 -0
- package/dist/cli/commands/contract.js +280 -0
- package/dist/cli/commands/discover.d.ts +3 -0
- package/dist/cli/commands/discover.js +82 -0
- package/dist/cli/commands/eval.d.ts +9 -0
- package/dist/cli/commands/eval.js +187 -0
- package/dist/cli/commands/explore.d.ts +11 -0
- package/dist/cli/commands/explore.js +437 -0
- package/dist/cli/commands/feedback.d.ts +9 -0
- package/dist/cli/commands/feedback.js +174 -0
- package/dist/cli/commands/golden.d.ts +12 -0
- package/dist/cli/commands/golden.js +407 -0
- package/dist/cli/commands/history.d.ts +10 -0
- package/dist/cli/commands/history.js +202 -0
- package/dist/cli/commands/init.d.ts +9 -0
- package/dist/cli/commands/init.js +219 -0
- package/dist/cli/commands/interview.d.ts +3 -0
- package/dist/cli/commands/interview.js +903 -0
- package/dist/cli/commands/link.d.ts +10 -0
- package/dist/cli/commands/link.js +169 -0
- package/dist/cli/commands/login.d.ts +7 -0
- package/dist/cli/commands/login.js +499 -0
- package/dist/cli/commands/preset.d.ts +33 -0
- package/dist/cli/commands/preset.js +297 -0
- package/dist/cli/commands/profile.d.ts +33 -0
- package/dist/cli/commands/profile.js +286 -0
- package/dist/cli/commands/registry.d.ts +11 -0
- package/dist/cli/commands/registry.js +146 -0
- package/dist/cli/commands/shared.d.ts +79 -0
- package/dist/cli/commands/shared.js +196 -0
- package/dist/cli/commands/teams.d.ts +8 -0
- package/dist/cli/commands/teams.js +169 -0
- package/dist/cli/commands/test.d.ts +9 -0
- package/dist/cli/commands/test.js +500 -0
- package/dist/cli/commands/upload.d.ts +8 -0
- package/dist/cli/commands/upload.js +223 -0
- package/dist/cli/commands/validate-config.d.ts +6 -0
- package/dist/cli/commands/validate-config.js +35 -0
- package/dist/cli/commands/verify.d.ts +11 -0
- package/dist/cli/commands/verify.js +283 -0
- package/dist/cli/commands/watch.d.ts +12 -0
- package/dist/cli/commands/watch.js +253 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.js +178 -0
- package/dist/cli/interactive.d.ts +47 -0
- package/dist/cli/interactive.js +216 -0
- package/dist/cli/output/terminal-reporter.d.ts +19 -0
- package/dist/cli/output/terminal-reporter.js +104 -0
- package/dist/cli/output.d.ts +226 -0
- package/dist/cli/output.js +438 -0
- package/dist/cli/utils/env.d.ts +5 -0
- package/dist/cli/utils/env.js +14 -0
- package/dist/cli/utils/progress.d.ts +59 -0
- package/dist/cli/utils/progress.js +206 -0
- package/dist/cli/utils/server-context.d.ts +10 -0
- package/dist/cli/utils/server-context.js +36 -0
- package/dist/cloud/auth.d.ts +144 -0
- package/dist/cloud/auth.js +374 -0
- package/dist/cloud/client.d.ts +24 -0
- package/dist/cloud/client.js +65 -0
- package/dist/cloud/http-client.d.ts +38 -0
- package/dist/cloud/http-client.js +215 -0
- package/dist/cloud/index.d.ts +23 -0
- package/dist/cloud/index.js +25 -0
- package/dist/cloud/mock-client.d.ts +107 -0
- package/dist/cloud/mock-client.js +545 -0
- package/dist/cloud/types.d.ts +515 -0
- package/dist/cloud/types.js +15 -0
- package/dist/config/defaults.d.ts +160 -0
- package/dist/config/defaults.js +169 -0
- package/dist/config/loader.d.ts +24 -0
- package/dist/config/loader.js +122 -0
- package/dist/config/template.d.ts +42 -0
- package/dist/config/template.js +647 -0
- package/dist/config/validator.d.ts +2112 -0
- package/dist/config/validator.js +658 -0
- package/dist/constants/cloud.d.ts +107 -0
- package/dist/constants/cloud.js +110 -0
- package/dist/constants/core.d.ts +521 -0
- package/dist/constants/core.js +556 -0
- package/dist/constants/testing.d.ts +1283 -0
- package/dist/constants/testing.js +1568 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.js +10 -0
- package/dist/contract/index.d.ts +6 -0
- package/dist/contract/index.js +5 -0
- package/dist/contract/validator.d.ts +177 -0
- package/dist/contract/validator.js +574 -0
- package/dist/cost/index.d.ts +6 -0
- package/dist/cost/index.js +5 -0
- package/dist/cost/tracker.d.ts +134 -0
- package/dist/cost/tracker.js +313 -0
- package/dist/discovery/discovery.d.ts +16 -0
- package/dist/discovery/discovery.js +173 -0
- package/dist/discovery/types.d.ts +51 -0
- package/dist/discovery/types.js +2 -0
- package/dist/docs/agents.d.ts +3 -0
- package/dist/docs/agents.js +995 -0
- package/dist/docs/contract.d.ts +51 -0
- package/dist/docs/contract.js +1681 -0
- package/dist/docs/generator.d.ts +4 -0
- package/dist/docs/generator.js +4 -0
- package/dist/docs/html-reporter.d.ts +9 -0
- package/dist/docs/html-reporter.js +757 -0
- package/dist/docs/index.d.ts +10 -0
- package/dist/docs/index.js +11 -0
- package/dist/docs/junit-reporter.d.ts +18 -0
- package/dist/docs/junit-reporter.js +210 -0
- package/dist/docs/report.d.ts +14 -0
- package/dist/docs/report.js +44 -0
- package/dist/docs/sarif-reporter.d.ts +19 -0
- package/dist/docs/sarif-reporter.js +335 -0
- package/dist/docs/shared.d.ts +35 -0
- package/dist/docs/shared.js +162 -0
- package/dist/docs/templates.d.ts +12 -0
- package/dist/docs/templates.js +76 -0
- package/dist/errors/index.d.ts +6 -0
- package/dist/errors/index.js +6 -0
- package/dist/errors/retry.d.ts +92 -0
- package/dist/errors/retry.js +323 -0
- package/dist/errors/types.d.ts +321 -0
- package/dist/errors/types.js +584 -0
- package/dist/index.d.ts +32 -0
- package/dist/index.js +32 -0
- package/dist/interview/dependency-resolver.d.ts +11 -0
- package/dist/interview/dependency-resolver.js +32 -0
- package/dist/interview/interviewer.d.ts +232 -0
- package/dist/interview/interviewer.js +1939 -0
- package/dist/interview/mock-response-generator.d.ts +7 -0
- package/dist/interview/mock-response-generator.js +102 -0
- package/dist/interview/orchestrator.d.ts +237 -0
- package/dist/interview/orchestrator.js +1296 -0
- package/dist/interview/rate-limiter.d.ts +15 -0
- package/dist/interview/rate-limiter.js +55 -0
- package/dist/interview/response-validator.d.ts +10 -0
- package/dist/interview/response-validator.js +132 -0
- package/dist/interview/schema-inferrer.d.ts +8 -0
- package/dist/interview/schema-inferrer.js +71 -0
- package/dist/interview/schema-test-generator.d.ts +71 -0
- package/dist/interview/schema-test-generator.js +834 -0
- package/dist/interview/smart-value-generator.d.ts +155 -0
- package/dist/interview/smart-value-generator.js +554 -0
- package/dist/interview/stateful-test-runner.d.ts +19 -0
- package/dist/interview/stateful-test-runner.js +106 -0
- package/dist/interview/types.d.ts +561 -0
- package/dist/interview/types.js +2 -0
- package/dist/llm/anthropic.d.ts +41 -0
- package/dist/llm/anthropic.js +355 -0
- package/dist/llm/client.d.ts +123 -0
- package/dist/llm/client.js +42 -0
- package/dist/llm/factory.d.ts +38 -0
- package/dist/llm/factory.js +145 -0
- package/dist/llm/fallback.d.ts +140 -0
- package/dist/llm/fallback.js +379 -0
- package/dist/llm/index.d.ts +18 -0
- package/dist/llm/index.js +15 -0
- package/dist/llm/ollama.d.ts +37 -0
- package/dist/llm/ollama.js +330 -0
- package/dist/llm/openai.d.ts +25 -0
- package/dist/llm/openai.js +320 -0
- package/dist/llm/token-budget.d.ts +161 -0
- package/dist/llm/token-budget.js +395 -0
- package/dist/logging/logger.d.ts +70 -0
- package/dist/logging/logger.js +130 -0
- package/dist/metrics/collector.d.ts +106 -0
- package/dist/metrics/collector.js +547 -0
- package/dist/metrics/index.d.ts +7 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/prometheus.d.ts +20 -0
- package/dist/metrics/prometheus.js +241 -0
- package/dist/metrics/types.d.ts +209 -0
- package/dist/metrics/types.js +5 -0
- package/dist/persona/builtins.d.ts +54 -0
- package/dist/persona/builtins.js +219 -0
- package/dist/persona/index.d.ts +8 -0
- package/dist/persona/index.js +8 -0
- package/dist/persona/loader.d.ts +30 -0
- package/dist/persona/loader.js +190 -0
- package/dist/persona/types.d.ts +144 -0
- package/dist/persona/types.js +5 -0
- package/dist/persona/validation.d.ts +94 -0
- package/dist/persona/validation.js +332 -0
- package/dist/prompts/index.d.ts +5 -0
- package/dist/prompts/index.js +5 -0
- package/dist/prompts/templates.d.ts +180 -0
- package/dist/prompts/templates.js +431 -0
- package/dist/registry/client.d.ts +49 -0
- package/dist/registry/client.js +191 -0
- package/dist/registry/index.d.ts +7 -0
- package/dist/registry/index.js +6 -0
- package/dist/registry/types.d.ts +140 -0
- package/dist/registry/types.js +6 -0
- package/dist/scenarios/evaluator.d.ts +43 -0
- package/dist/scenarios/evaluator.js +206 -0
- package/dist/scenarios/index.d.ts +10 -0
- package/dist/scenarios/index.js +9 -0
- package/dist/scenarios/loader.d.ts +20 -0
- package/dist/scenarios/loader.js +285 -0
- package/dist/scenarios/types.d.ts +153 -0
- package/dist/scenarios/types.js +8 -0
- package/dist/security/index.d.ts +17 -0
- package/dist/security/index.js +18 -0
- package/dist/security/payloads.d.ts +61 -0
- package/dist/security/payloads.js +268 -0
- package/dist/security/security-tester.d.ts +42 -0
- package/dist/security/security-tester.js +582 -0
- package/dist/security/types.d.ts +166 -0
- package/dist/security/types.js +8 -0
- package/dist/transport/base-transport.d.ts +59 -0
- package/dist/transport/base-transport.js +38 -0
- package/dist/transport/http-transport.d.ts +67 -0
- package/dist/transport/http-transport.js +238 -0
- package/dist/transport/mcp-client.d.ts +141 -0
- package/dist/transport/mcp-client.js +496 -0
- package/dist/transport/sse-transport.d.ts +88 -0
- package/dist/transport/sse-transport.js +316 -0
- package/dist/transport/stdio-transport.d.ts +43 -0
- package/dist/transport/stdio-transport.js +238 -0
- package/dist/transport/types.d.ts +125 -0
- package/dist/transport/types.js +16 -0
- package/dist/utils/concurrency.d.ts +123 -0
- package/dist/utils/concurrency.js +213 -0
- package/dist/utils/formatters.d.ts +16 -0
- package/dist/utils/formatters.js +37 -0
- package/dist/utils/index.d.ts +8 -0
- package/dist/utils/index.js +8 -0
- package/dist/utils/jsonpath.d.ts +87 -0
- package/dist/utils/jsonpath.js +326 -0
- package/dist/utils/markdown.d.ts +113 -0
- package/dist/utils/markdown.js +265 -0
- package/dist/utils/network.d.ts +14 -0
- package/dist/utils/network.js +17 -0
- package/dist/utils/sanitize.d.ts +92 -0
- package/dist/utils/sanitize.js +191 -0
- package/dist/utils/semantic.d.ts +194 -0
- package/dist/utils/semantic.js +1051 -0
- package/dist/utils/smart-truncate.d.ts +94 -0
- package/dist/utils/smart-truncate.js +361 -0
- package/dist/utils/timeout.d.ts +153 -0
- package/dist/utils/timeout.js +205 -0
- package/dist/utils/yaml-parser.d.ts +58 -0
- package/dist/utils/yaml-parser.js +86 -0
- package/dist/validation/index.d.ts +32 -0
- package/dist/validation/index.js +32 -0
- package/dist/validation/semantic-test-generator.d.ts +50 -0
- package/dist/validation/semantic-test-generator.js +176 -0
- package/dist/validation/semantic-types.d.ts +66 -0
- package/dist/validation/semantic-types.js +94 -0
- package/dist/validation/semantic-validator.d.ts +38 -0
- package/dist/validation/semantic-validator.js +340 -0
- package/dist/verification/index.d.ts +6 -0
- package/dist/verification/index.js +5 -0
- package/dist/verification/types.d.ts +133 -0
- package/dist/verification/types.js +5 -0
- package/dist/verification/verifier.d.ts +30 -0
- package/dist/verification/verifier.js +309 -0
- package/dist/version.d.ts +19 -0
- package/dist/version.js +48 -0
- package/dist/workflow/auto-generator.d.ts +27 -0
- package/dist/workflow/auto-generator.js +513 -0
- package/dist/workflow/discovery.d.ts +40 -0
- package/dist/workflow/discovery.js +195 -0
- package/dist/workflow/executor.d.ts +82 -0
- package/dist/workflow/executor.js +611 -0
- package/dist/workflow/index.d.ts +10 -0
- package/dist/workflow/index.js +10 -0
- package/dist/workflow/loader.d.ts +24 -0
- package/dist/workflow/loader.js +194 -0
- package/dist/workflow/state-tracker.d.ts +98 -0
- package/dist/workflow/state-tracker.js +424 -0
- package/dist/workflow/types.d.ts +337 -0
- package/dist/workflow/types.js +5 -0
- package/package.json +94 -0
- package/schemas/bellwether-check.schema.json +651 -0
|
@@ -0,0 +1,1939 @@
|
|
|
1
|
+
import { Orchestrator } from './orchestrator.js';
|
|
2
|
+
import { categorizeErrorSource, detectExternalServiceFromTool, getExternalServiceStatus, } from '../baseline/external-dependency-detector.js';
|
|
3
|
+
import { DEFAULT_PERSONA } from '../persona/builtins.js';
|
|
4
|
+
import { getLogger, startTiming } from '../logging/logger.js';
|
|
5
|
+
import { evaluateAssertions } from '../scenarios/evaluator.js';
|
|
6
|
+
import { withTimeout, DEFAULT_TIMEOUTS, parallelLimit, createMutex } from '../utils/index.js';
|
|
7
|
+
import { INTERVIEW, WORKFLOW, DISPLAY_LIMITS, SCHEMA_TESTING, OUTCOME_ASSESSMENT } from '../constants.js';
|
|
8
|
+
import { generateSchemaTests } from './schema-test-generator.js';
|
|
9
|
+
import { WorkflowDiscoverer } from '../workflow/discovery.js';
|
|
10
|
+
import { WorkflowExecutor } from '../workflow/executor.js';
|
|
11
|
+
import { RateLimiter, calculateBackoffMs, isRateLimitError } from './rate-limiter.js';
|
|
12
|
+
import { inferResponseSchema } from './schema-inferrer.js';
|
|
13
|
+
import { validateResponseAssertions } from './response-validator.js';
|
|
14
|
+
import { StatefulTestRunner } from './stateful-test-runner.js';
|
|
15
|
+
import { resolveToolDependencies, getDependencyOrder } from './dependency-resolver.js';
|
|
16
|
+
import { generateMockResponse } from './mock-response-generator.js';
|
|
17
|
+
/**
 * Baseline interview settings.
 * Options passed to the Interviewer constructor are spread on top of these,
 * so any key the caller supplies replaces the value listed here.
 */
export const DEFAULT_CONFIG = {
    maxQuestionsPerTool: INTERVIEW.MAX_QUESTIONS_PER_TOOL,
    timeout: INTERVIEW.TOOL_TIMEOUT,
    skipErrorTests: false,
};
/**
 * Persona list used when the caller provides none (or an empty list).
 * Contains only the single built-in default persona to keep the default
 * run fast and inexpensive; use --security or --personas to add more.
 */
export const DEFAULT_PERSONAS = [DEFAULT_PERSONA];
|
|
31
|
+
/**
|
|
32
|
+
* Interviewer conducts the interview process using the orchestrator.
|
|
33
|
+
* Supports streaming output for real-time feedback during LLM operations.
|
|
34
|
+
* Supports parallel persona execution for improved performance.
|
|
35
|
+
* Supports caching tool responses and LLM analysis for efficiency.
|
|
36
|
+
*
|
|
37
|
+
* Two modes of operation:
|
|
38
|
+
* - Check mode: No LLM required, uses fallback questions and simple analysis
|
|
39
|
+
* - Explore mode: LLM required for question generation and behavioral analysis
|
|
40
|
+
*/
|
|
41
|
+
export class Interviewer {
|
|
42
|
+
llm;
|
|
43
|
+
config;
|
|
44
|
+
personas;
|
|
45
|
+
logger = getLogger('interviewer');
|
|
46
|
+
serverContext;
|
|
47
|
+
cache;
|
|
48
|
+
rateLimiter;
|
|
49
|
+
responseSchemas = new Map();
|
|
50
|
+
rateLimitEvents = new Map();
|
|
51
|
+
rateLimitRetries = 0;
|
|
52
|
+
externalServiceStatuses = new Map();
|
|
53
|
+
skippedTools = new Set();
|
|
54
|
+
mockedTools = new Set();
|
|
55
|
+
constructor(llm, config) {
|
|
56
|
+
this.llm = llm;
|
|
57
|
+
this.config = { ...DEFAULT_CONFIG, ...config };
|
|
58
|
+
// Validate: if no LLM provided, must be in check mode
|
|
59
|
+
if (!llm && !this.config.checkMode) {
|
|
60
|
+
throw new Error('LLM client is required for explore mode. Use checkMode: true for check mode.');
|
|
61
|
+
}
|
|
62
|
+
// Use multiple personas by default for better coverage
|
|
63
|
+
// Fall back to DEFAULT_PERSONAS if no personas provided or empty array
|
|
64
|
+
const providedPersonas = config?.personas;
|
|
65
|
+
this.personas = (providedPersonas && providedPersonas.length > 0) ? providedPersonas : DEFAULT_PERSONAS;
|
|
66
|
+
// Store cache reference for tool response and analysis caching
|
|
67
|
+
this.cache = config?.cache;
|
|
68
|
+
if (this.config.rateLimit?.enabled) {
|
|
69
|
+
this.rateLimiter = new RateLimiter(this.config.rateLimit);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
/**
|
|
73
|
+
* Create an orchestrator with streaming and caching enabled if configured.
|
|
74
|
+
* Throws an error if called in check mode since orchestrator requires LLM.
|
|
75
|
+
*/
|
|
76
|
+
createOrchestrator(persona) {
|
|
77
|
+
if (!this.llm) {
|
|
78
|
+
throw new Error('Cannot create orchestrator in check mode - LLM client is required');
|
|
79
|
+
}
|
|
80
|
+
const orchestrator = new Orchestrator(this.llm, persona, this.serverContext, this.cache);
|
|
81
|
+
// Enable streaming if configured
|
|
82
|
+
if (this.config.enableStreaming && this.config.streamingCallbacks) {
|
|
83
|
+
orchestrator.enableStreaming(this.config.streamingCallbacks);
|
|
84
|
+
}
|
|
85
|
+
return orchestrator;
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Generate simple analysis for check/fast mode.
|
|
89
|
+
* Avoids LLM calls by providing basic success/error messages.
|
|
90
|
+
*/
|
|
91
|
+
generateSimpleAnalysis(error, hasResponse, successMessage) {
    // A reported error always wins, even if a response is also present.
    if (error) {
        return `Error: ${error}`;
    }
    // Otherwise echo the caller's success text, or note the missing response.
    return hasResponse ? successMessage : 'No response received.';
}
|
|
100
|
+
/**
|
|
101
|
+
* Assess whether the tool interaction outcome matched expectations.
|
|
102
|
+
*/
|
|
103
|
+
assessOutcome(question, response, error) {
    const expected = this.inferExpectedOutcome(question);
    // Either a thrown/transport error or an error-flagged response counts as failure.
    const failed = Boolean(error || response?.isError);
    const actual = failed ? 'error' : 'success';
    // 'either' accepts any outcome; otherwise expectation and reality must agree.
    const correct = expected === 'either' || expected === actual;
    return {
        expected,
        actual,
        correct,
        // True when an error was expected and an error is what happened.
        isValidationSuccess: expected === 'error' && failed,
    };
}
|
|
115
|
+
/**
|
|
116
|
+
* Infer expected outcome when not explicitly provided.
|
|
117
|
+
*/
|
|
118
|
+
inferExpectedOutcome(question) {
|
|
119
|
+
if (question.expectedOutcome)
|
|
120
|
+
return question.expectedOutcome;
|
|
121
|
+
if (OUTCOME_ASSESSMENT.EXPECTS_ERROR_CATEGORIES.includes(question.category)) {
|
|
122
|
+
return 'error';
|
|
123
|
+
}
|
|
124
|
+
if (OUTCOME_ASSESSMENT.EXPECTS_SUCCESS_CATEGORIES.includes(question.category)) {
|
|
125
|
+
return 'success';
|
|
126
|
+
}
|
|
127
|
+
if (OUTCOME_ASSESSMENT.EITHER_OUTCOME_CATEGORIES.includes(question.category)) {
|
|
128
|
+
return 'either';
|
|
129
|
+
}
|
|
130
|
+
if (OUTCOME_ASSESSMENT.EXPECTS_ERROR_PATTERNS.some((pattern) => pattern.test(question.description))) {
|
|
131
|
+
return 'error';
|
|
132
|
+
}
|
|
133
|
+
return 'success';
|
|
134
|
+
}
|
|
135
|
+
/**
 * Pick the error text to report for a tool call.
 * Returns `error` when it is truthy; otherwise the first `text` content
 * entry of the response converted to a string; otherwise null.
 */
extractErrorMessage(response, error) {
    if (error) {
        return error;
    }
    const firstTextPart = response?.content?.find((part) => part.type === 'text');
    const carriesText = Boolean(firstTextPart) && 'text' in firstTextPart;
    return carriesText ? String(firstTextPart.text) : null;
}
|
|
144
|
+
resolveExternalServiceDecision(tool) {
|
|
145
|
+
const externalConfig = this.config.externalServices;
|
|
146
|
+
if (!externalConfig) {
|
|
147
|
+
return { action: 'allow' };
|
|
148
|
+
}
|
|
149
|
+
const detected = detectExternalServiceFromTool(tool.name, tool.description);
|
|
150
|
+
if (!detected) {
|
|
151
|
+
return { action: 'allow' };
|
|
152
|
+
}
|
|
153
|
+
const status = getExternalServiceStatus(detected.serviceName, externalConfig);
|
|
154
|
+
this.externalServiceStatuses.set(detected.serviceName, status);
|
|
155
|
+
if (status.configured) {
|
|
156
|
+
return { action: 'allow', serviceName: detected.serviceName };
|
|
157
|
+
}
|
|
158
|
+
const missing = status.missingCredentials.length > 0
|
|
159
|
+
? `Missing: ${status.missingCredentials.join(', ')}`
|
|
160
|
+
: 'Service not configured';
|
|
161
|
+
if (externalConfig.mode === 'fail') {
|
|
162
|
+
throw new Error(`External service "${detected.displayName}" is not configured. ${missing}`);
|
|
163
|
+
}
|
|
164
|
+
if (externalConfig.mode === 'mock' && status.mockAvailable) {
|
|
165
|
+
return {
|
|
166
|
+
action: 'mock',
|
|
167
|
+
serviceName: detected.serviceName,
|
|
168
|
+
reason: missing,
|
|
169
|
+
};
|
|
170
|
+
}
|
|
171
|
+
return {
|
|
172
|
+
action: 'skip',
|
|
173
|
+
serviceName: detected.serviceName,
|
|
174
|
+
reason: missing,
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
recordRateLimitEvent(toolName) {
|
|
178
|
+
const current = this.rateLimitEvents.get(toolName) ?? 0;
|
|
179
|
+
this.rateLimitEvents.set(toolName, current + 1);
|
|
180
|
+
}
|
|
181
|
+
async callToolWithPolicies(client, tool, args, decisionOverride) {
|
|
182
|
+
const decision = decisionOverride ?? this.resolveExternalServiceDecision(tool);
|
|
183
|
+
if (decision.action === 'skip') {
|
|
184
|
+
this.skippedTools.add(tool.name);
|
|
185
|
+
return {
|
|
186
|
+
response: null,
|
|
187
|
+
error: null,
|
|
188
|
+
skipped: true,
|
|
189
|
+
skipReason: decision.reason,
|
|
190
|
+
toolExecutionMs: 0,
|
|
191
|
+
};
|
|
192
|
+
}
|
|
193
|
+
if (decision.action === 'mock') {
|
|
194
|
+
if (decision.serviceName) {
|
|
195
|
+
this.mockedTools.add(tool.name);
|
|
196
|
+
return {
|
|
197
|
+
response: generateMockResponse(tool, decision.serviceName),
|
|
198
|
+
error: null,
|
|
199
|
+
mocked: true,
|
|
200
|
+
mockService: decision.serviceName,
|
|
201
|
+
toolExecutionMs: 0,
|
|
202
|
+
};
|
|
203
|
+
}
|
|
204
|
+
this.skippedTools.add(tool.name);
|
|
205
|
+
return {
|
|
206
|
+
response: null,
|
|
207
|
+
error: null,
|
|
208
|
+
skipped: true,
|
|
209
|
+
skipReason: 'Mock response unavailable',
|
|
210
|
+
toolExecutionMs: 0,
|
|
211
|
+
};
|
|
212
|
+
}
|
|
213
|
+
const rateLimitEnabled = this.config.rateLimit?.enabled ?? false;
|
|
214
|
+
let attempts = 0;
|
|
215
|
+
let lastError = null;
|
|
216
|
+
let toolExecutionMs = 0;
|
|
217
|
+
while (attempts <= (this.config.rateLimit?.maxRetries ?? 0)) {
|
|
218
|
+
if (this.rateLimiter) {
|
|
219
|
+
await this.rateLimiter.acquire();
|
|
220
|
+
}
|
|
221
|
+
const toolCallStart = Date.now();
|
|
222
|
+
try {
|
|
223
|
+
const response = await client.callTool(tool.name, args);
|
|
224
|
+
toolExecutionMs = Date.now() - toolCallStart;
|
|
225
|
+
const errorMessage = response.isError ? this.extractErrorMessage(response, null) : null;
|
|
226
|
+
if (rateLimitEnabled && response.isError && isRateLimitError(errorMessage)) {
|
|
227
|
+
this.recordRateLimitEvent(tool.name);
|
|
228
|
+
this.rateLimitRetries += 1;
|
|
229
|
+
attempts += 1;
|
|
230
|
+
const backoff = calculateBackoffMs(attempts, this.config.rateLimit?.backoffStrategy ?? 'exponential');
|
|
231
|
+
await new Promise((resolve) => setTimeout(resolve, backoff));
|
|
232
|
+
lastError = errorMessage ?? 'Rate limit exceeded';
|
|
233
|
+
continue;
|
|
234
|
+
}
|
|
235
|
+
return { response, error: errorMessage, toolExecutionMs };
|
|
236
|
+
}
|
|
237
|
+
catch (error) {
|
|
238
|
+
toolExecutionMs = Date.now() - toolCallStart;
|
|
239
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
240
|
+
if (rateLimitEnabled && isRateLimitError(message)) {
|
|
241
|
+
this.recordRateLimitEvent(tool.name);
|
|
242
|
+
this.rateLimitRetries += 1;
|
|
243
|
+
attempts += 1;
|
|
244
|
+
const backoff = calculateBackoffMs(attempts, this.config.rateLimit?.backoffStrategy ?? 'exponential');
|
|
245
|
+
await new Promise((resolve) => setTimeout(resolve, backoff));
|
|
246
|
+
lastError = message;
|
|
247
|
+
continue;
|
|
248
|
+
}
|
|
249
|
+
return { response: null, error: message, toolExecutionMs };
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
return { response: null, error: lastError ?? 'Rate limit exceeded', toolExecutionMs };
|
|
253
|
+
}
|
|
254
|
+
/**
|
|
255
|
+
* Check if we're in fast/check mode (no LLM calls).
|
|
256
|
+
*/
|
|
257
|
+
isCheckMode() {
|
|
258
|
+
return this.config.customScenariosOnly || this.config.checkMode || false;
|
|
259
|
+
}
|
|
260
|
+
/**
|
|
261
|
+
* Extract server context by probing discovery tools.
|
|
262
|
+
* Looks for tools like list_allowed_directories to understand constraints.
|
|
263
|
+
*/
|
|
264
|
+
async extractServerContext(client, discovery) {
|
|
265
|
+
const context = {
|
|
266
|
+
allowedDirectories: [],
|
|
267
|
+
allowedHosts: [],
|
|
268
|
+
constraints: [],
|
|
269
|
+
hints: [],
|
|
270
|
+
};
|
|
271
|
+
// Look for tools that reveal server constraints
|
|
272
|
+
for (const toolName of INTERVIEW.CONSTRAINT_DISCOVERY_TOOLS) {
|
|
273
|
+
const tool = discovery.tools.find(t => t.name === toolName);
|
|
274
|
+
if (tool) {
|
|
275
|
+
try {
|
|
276
|
+
const result = await client.callTool(toolName, {});
|
|
277
|
+
if (result?.content) {
|
|
278
|
+
const textContent = result.content.find(c => c.type === 'text');
|
|
279
|
+
if (textContent && 'text' in textContent) {
|
|
280
|
+
const text = String(textContent.text);
|
|
281
|
+
// Parse allowed directories from response
|
|
282
|
+
const dirs = this.parseAllowedDirectories(text);
|
|
283
|
+
if (dirs.length > 0) {
|
|
284
|
+
context.allowedDirectories = dirs;
|
|
285
|
+
this.logger.info({ dirs }, 'Extracted allowed directories from server');
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
catch (error) {
|
|
291
|
+
this.logger.debug({
|
|
292
|
+
toolName,
|
|
293
|
+
error: error instanceof Error ? error.message : String(error),
|
|
294
|
+
}, 'Tool probe failed during context extraction');
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
// Extract hints and hosts from tool descriptions
|
|
299
|
+
for (const tool of discovery.tools) {
|
|
300
|
+
if (tool.description) {
|
|
301
|
+
const desc = tool.description.toLowerCase();
|
|
302
|
+
// Look for path restrictions mentioned in descriptions
|
|
303
|
+
if (desc.includes('allowed director') || desc.includes('within allowed')) {
|
|
304
|
+
context.hints?.push(`${tool.name}: operates within allowed directories only`);
|
|
305
|
+
}
|
|
306
|
+
if (desc.includes('only works within')) {
|
|
307
|
+
const match = tool.description.match(/only works within (.+?)(?:\.|$)/i);
|
|
308
|
+
if (match) {
|
|
309
|
+
context.hints?.push(`${tool.name}: ${match[0]}`);
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
// Extract allowed hosts/URLs from descriptions
|
|
313
|
+
const urlMatch = tool.description.match(/https?:\/\/[^\s"'<>]+/gi);
|
|
314
|
+
if (urlMatch) {
|
|
315
|
+
for (const url of urlMatch) {
|
|
316
|
+
try {
|
|
317
|
+
const parsed = new URL(url);
|
|
318
|
+
const baseUrl = `${parsed.protocol}//${parsed.host}`;
|
|
319
|
+
if (!context.allowedHosts?.includes(baseUrl)) {
|
|
320
|
+
context.allowedHosts?.push(baseUrl);
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
catch {
|
|
324
|
+
// Invalid URL, skip
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
// If we didn't find explicit directories but have hints, try to infer from CLI args
|
|
331
|
+
// This will be populated by the interview command based on server args
|
|
332
|
+
if (context.allowedDirectories?.length === 0) {
|
|
333
|
+
// Default fallback - will be overridden if server args specify directories
|
|
334
|
+
context.constraints?.push('Server may have directory restrictions - watch for access denied errors');
|
|
335
|
+
}
|
|
336
|
+
return context;
|
|
337
|
+
}
|
|
338
|
+
/**
|
|
339
|
+
* Parse allowed directories from tool response text.
|
|
340
|
+
*/
|
|
341
|
+
parseAllowedDirectories(text) {
|
|
342
|
+
const dirs = [];
|
|
343
|
+
// Try to parse as JSON array
|
|
344
|
+
try {
|
|
345
|
+
const parsed = JSON.parse(text);
|
|
346
|
+
if (Array.isArray(parsed)) {
|
|
347
|
+
return parsed.filter(d => typeof d === 'string' && d.startsWith('/'));
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
catch (error) {
|
|
351
|
+
this.logger.debug({
|
|
352
|
+
error: error instanceof Error ? error.message : String(error),
|
|
353
|
+
textPreview: text.substring(0, 100),
|
|
354
|
+
}, 'Directory list not JSON, trying line-by-line parsing');
|
|
355
|
+
}
|
|
356
|
+
// Parse line by line looking for paths
|
|
357
|
+
const lines = text.split('\n');
|
|
358
|
+
for (const line of lines) {
|
|
359
|
+
const trimmed = line.trim();
|
|
360
|
+
// Match absolute paths
|
|
361
|
+
if (trimmed.startsWith('/') && !trimmed.includes(' ')) {
|
|
362
|
+
dirs.push(trimmed);
|
|
363
|
+
}
|
|
364
|
+
// Match "Allowed: /path" format
|
|
365
|
+
const match = trimmed.match(/allowed[:\s]+(.+)/i);
|
|
366
|
+
if (match) {
|
|
367
|
+
const path = match[1].trim();
|
|
368
|
+
if (path.startsWith('/')) {
|
|
369
|
+
dirs.push(path);
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
return [...new Set(dirs)]; // Dedupe
|
|
374
|
+
}
|
|
375
|
+
/**
|
|
376
|
+
* Set server context directly (e.g., from CLI arguments).
|
|
377
|
+
*/
|
|
378
|
+
setServerContext(context) {
|
|
379
|
+
this.serverContext = context;
|
|
380
|
+
}
|
|
381
|
+
/**
|
|
382
|
+
* Run a complete interview on a connected MCP server.
|
|
383
|
+
* Supports multiple personas - runs each persona's interview and aggregates findings.
|
|
384
|
+
*/
|
|
385
|
+
async interview(client, discovery, onProgress) {
|
|
386
|
+
const done = startTiming(this.logger, 'interview');
|
|
387
|
+
const startTime = new Date();
|
|
388
|
+
this.logger.info({
|
|
389
|
+
serverName: discovery.serverInfo.name,
|
|
390
|
+
toolCount: discovery.tools.length,
|
|
391
|
+
personaCount: this.personas.length,
|
|
392
|
+
}, 'Starting interview');
|
|
393
|
+
// Extract server context if not already set
|
|
394
|
+
if (!this.serverContext) {
|
|
395
|
+
this.serverContext = await this.extractServerContext(client, discovery);
|
|
396
|
+
}
|
|
397
|
+
// Track stats per persona
|
|
398
|
+
const personaStats = new Map();
|
|
399
|
+
for (const persona of this.personas) {
|
|
400
|
+
personaStats.set(persona.id, {
|
|
401
|
+
id: persona.id,
|
|
402
|
+
name: persona.name,
|
|
403
|
+
questionsAsked: 0,
|
|
404
|
+
toolCallCount: 0,
|
|
405
|
+
errorCount: 0,
|
|
406
|
+
});
|
|
407
|
+
}
|
|
408
|
+
const progress = {
|
|
409
|
+
phase: 'starting',
|
|
410
|
+
personasCompleted: 0,
|
|
411
|
+
totalPersonas: this.personas.length,
|
|
412
|
+
toolsCompleted: 0,
|
|
413
|
+
totalTools: discovery.tools.length,
|
|
414
|
+
questionsAsked: 0,
|
|
415
|
+
promptsCompleted: 0,
|
|
416
|
+
totalPrompts: discovery.prompts.length,
|
|
417
|
+
resourcesCompleted: 0,
|
|
418
|
+
totalResources: (discovery.resources ?? []).length,
|
|
419
|
+
};
|
|
420
|
+
onProgress?.(progress);
|
|
421
|
+
// Aggregate interactions by tool across all personas
|
|
422
|
+
const toolInteractionsMap = new Map();
|
|
423
|
+
// Initialize map for each tool
|
|
424
|
+
for (const tool of discovery.tools) {
|
|
425
|
+
toolInteractionsMap.set(tool.name, {
|
|
426
|
+
interactions: [],
|
|
427
|
+
findingsByPersona: [],
|
|
428
|
+
});
|
|
429
|
+
}
|
|
430
|
+
// Track all scenario results
|
|
431
|
+
let allScenarioResults = [];
|
|
432
|
+
let checkModeResult = null;
|
|
433
|
+
// Interview with each persona
|
|
434
|
+
progress.phase = 'interviewing';
|
|
435
|
+
// Check if parallel execution is enabled
|
|
436
|
+
const useParallel = this.config.parallelPersonas && this.personas.length > 1;
|
|
437
|
+
if (useParallel) {
|
|
438
|
+
// Parallel persona execution
|
|
439
|
+
const concurrency = this.config.personaConcurrency ?? INTERVIEW.DEFAULT_PERSONA_CONCURRENCY;
|
|
440
|
+
const toolCallMutex = createMutex(); // Shared mutex for serializing MCP tool calls
|
|
441
|
+
this.logger.info({
|
|
442
|
+
personaCount: this.personas.length,
|
|
443
|
+
concurrency,
|
|
444
|
+
}, 'Running persona interviews in parallel');
|
|
445
|
+
// Create tasks for each persona
|
|
446
|
+
const personaTasks = this.personas.map(persona => async () => {
|
|
447
|
+
progress.currentPersona = persona.name;
|
|
448
|
+
onProgress?.(progress);
|
|
449
|
+
const result = await this.interviewPersona(client, discovery, persona, toolCallMutex);
|
|
450
|
+
progress.personasCompleted++;
|
|
451
|
+
progress.questionsAsked += result.stats.questionsAsked;
|
|
452
|
+
onProgress?.(progress);
|
|
453
|
+
return result;
|
|
454
|
+
});
|
|
455
|
+
// Execute personas in parallel with concurrency limit
|
|
456
|
+
const parallelResults = await parallelLimit(personaTasks, { concurrency });
|
|
457
|
+
// Check for errors
|
|
458
|
+
if (!parallelResults.allSucceeded) {
|
|
459
|
+
for (const [index, error] of parallelResults.errors) {
|
|
460
|
+
this.logger.error({
|
|
461
|
+
persona: this.personas[index]?.name,
|
|
462
|
+
error: error.message,
|
|
463
|
+
}, 'Persona interview failed');
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
// Aggregate results
|
|
467
|
+
const successfulResults = parallelResults.results.filter((r) => r !== undefined);
|
|
468
|
+
const aggregated = this.aggregateParallelResults(successfulResults, discovery);
|
|
469
|
+
// Update tracking maps
|
|
470
|
+
for (const [toolName, data] of aggregated.toolInteractionsMap) {
|
|
471
|
+
const existing = toolInteractionsMap.get(toolName);
|
|
472
|
+
if (existing) {
|
|
473
|
+
existing.interactions = data.interactions;
|
|
474
|
+
existing.findingsByPersona = data.findingsByPersona;
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
// Update persona stats
|
|
478
|
+
for (const [personaId, stats] of aggregated.personaStats) {
|
|
479
|
+
personaStats.set(personaId, stats);
|
|
480
|
+
}
|
|
481
|
+
allScenarioResults = aggregated.allScenarioResults;
|
|
482
|
+
}
|
|
483
|
+
else if (this.config.checkMode) {
|
|
484
|
+
// Check mode tool testing (parallel or sequential based on config)
|
|
485
|
+
// This path doesn't require an LLM - uses fallback questions and simple analysis
|
|
486
|
+
const statefulConfig = this.config.statefulTesting;
|
|
487
|
+
const statefulEnabled = statefulConfig?.enabled ?? false;
|
|
488
|
+
const dependencies = statefulEnabled ? resolveToolDependencies(discovery.tools) : [];
|
|
489
|
+
const dependencyMap = new Map(dependencies.map((d) => [d.tool, d]));
|
|
490
|
+
const toolMap = new Map(discovery.tools.map((tool) => [tool.name, tool]));
|
|
491
|
+
const orderedTools = statefulEnabled
|
|
492
|
+
? getDependencyOrder(dependencies)
|
|
493
|
+
.map((name) => toolMap.get(name))
|
|
494
|
+
.filter((tool) => !!tool)
|
|
495
|
+
: discovery.tools;
|
|
496
|
+
const effectiveConcurrency = statefulEnabled
|
|
497
|
+
? 1
|
|
498
|
+
: this.config.parallelTools
|
|
499
|
+
? (this.config.toolConcurrency ?? INTERVIEW.DEFAULT_TOOL_CONCURRENCY)
|
|
500
|
+
: 1; // Sequential when parallelTools is disabled
|
|
501
|
+
if (statefulEnabled) {
|
|
502
|
+
this.logger.info({ toolCount: orderedTools.length }, 'Stateful testing enabled');
|
|
503
|
+
}
|
|
504
|
+
this.logger.info({ parallel: this.config.parallelTools && !statefulEnabled, concurrency: effectiveConcurrency }, 'Using check mode tool testing');
|
|
505
|
+
const statefulRunner = statefulEnabled
|
|
506
|
+
? new StatefulTestRunner({ shareOutputs: statefulConfig?.shareOutputsBetweenTools ?? true })
|
|
507
|
+
: undefined;
|
|
508
|
+
const parallelResult = await this.interviewToolsInParallel(client, orderedTools, progress, onProgress, {
|
|
509
|
+
statefulRunner,
|
|
510
|
+
dependencyMap,
|
|
511
|
+
statefulConfig,
|
|
512
|
+
});
|
|
513
|
+
checkModeResult = parallelResult;
|
|
514
|
+
// Update tool interactions map with parallel results
|
|
515
|
+
for (const profile of parallelResult.toolProfiles) {
|
|
516
|
+
const toolData = toolInteractionsMap.get(profile.name);
|
|
517
|
+
if (toolData) {
|
|
518
|
+
toolData.interactions = profile.interactions;
|
|
519
|
+
toolData.findingsByPersona = [{
|
|
520
|
+
personaId: 'check_mode',
|
|
521
|
+
personaName: 'Check Mode',
|
|
522
|
+
behavioralNotes: [],
|
|
523
|
+
limitations: [],
|
|
524
|
+
securityNotes: [],
|
|
525
|
+
}];
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
// Update persona stats with aggregated counts
|
|
529
|
+
const checkModeStats = personaStats.get(this.personas[0].id);
|
|
530
|
+
if (checkModeStats) {
|
|
531
|
+
checkModeStats.questionsAsked = parallelResult.totalQuestionsAsked;
|
|
532
|
+
checkModeStats.toolCallCount = parallelResult.totalToolCallCount;
|
|
533
|
+
checkModeStats.errorCount = parallelResult.totalErrorCount;
|
|
534
|
+
}
|
|
535
|
+
allScenarioResults = parallelResult.scenarioResults;
|
|
536
|
+
}
|
|
537
|
+
else {
|
|
538
|
+
// Sequential persona execution (original behavior)
|
|
539
|
+
for (const persona of this.personas) {
|
|
540
|
+
progress.currentPersona = persona.name;
|
|
541
|
+
onProgress?.(progress);
|
|
542
|
+
// Create orchestrator with server context and streaming if enabled
|
|
543
|
+
const orchestrator = this.createOrchestrator(persona);
|
|
544
|
+
const stats = personaStats.get(persona.id);
|
|
545
|
+
// Interview each tool with this persona
|
|
546
|
+
for (const tool of discovery.tools) {
|
|
547
|
+
progress.currentTool = tool.name;
|
|
548
|
+
onProgress?.(progress);
|
|
549
|
+
const personaInteractions = [];
|
|
550
|
+
const previousErrors = [];
|
|
551
|
+
// Check for custom scenarios for this tool
|
|
552
|
+
const customScenarios = this.getScenariosForTool(tool.name);
|
|
553
|
+
// If customScenariosOnly and we have scenarios, skip LLM generation
|
|
554
|
+
let questions = [];
|
|
555
|
+
if (customScenarios.length > 0) {
|
|
556
|
+
// Execute custom scenarios
|
|
557
|
+
const scenarioResults = await this.executeToolScenarios(client, tool.name, customScenarios);
|
|
558
|
+
allScenarioResults.push(...scenarioResults);
|
|
559
|
+
// Convert scenarios to interview questions for integration with profiling
|
|
560
|
+
questions = customScenarios.map(s => this.scenarioToQuestion(s));
|
|
561
|
+
// If not custom-only mode, also generate LLM questions (skip in fast CI mode)
|
|
562
|
+
if (!this.config.customScenariosOnly && !this.config.checkMode) {
|
|
563
|
+
const llmQuestions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
|
|
564
|
+
questions = [...questions, ...llmQuestions];
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
else if (!this.config.customScenariosOnly) {
|
|
568
|
+
// No custom scenarios - generate questions
|
|
569
|
+
if (this.config.checkMode) {
|
|
570
|
+
// Fast CI mode: use fallback questions (no LLM call)
|
|
571
|
+
questions = orchestrator.getFallbackQuestions(tool, this.config.skipErrorTests)
|
|
572
|
+
.slice(0, this.config.maxQuestionsPerTool);
|
|
573
|
+
}
|
|
574
|
+
else {
|
|
575
|
+
// Normal mode: generate LLM questions
|
|
576
|
+
questions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
// If customScenariosOnly and no scenarios for this tool, skip it
|
|
580
|
+
// Ask each question with retry logic
|
|
581
|
+
for (const question of questions) {
|
|
582
|
+
const { interaction, hadError } = await this.executeWithRetry(client, tool, question, orchestrator, persona.id, stats);
|
|
583
|
+
personaInteractions.push(interaction);
|
|
584
|
+
// Track errors for learning
|
|
585
|
+
if (hadError && interaction.error) {
|
|
586
|
+
previousErrors.push({
|
|
587
|
+
args: question.args,
|
|
588
|
+
error: interaction.error,
|
|
589
|
+
});
|
|
590
|
+
// If we have multiple failures, regenerate remaining questions with error context
|
|
591
|
+
// Skip in scenarios-only mode and fast CI mode
|
|
592
|
+
if (!this.config.customScenariosOnly && !this.config.checkMode &&
|
|
593
|
+
previousErrors.length >= 2 && personaInteractions.length < questions.length) {
|
|
594
|
+
const remaining = this.config.maxQuestionsPerTool - personaInteractions.length;
|
|
595
|
+
if (remaining > 0) {
|
|
596
|
+
this.logger.debug({ tool: tool.name, errors: previousErrors.length }, 'Regenerating questions after errors');
|
|
597
|
+
const newQuestions = await orchestrator.generateQuestions(tool, remaining, this.config.skipErrorTests, previousErrors);
|
|
598
|
+
// Replace remaining questions with newly generated ones
|
|
599
|
+
questions = [...questions.slice(0, personaInteractions.length), ...newQuestions];
|
|
600
|
+
}
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
stats.questionsAsked++;
|
|
604
|
+
progress.questionsAsked++;
|
|
605
|
+
onProgress?.(progress);
|
|
606
|
+
}
|
|
607
|
+
// Synthesize this persona's findings for this tool
|
|
608
|
+
// Skip LLM synthesis in scenarios-only mode and fast CI mode
|
|
609
|
+
let personaProfile;
|
|
610
|
+
if (this.config.customScenariosOnly || this.config.checkMode) {
|
|
611
|
+
// Check mode: minimal profile, no misleading error counts
|
|
612
|
+
personaProfile = {
|
|
613
|
+
behavioralNotes: [],
|
|
614
|
+
limitations: [],
|
|
615
|
+
securityNotes: [],
|
|
616
|
+
};
|
|
617
|
+
}
|
|
618
|
+
else {
|
|
619
|
+
personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map(i => ({
|
|
620
|
+
question: i.question,
|
|
621
|
+
response: i.response,
|
|
622
|
+
error: i.error,
|
|
623
|
+
analysis: i.analysis,
|
|
624
|
+
})));
|
|
625
|
+
}
|
|
626
|
+
// Store findings
|
|
627
|
+
const toolData = toolInteractionsMap.get(tool.name);
|
|
628
|
+
toolData.interactions.push(...personaInteractions);
|
|
629
|
+
toolData.findingsByPersona.push({
|
|
630
|
+
personaId: persona.id,
|
|
631
|
+
personaName: persona.name,
|
|
632
|
+
behavioralNotes: personaProfile.behavioralNotes,
|
|
633
|
+
limitations: personaProfile.limitations,
|
|
634
|
+
securityNotes: personaProfile.securityNotes,
|
|
635
|
+
});
|
|
636
|
+
progress.toolsCompleted++;
|
|
637
|
+
onProgress?.(progress);
|
|
638
|
+
}
|
|
639
|
+
progress.personasCompleted++;
|
|
640
|
+
// Reset tool count for next persona
|
|
641
|
+
progress.toolsCompleted = 0;
|
|
642
|
+
onProgress?.(progress);
|
|
643
|
+
}
|
|
644
|
+
}
|
|
645
|
+
// Build aggregated tool profiles
|
|
646
|
+
let toolProfiles = [];
|
|
647
|
+
if (this.config.checkMode && checkModeResult) {
|
|
648
|
+
toolProfiles = checkModeResult.toolProfiles;
|
|
649
|
+
}
|
|
650
|
+
else {
|
|
651
|
+
for (const tool of discovery.tools) {
|
|
652
|
+
const toolData = toolInteractionsMap.get(tool.name);
|
|
653
|
+
// Aggregate findings across personas (deduplicate)
|
|
654
|
+
const aggregatedProfile = this.aggregateFindings(tool.name, tool.description ?? '', toolData);
|
|
655
|
+
toolProfiles.push(aggregatedProfile);
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
// Interview prompts (if server has prompts capability)
|
|
659
|
+
const promptProfiles = [];
|
|
660
|
+
if (discovery.prompts.length > 0) {
|
|
661
|
+
this.logger.info({ promptCount: discovery.prompts.length }, 'Interviewing prompts');
|
|
662
|
+
// Update phase for prompts
|
|
663
|
+
progress.phase = 'prompts';
|
|
664
|
+
progress.promptsCompleted = 0;
|
|
665
|
+
onProgress?.(progress);
|
|
666
|
+
// Only create orchestrator if NOT in check mode (requires LLM)
|
|
667
|
+
const primaryOrchestrator = this.isCheckMode() ? null : this.createOrchestrator(this.personas[0]);
|
|
668
|
+
for (const prompt of discovery.prompts) {
|
|
669
|
+
progress.currentTool = `prompt:${prompt.name}`;
|
|
670
|
+
onProgress?.(progress);
|
|
671
|
+
const promptInteractions = [];
|
|
672
|
+
// Check for custom scenarios for this prompt
|
|
673
|
+
const customScenarios = this.getScenariosForPrompt(prompt.name);
|
|
674
|
+
// Build questions list - custom scenarios + LLM-generated (unless customScenariosOnly)
|
|
675
|
+
let questions = [];
|
|
676
|
+
if (customScenarios.length > 0) {
|
|
677
|
+
// Execute custom prompt scenarios
|
|
678
|
+
const scenarioResults = await this.executePromptScenarios(client, prompt.name, customScenarios);
|
|
679
|
+
allScenarioResults.push(...scenarioResults);
|
|
680
|
+
// Convert scenarios to prompt questions for profiling
|
|
681
|
+
questions = customScenarios.map(s => ({
|
|
682
|
+
description: s.description,
|
|
683
|
+
args: s.args,
|
|
684
|
+
}));
|
|
685
|
+
// If not custom-only mode and not fast CI mode, also generate LLM questions
|
|
686
|
+
if (!this.config.customScenariosOnly && !this.config.checkMode && primaryOrchestrator) {
|
|
687
|
+
const llmQuestions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
|
|
688
|
+
questions = [...questions, ...llmQuestions];
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
else if (!this.config.customScenariosOnly && !this.config.checkMode && primaryOrchestrator) {
|
|
692
|
+
// No custom scenarios - generate LLM questions as usual
|
|
693
|
+
questions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
|
|
694
|
+
}
|
|
695
|
+
else if (this.config.checkMode) {
|
|
696
|
+
// Fast CI mode: use simple fallback question for prompt
|
|
697
|
+
questions = [{ description: 'Basic prompt test', args: {} }];
|
|
698
|
+
}
|
|
699
|
+
// If customScenariosOnly and no scenarios for this prompt, skip it
|
|
700
|
+
for (const question of questions) {
|
|
701
|
+
const interactionStart = Date.now();
|
|
702
|
+
let response = null;
|
|
703
|
+
let error = null;
|
|
704
|
+
try {
|
|
705
|
+
response = await client.getPrompt(prompt.name, question.args);
|
|
706
|
+
}
|
|
707
|
+
catch (e) {
|
|
708
|
+
error = e instanceof Error ? e.message : String(e);
|
|
709
|
+
}
|
|
710
|
+
// Skip LLM analysis in scenarios-only mode and fast CI mode
|
|
711
|
+
let analysis;
|
|
712
|
+
if (this.isCheckMode() || !primaryOrchestrator) {
|
|
713
|
+
analysis = this.generateSimpleAnalysis(error, !!response, 'Prompt call succeeded.');
|
|
714
|
+
}
|
|
715
|
+
else {
|
|
716
|
+
analysis = await primaryOrchestrator.analyzePromptResponse(prompt, question, response, error);
|
|
717
|
+
}
|
|
718
|
+
promptInteractions.push({
|
|
719
|
+
promptName: prompt.name,
|
|
720
|
+
question,
|
|
721
|
+
response,
|
|
722
|
+
error,
|
|
723
|
+
analysis,
|
|
724
|
+
durationMs: Date.now() - interactionStart,
|
|
725
|
+
});
|
|
726
|
+
progress.questionsAsked++;
|
|
727
|
+
onProgress?.(progress);
|
|
728
|
+
}
|
|
729
|
+
// Synthesize prompt profile
|
|
730
|
+
// Skip LLM synthesis in scenarios-only mode and fast CI mode
|
|
731
|
+
let profile;
|
|
732
|
+
if (this.config.customScenariosOnly || this.config.checkMode || !primaryOrchestrator) {
|
|
733
|
+
// Check mode: minimal profile, no misleading error counts
|
|
734
|
+
profile = {
|
|
735
|
+
name: prompt.name,
|
|
736
|
+
description: prompt.description || prompt.name,
|
|
737
|
+
arguments: prompt.arguments || [],
|
|
738
|
+
behavioralNotes: [],
|
|
739
|
+
limitations: [],
|
|
740
|
+
};
|
|
741
|
+
}
|
|
742
|
+
else {
|
|
743
|
+
profile = await primaryOrchestrator.synthesizePromptProfile(prompt, promptInteractions.map(i => ({
|
|
744
|
+
question: i.question,
|
|
745
|
+
response: i.response,
|
|
746
|
+
error: i.error,
|
|
747
|
+
analysis: i.analysis,
|
|
748
|
+
})));
|
|
749
|
+
}
|
|
750
|
+
promptProfiles.push({
|
|
751
|
+
...profile,
|
|
752
|
+
interactions: promptInteractions,
|
|
753
|
+
});
|
|
754
|
+
// Update prompt progress
|
|
755
|
+
progress.promptsCompleted = (progress.promptsCompleted ?? 0) + 1;
|
|
756
|
+
onProgress?.(progress);
|
|
757
|
+
}
|
|
758
|
+
}
|
|
759
|
+
// Interview resources (if server has resources capability)
|
|
760
|
+
// Skip in scenarios-only mode since there's no resource scenario format
|
|
761
|
+
const resourceProfiles = [];
|
|
762
|
+
let resourceReadCount = 0;
|
|
763
|
+
const discoveredResources = discovery.resources ?? [];
|
|
764
|
+
if (discoveredResources.length > 0 && !this.config.customScenariosOnly) {
|
|
765
|
+
this.logger.info({ resourceCount: discoveredResources.length }, 'Interviewing resources');
|
|
766
|
+
// Update phase for resources
|
|
767
|
+
progress.phase = 'resources';
|
|
768
|
+
progress.resourcesCompleted = 0;
|
|
769
|
+
onProgress?.(progress);
|
|
770
|
+
// Only create orchestrator if NOT in check mode (requires LLM)
|
|
771
|
+
const primaryOrchestrator = this.isCheckMode() ? null : this.createOrchestrator(this.personas[0]);
|
|
772
|
+
for (const resource of discoveredResources) {
|
|
773
|
+
progress.currentTool = `resource:${resource.name}`;
|
|
774
|
+
onProgress?.(progress);
|
|
775
|
+
const resourceInteractions = [];
|
|
776
|
+
// Generate resource questions (skip LLM in fast CI mode)
|
|
777
|
+
let questions;
|
|
778
|
+
if (this.config.checkMode || !primaryOrchestrator) {
|
|
779
|
+
// Fast CI mode: use simple fallback question
|
|
780
|
+
questions = [{ description: 'Basic resource read test', category: 'happy_path' }];
|
|
781
|
+
}
|
|
782
|
+
else {
|
|
783
|
+
questions = await primaryOrchestrator.generateResourceQuestions(resource, 2);
|
|
784
|
+
}
|
|
785
|
+
for (const question of questions) {
|
|
786
|
+
const interactionStart = Date.now();
|
|
787
|
+
let response = null;
|
|
788
|
+
let error = null;
|
|
789
|
+
try {
|
|
790
|
+
// Apply timeout to resource read to prevent indefinite hangs
|
|
791
|
+
response = await withTimeout(client.readResource(resource.uri), this.config.resourceTimeout ?? DEFAULT_TIMEOUTS.resourceRead, `Resource read: ${resource.uri}`);
|
|
792
|
+
resourceReadCount++;
|
|
793
|
+
}
|
|
794
|
+
catch (e) {
|
|
795
|
+
error = e instanceof Error ? e.message : String(e);
|
|
796
|
+
resourceReadCount++;
|
|
797
|
+
}
|
|
798
|
+
// Skip LLM analysis in fast CI mode
|
|
799
|
+
let analysis;
|
|
800
|
+
if (this.isCheckMode() || !primaryOrchestrator) {
|
|
801
|
+
analysis = this.generateSimpleAnalysis(error, !!response, 'Resource read succeeded.');
|
|
802
|
+
}
|
|
803
|
+
else {
|
|
804
|
+
analysis = await primaryOrchestrator.analyzeResourceResponse(resource, question, response, error);
|
|
805
|
+
}
|
|
806
|
+
resourceInteractions.push({
|
|
807
|
+
resourceUri: resource.uri,
|
|
808
|
+
resourceName: resource.name,
|
|
809
|
+
question,
|
|
810
|
+
response,
|
|
811
|
+
error,
|
|
812
|
+
analysis,
|
|
813
|
+
durationMs: Date.now() - interactionStart,
|
|
814
|
+
});
|
|
815
|
+
progress.questionsAsked++;
|
|
816
|
+
onProgress?.(progress);
|
|
817
|
+
}
|
|
818
|
+
// Synthesize resource profile (skip LLM in fast CI mode)
|
|
819
|
+
let profile;
|
|
820
|
+
if (this.config.checkMode || !primaryOrchestrator) {
|
|
821
|
+
// Check mode: minimal profile, no misleading error counts
|
|
822
|
+
profile = {
|
|
823
|
+
name: resource.name,
|
|
824
|
+
uri: resource.uri,
|
|
825
|
+
description: resource.description || resource.name,
|
|
826
|
+
mimeType: resource.mimeType,
|
|
827
|
+
behavioralNotes: [],
|
|
828
|
+
limitations: [],
|
|
829
|
+
};
|
|
830
|
+
}
|
|
831
|
+
else {
|
|
832
|
+
profile = await primaryOrchestrator.synthesizeResourceProfile(resource, resourceInteractions.map(i => ({
|
|
833
|
+
question: i.question,
|
|
834
|
+
response: i.response,
|
|
835
|
+
error: i.error,
|
|
836
|
+
analysis: i.analysis,
|
|
837
|
+
})));
|
|
838
|
+
}
|
|
839
|
+
// Extract content preview from first successful read
|
|
840
|
+
let contentPreview;
|
|
841
|
+
const successfulRead = resourceInteractions.find(i => i.response && !i.error);
|
|
842
|
+
if (successfulRead?.response?.contents?.[0]) {
|
|
843
|
+
const content = successfulRead.response.contents[0];
|
|
844
|
+
if (content.text) {
|
|
845
|
+
contentPreview = content.text.length > DISPLAY_LIMITS.CONTENT_TEXT_PREVIEW
|
|
846
|
+
? content.text.substring(0, DISPLAY_LIMITS.CONTENT_TEXT_PREVIEW) + '...'
|
|
847
|
+
: content.text;
|
|
848
|
+
}
|
|
849
|
+
else if (content.blob) {
|
|
850
|
+
contentPreview = `[Binary data: ${content.blob.length} bytes base64]`;
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
resourceProfiles.push({
|
|
854
|
+
...profile,
|
|
855
|
+
interactions: resourceInteractions,
|
|
856
|
+
contentPreview,
|
|
857
|
+
});
|
|
858
|
+
// Update resource progress
|
|
859
|
+
progress.resourcesCompleted = (progress.resourcesCompleted ?? 0) + 1;
|
|
860
|
+
onProgress?.(progress);
|
|
861
|
+
}
|
|
862
|
+
}
|
|
863
|
+
// Execute workflows if configured
|
|
864
|
+
let workflowResults;
|
|
865
|
+
let workflowSummary;
|
|
866
|
+
const workflowConfig = this.config.workflowConfig;
|
|
867
|
+
if (workflowConfig && (workflowConfig.workflows?.length || workflowConfig.discoverWorkflows)) {
|
|
868
|
+
progress.phase = 'workflows';
|
|
869
|
+
onProgress?.(progress);
|
|
870
|
+
const { results, summary } = await this.executeWorkflows(client, discovery, workflowConfig, progress, onProgress);
|
|
871
|
+
workflowResults = results.length > 0 ? results : undefined;
|
|
872
|
+
workflowSummary = summary;
|
|
873
|
+
}
|
|
874
|
+
// Synthesize overall findings (use first persona's orchestrator for synthesis)
|
|
875
|
+
// Skip LLM synthesis in scenarios-only mode and fast CI mode
|
|
876
|
+
progress.phase = 'synthesizing';
|
|
877
|
+
onProgress?.(progress);
|
|
878
|
+
let overall;
|
|
879
|
+
if (this.config.customScenariosOnly || this.config.checkMode) {
|
|
880
|
+
// Check mode: simple summary focused on verification, not pass/fail
|
|
881
|
+
const serverName = discovery.serverInfo.name || 'This MCP server';
|
|
882
|
+
overall = {
|
|
883
|
+
summary: `${serverName} provides ${toolProfiles.length} tool(s) for MCP integration.`,
|
|
884
|
+
limitations: [],
|
|
885
|
+
recommendations: [],
|
|
886
|
+
};
|
|
887
|
+
}
|
|
888
|
+
else {
|
|
889
|
+
const primaryOrchestrator = this.createOrchestrator(this.personas[0]);
|
|
890
|
+
overall = await primaryOrchestrator.synthesizeOverall(discovery, toolProfiles);
|
|
891
|
+
}
|
|
892
|
+
// Calculate totals
|
|
893
|
+
let totalToolCallCount = 0;
|
|
894
|
+
let totalErrorCount = 0;
|
|
895
|
+
for (const stats of personaStats.values()) {
|
|
896
|
+
totalToolCallCount += stats.toolCallCount;
|
|
897
|
+
totalErrorCount += stats.errorCount;
|
|
898
|
+
}
|
|
899
|
+
const endTime = new Date();
|
|
900
|
+
const allInteractions = toolProfiles.flatMap((p) => p.interactions);
|
|
901
|
+
const assertionSummary = summarizeAssertions(allInteractions);
|
|
902
|
+
const rateLimitSummary = this.rateLimitEvents.size > 0
|
|
903
|
+
? {
|
|
904
|
+
totalEvents: Array.from(this.rateLimitEvents.values()).reduce((sum, v) => sum + v, 0),
|
|
905
|
+
totalRetries: this.rateLimitRetries,
|
|
906
|
+
tools: Array.from(this.rateLimitEvents.keys()),
|
|
907
|
+
}
|
|
908
|
+
: undefined;
|
|
909
|
+
const externalServicesSummary = this.externalServiceStatuses.size > 0
|
|
910
|
+
? {
|
|
911
|
+
mode: this.config.externalServices?.mode ?? 'skip',
|
|
912
|
+
unconfiguredServices: Array.from(this.externalServiceStatuses.values())
|
|
913
|
+
.filter((s) => !s.configured)
|
|
914
|
+
.map((s) => s.service),
|
|
915
|
+
skippedTools: Array.from(this.skippedTools),
|
|
916
|
+
mockedTools: Array.from(this.mockedTools),
|
|
917
|
+
}
|
|
918
|
+
: undefined;
|
|
919
|
+
const statefulSummary = this.config.statefulTesting?.enabled
|
|
920
|
+
? {
|
|
921
|
+
enabled: true,
|
|
922
|
+
toolCount: toolProfiles.length,
|
|
923
|
+
dependencyCount: toolProfiles.reduce((sum, profile) => sum + (profile.dependencyInfo?.dependsOn.length ?? 0), 0),
|
|
924
|
+
maxChainLength: this.config.statefulTesting?.maxChainLength ?? 0,
|
|
925
|
+
}
|
|
926
|
+
: undefined;
|
|
927
|
+
const metadata = {
|
|
928
|
+
startTime,
|
|
929
|
+
endTime,
|
|
930
|
+
durationMs: endTime.getTime() - startTime.getTime(),
|
|
931
|
+
toolCallCount: totalToolCallCount,
|
|
932
|
+
resourceReadCount: resourceReadCount > 0 ? resourceReadCount : undefined,
|
|
933
|
+
errorCount: totalErrorCount,
|
|
934
|
+
model: this.config.checkMode ? 'check' : this.config.model,
|
|
935
|
+
personas: Array.from(personaStats.values()),
|
|
936
|
+
workflows: workflowSummary,
|
|
937
|
+
serverCommand: this.config.serverCommand,
|
|
938
|
+
rateLimit: rateLimitSummary,
|
|
939
|
+
externalServices: externalServicesSummary,
|
|
940
|
+
assertions: assertionSummary,
|
|
941
|
+
statefulTesting: statefulSummary,
|
|
942
|
+
};
|
|
943
|
+
progress.phase = 'complete';
|
|
944
|
+
onProgress?.(progress);
|
|
945
|
+
this.logger.info({
|
|
946
|
+
toolsProfiled: toolProfiles.length,
|
|
947
|
+
totalToolCalls: totalToolCallCount,
|
|
948
|
+
totalErrors: totalErrorCount,
|
|
949
|
+
durationMs: metadata.durationMs,
|
|
950
|
+
}, 'Interview complete');
|
|
951
|
+
done();
|
|
952
|
+
return {
|
|
953
|
+
discovery,
|
|
954
|
+
toolProfiles,
|
|
955
|
+
promptProfiles: promptProfiles.length > 0 ? promptProfiles : undefined,
|
|
956
|
+
resourceProfiles: resourceProfiles.length > 0 ? resourceProfiles : undefined,
|
|
957
|
+
workflowResults,
|
|
958
|
+
scenarioResults: allScenarioResults.length > 0 ? allScenarioResults : undefined,
|
|
959
|
+
summary: overall.summary,
|
|
960
|
+
limitations: overall.limitations,
|
|
961
|
+
recommendations: overall.recommendations,
|
|
962
|
+
metadata,
|
|
963
|
+
};
|
|
964
|
+
}
|
|
965
|
+
/**
|
|
966
|
+
* Classify errors from interactions to separate tool correctness from environment issues.
|
|
967
|
+
*/
|
|
968
|
+
classifyErrors(interactions, toolName, toolDescription) {
|
|
969
|
+
let externalServiceErrors = 0;
|
|
970
|
+
let environmentErrors = 0;
|
|
971
|
+
let codeBugErrors = 0;
|
|
972
|
+
let unknownErrors = 0;
|
|
973
|
+
const detectedServices = new Set();
|
|
974
|
+
for (const interaction of interactions) {
|
|
975
|
+
if (interaction.error) {
|
|
976
|
+
const analysis = categorizeErrorSource(interaction.error, toolName, toolDescription);
|
|
977
|
+
switch (analysis.source) {
|
|
978
|
+
case 'external_dependency':
|
|
979
|
+
externalServiceErrors++;
|
|
980
|
+
if (analysis.dependency?.displayName) {
|
|
981
|
+
detectedServices.add(analysis.dependency.displayName);
|
|
982
|
+
}
|
|
983
|
+
break;
|
|
984
|
+
case 'environment':
|
|
985
|
+
environmentErrors++;
|
|
986
|
+
break;
|
|
987
|
+
case 'code_bug':
|
|
988
|
+
codeBugErrors++;
|
|
989
|
+
break;
|
|
990
|
+
default:
|
|
991
|
+
unknownErrors++;
|
|
992
|
+
}
|
|
993
|
+
}
|
|
994
|
+
}
|
|
995
|
+
return {
|
|
996
|
+
externalServiceErrors,
|
|
997
|
+
environmentErrors,
|
|
998
|
+
codeBugErrors,
|
|
999
|
+
unknownErrors,
|
|
1000
|
+
detectedServices: detectedServices.size > 0 ? Array.from(detectedServices) : undefined,
|
|
1001
|
+
};
|
|
1002
|
+
}
|
|
1003
|
+
/**
|
|
1004
|
+
* Aggregate findings from multiple personas into a single tool profile.
|
|
1005
|
+
*/
|
|
1006
|
+
aggregateFindings(toolName, description, data) {
|
|
1007
|
+
// Collect all notes, deduplicating similar content
|
|
1008
|
+
const behavioralNotes = new Set();
|
|
1009
|
+
const limitations = new Set();
|
|
1010
|
+
const securityNotes = new Set();
|
|
1011
|
+
for (const findings of data.findingsByPersona) {
|
|
1012
|
+
for (const note of findings.behavioralNotes) {
|
|
1013
|
+
behavioralNotes.add(note);
|
|
1014
|
+
}
|
|
1015
|
+
for (const limitation of findings.limitations) {
|
|
1016
|
+
limitations.add(limitation);
|
|
1017
|
+
}
|
|
1018
|
+
for (const note of findings.securityNotes) {
|
|
1019
|
+
securityNotes.add(note);
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
// Classify errors to separate tool correctness from environment issues
|
|
1023
|
+
const errorClassification = this.classifyErrors(data.interactions, toolName, description);
|
|
1024
|
+
return {
|
|
1025
|
+
name: toolName,
|
|
1026
|
+
description,
|
|
1027
|
+
interactions: data.interactions,
|
|
1028
|
+
behavioralNotes: Array.from(behavioralNotes),
|
|
1029
|
+
limitations: Array.from(limitations),
|
|
1030
|
+
securityNotes: Array.from(securityNotes),
|
|
1031
|
+
findingsByPersona: data.findingsByPersona,
|
|
1032
|
+
errorClassification,
|
|
1033
|
+
};
|
|
1034
|
+
}
|
|
1035
|
+
/**
|
|
1036
|
+
* Execute a tool call with retry logic for recoverable errors.
|
|
1037
|
+
* Learns from errors and can update server context based on error messages.
|
|
1038
|
+
* Uses caching to avoid redundant tool calls with identical arguments.
|
|
1039
|
+
*/
|
|
1040
|
+
async executeWithRetry(client, tool, question, orchestrator, personaId, stats) {
|
|
1041
|
+
const interactionStart = Date.now();
|
|
1042
|
+
let response = null;
|
|
1043
|
+
let error = null;
|
|
1044
|
+
let hadError = false;
|
|
1045
|
+
let fromCache = false;
|
|
1046
|
+
let toolExecutionMs = 0;
|
|
1047
|
+
let llmAnalysisMs = 0;
|
|
1048
|
+
let mocked = false;
|
|
1049
|
+
let mockService;
|
|
1050
|
+
// Check cache for tool response (same tool + same args = same response)
|
|
1051
|
+
if (this.cache) {
|
|
1052
|
+
const cachedResponse = this.cache.getToolResponse(tool.name, question.args);
|
|
1053
|
+
if (cachedResponse) {
|
|
1054
|
+
response = cachedResponse;
|
|
1055
|
+
fromCache = true;
|
|
1056
|
+
this.logger.debug({ toolName: tool.name, args: question.args }, 'Tool response served from cache');
|
|
1057
|
+
stats.toolCallCount++; // Still count as a tool call for metrics
|
|
1058
|
+
if (response.isError) {
|
|
1059
|
+
stats.errorCount++;
|
|
1060
|
+
hadError = true;
|
|
1061
|
+
const errorContent = response.content?.find(c => c.type === 'text');
|
|
1062
|
+
if (errorContent && 'text' in errorContent) {
|
|
1063
|
+
error = String(errorContent.text);
|
|
1064
|
+
}
|
|
1065
|
+
}
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
// Make actual tool call if not cached
|
|
1069
|
+
if (!fromCache) {
|
|
1070
|
+
const result = await this.callToolWithPolicies(client, tool, question.args);
|
|
1071
|
+
response = result.response;
|
|
1072
|
+
error = result.error;
|
|
1073
|
+
toolExecutionMs = result.toolExecutionMs;
|
|
1074
|
+
mocked = !!result.mocked;
|
|
1075
|
+
mockService = result.mockService;
|
|
1076
|
+
if (result.skipped) {
|
|
1077
|
+
error = result.skipReason ?? 'Skipped: external service not configured';
|
|
1078
|
+
hadError = true;
|
|
1079
|
+
}
|
|
1080
|
+
else {
|
|
1081
|
+
stats.toolCallCount++;
|
|
1082
|
+
if (error || response?.isError) {
|
|
1083
|
+
stats.errorCount++;
|
|
1084
|
+
hadError = true;
|
|
1085
|
+
if (error) {
|
|
1086
|
+
this.learnFromError(error, orchestrator);
|
|
1087
|
+
}
|
|
1088
|
+
}
|
|
1089
|
+
else if (this.cache && response) {
|
|
1090
|
+
// Cache successful responses for reuse by other personas
|
|
1091
|
+
// Don't cache errors as they may be transient
|
|
1092
|
+
this.cache.setToolResponse(tool.name, question.args, response);
|
|
1093
|
+
this.logger.debug({ toolName: tool.name, args: question.args }, 'Tool response cached');
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
}
|
|
1097
|
+
// Analyze the response with this persona's perspective
|
|
1098
|
+
// Skip LLM analysis in scenarios-only mode and fast CI mode
|
|
1099
|
+
let analysis;
|
|
1100
|
+
const llmAnalysisStart = Date.now();
|
|
1101
|
+
if (this.isCheckMode()) {
|
|
1102
|
+
// In fast mode, generate simple analysis (no LLM call)
|
|
1103
|
+
analysis = this.generateSimpleAnalysis(error, !!response, 'Tool call succeeded.');
|
|
1104
|
+
llmAnalysisMs = 0; // No LLM call in fast mode
|
|
1105
|
+
}
|
|
1106
|
+
else {
|
|
1107
|
+
const analysisTool = { name: tool.name, description: tool.description ?? '' };
|
|
1108
|
+
analysis = await orchestrator.analyzeResponse(analysisTool, question, response, error);
|
|
1109
|
+
llmAnalysisMs = Date.now() - llmAnalysisStart;
|
|
1110
|
+
}
|
|
1111
|
+
const interaction = {
|
|
1112
|
+
toolName: tool.name,
|
|
1113
|
+
question,
|
|
1114
|
+
response,
|
|
1115
|
+
error,
|
|
1116
|
+
analysis,
|
|
1117
|
+
durationMs: Date.now() - interactionStart,
|
|
1118
|
+
toolExecutionMs: fromCache ? 0 : toolExecutionMs,
|
|
1119
|
+
llmAnalysisMs,
|
|
1120
|
+
personaId,
|
|
1121
|
+
outcomeAssessment: this.assessOutcome(question, response, error),
|
|
1122
|
+
mocked,
|
|
1123
|
+
mockService,
|
|
1124
|
+
};
|
|
1125
|
+
return { interaction, hadError };
|
|
1126
|
+
}
|
|
1127
|
+
/**
|
|
1128
|
+
* Learn server constraints from error messages.
|
|
1129
|
+
* Updates server context with discovered restrictions.
|
|
1130
|
+
*/
|
|
1131
|
+
learnFromError(error, orchestrator) {
|
|
1132
|
+
// Extract allowed directories from error messages
|
|
1133
|
+
const pathMatch = error.match(/access denied|not allowed|outside.*(?:allowed|permitted).*?([/\\][^\s"']+)/i);
|
|
1134
|
+
if (pathMatch) {
|
|
1135
|
+
// Error mentions a path restriction
|
|
1136
|
+
const constraint = `Path access restricted: ${error.substring(0, DISPLAY_LIMITS.ERROR_CONSTRAINT_LENGTH)}`;
|
|
1137
|
+
const currentContext = orchestrator.getServerContext() ?? { constraints: [] };
|
|
1138
|
+
if (!currentContext.constraints?.includes(constraint)) {
|
|
1139
|
+
currentContext.constraints = [...(currentContext.constraints ?? []), constraint];
|
|
1140
|
+
orchestrator.setServerContext(currentContext);
|
|
1141
|
+
}
|
|
1142
|
+
}
|
|
1143
|
+
// Extract allowed directories explicitly mentioned
|
|
1144
|
+
const allowedMatch = error.match(/allowed director(?:y|ies)[:\s]+([^\n]+)/i);
|
|
1145
|
+
if (allowedMatch) {
|
|
1146
|
+
const dirs = allowedMatch[1].split(/[,\s]+/).filter(d => d.startsWith('/'));
|
|
1147
|
+
if (dirs.length > 0) {
|
|
1148
|
+
const currentContext = orchestrator.getServerContext() ?? { allowedDirectories: [] };
|
|
1149
|
+
const existingDirs = currentContext.allowedDirectories ?? [];
|
|
1150
|
+
const newDirs = [...new Set([...existingDirs, ...dirs])];
|
|
1151
|
+
if (newDirs.length > existingDirs.length) {
|
|
1152
|
+
currentContext.allowedDirectories = newDirs;
|
|
1153
|
+
orchestrator.setServerContext(currentContext);
|
|
1154
|
+
this.logger.debug({ dirs: newDirs }, 'Learned allowed directories from error');
|
|
1155
|
+
}
|
|
1156
|
+
}
|
|
1157
|
+
}
|
|
1158
|
+
}
|
|
1159
|
+
/**
|
|
1160
|
+
* Interview all tools with a single persona.
|
|
1161
|
+
* Designed for parallel execution across personas.
|
|
1162
|
+
*
|
|
1163
|
+
* @param client - MCP client for tool calls
|
|
1164
|
+
* @param discovery - Discovery result with available tools
|
|
1165
|
+
* @param persona - Persona to use for this interview
|
|
1166
|
+
* @param toolCallMutex - Mutex for serializing tool calls (shared resource)
|
|
1167
|
+
* @returns PersonaInterviewData with all interactions and findings
|
|
1168
|
+
*/
|
|
1169
|
+
async interviewPersona(client, discovery, persona, toolCallMutex) {
|
|
1170
|
+
const orchestrator = this.createOrchestrator(persona);
|
|
1171
|
+
const stats = {
|
|
1172
|
+
id: persona.id,
|
|
1173
|
+
name: persona.name,
|
|
1174
|
+
questionsAsked: 0,
|
|
1175
|
+
toolCallCount: 0,
|
|
1176
|
+
errorCount: 0,
|
|
1177
|
+
};
|
|
1178
|
+
const toolInteractions = new Map();
|
|
1179
|
+
const toolFindings = new Map();
|
|
1180
|
+
const scenarioResults = [];
|
|
1181
|
+
// Interview each tool with this persona
|
|
1182
|
+
for (const tool of discovery.tools) {
|
|
1183
|
+
const personaInteractions = [];
|
|
1184
|
+
const previousErrors = [];
|
|
1185
|
+
// Check for custom scenarios for this tool
|
|
1186
|
+
const customScenarios = this.getScenariosForTool(tool.name);
|
|
1187
|
+
// Build questions list
|
|
1188
|
+
let questions = [];
|
|
1189
|
+
if (customScenarios.length > 0) {
|
|
1190
|
+
// Execute custom scenarios (need mutex for tool calls)
|
|
1191
|
+
await toolCallMutex.acquire();
|
|
1192
|
+
try {
|
|
1193
|
+
const results = await this.executeToolScenarios(client, tool.name, customScenarios);
|
|
1194
|
+
scenarioResults.push(...results);
|
|
1195
|
+
}
|
|
1196
|
+
finally {
|
|
1197
|
+
toolCallMutex.release();
|
|
1198
|
+
}
|
|
1199
|
+
// Convert scenarios to interview questions
|
|
1200
|
+
questions = customScenarios.map(s => this.scenarioToQuestion(s));
|
|
1201
|
+
// If not custom-only mode, also generate LLM questions
|
|
1202
|
+
if (!this.config.customScenariosOnly) {
|
|
1203
|
+
const llmQuestions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
|
|
1204
|
+
questions = [...questions, ...llmQuestions];
|
|
1205
|
+
}
|
|
1206
|
+
}
|
|
1207
|
+
else if (!this.config.customScenariosOnly) {
|
|
1208
|
+
// No custom scenarios - generate LLM questions as usual
|
|
1209
|
+
questions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
|
|
1210
|
+
}
|
|
1211
|
+
// Ask each question with retry logic
|
|
1212
|
+
for (const question of questions) {
|
|
1213
|
+
// Acquire mutex for tool calls (shared MCP client)
|
|
1214
|
+
await toolCallMutex.acquire();
|
|
1215
|
+
let interaction;
|
|
1216
|
+
let hadError;
|
|
1217
|
+
try {
|
|
1218
|
+
const result = await this.executeWithRetry(client, tool, question, orchestrator, persona.id, stats);
|
|
1219
|
+
interaction = result.interaction;
|
|
1220
|
+
hadError = result.hadError;
|
|
1221
|
+
}
|
|
1222
|
+
finally {
|
|
1223
|
+
toolCallMutex.release();
|
|
1224
|
+
}
|
|
1225
|
+
personaInteractions.push(interaction);
|
|
1226
|
+
// Track errors for learning
|
|
1227
|
+
if (hadError && interaction.error) {
|
|
1228
|
+
previousErrors.push({
|
|
1229
|
+
args: question.args,
|
|
1230
|
+
error: interaction.error,
|
|
1231
|
+
});
|
|
1232
|
+
// If we have multiple failures, regenerate remaining questions
|
|
1233
|
+
if (!this.config.customScenariosOnly &&
|
|
1234
|
+
previousErrors.length >= 2 && personaInteractions.length < questions.length) {
|
|
1235
|
+
const remaining = this.config.maxQuestionsPerTool - personaInteractions.length;
|
|
1236
|
+
if (remaining > 0) {
|
|
1237
|
+
this.logger.debug({ tool: tool.name, errors: previousErrors.length }, 'Regenerating questions after errors');
|
|
1238
|
+
const newQuestions = await orchestrator.generateQuestions(tool, remaining, this.config.skipErrorTests, previousErrors);
|
|
1239
|
+
questions = [...questions.slice(0, personaInteractions.length), ...newQuestions];
|
|
1240
|
+
}
|
|
1241
|
+
}
|
|
1242
|
+
}
|
|
1243
|
+
stats.questionsAsked++;
|
|
1244
|
+
}
|
|
1245
|
+
// Synthesize this persona's findings for this tool
|
|
1246
|
+
let personaProfile;
|
|
1247
|
+
if (this.config.customScenariosOnly) {
|
|
1248
|
+
// Scenarios-only mode: minimal profile, no misleading error counts
|
|
1249
|
+
personaProfile = {
|
|
1250
|
+
behavioralNotes: [],
|
|
1251
|
+
limitations: [],
|
|
1252
|
+
securityNotes: [],
|
|
1253
|
+
};
|
|
1254
|
+
}
|
|
1255
|
+
else {
|
|
1256
|
+
personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map(i => ({
|
|
1257
|
+
question: i.question,
|
|
1258
|
+
response: i.response,
|
|
1259
|
+
error: i.error,
|
|
1260
|
+
analysis: i.analysis,
|
|
1261
|
+
})));
|
|
1262
|
+
}
|
|
1263
|
+
// Store interactions and findings
|
|
1264
|
+
toolInteractions.set(tool.name, personaInteractions);
|
|
1265
|
+
toolFindings.set(tool.name, {
|
|
1266
|
+
personaId: persona.id,
|
|
1267
|
+
personaName: persona.name,
|
|
1268
|
+
behavioralNotes: personaProfile.behavioralNotes,
|
|
1269
|
+
limitations: personaProfile.limitations,
|
|
1270
|
+
securityNotes: personaProfile.securityNotes,
|
|
1271
|
+
});
|
|
1272
|
+
}
|
|
1273
|
+
this.logger.debug({
|
|
1274
|
+
persona: persona.name,
|
|
1275
|
+
toolCount: discovery.tools.length,
|
|
1276
|
+
questionsAsked: stats.questionsAsked,
|
|
1277
|
+
}, 'Persona interview complete');
|
|
1278
|
+
return {
|
|
1279
|
+
persona,
|
|
1280
|
+
stats,
|
|
1281
|
+
toolInteractions,
|
|
1282
|
+
toolFindings,
|
|
1283
|
+
scenarioResults,
|
|
1284
|
+
};
|
|
1285
|
+
}
|
|
1286
|
+
/**
|
|
1287
|
+
* Aggregate results from parallel persona interviews.
|
|
1288
|
+
*/
|
|
1289
|
+
aggregateParallelResults(personaResults, discovery) {
|
|
1290
|
+
const toolInteractionsMap = new Map();
|
|
1291
|
+
// Initialize map for each tool
|
|
1292
|
+
for (const tool of discovery.tools) {
|
|
1293
|
+
toolInteractionsMap.set(tool.name, {
|
|
1294
|
+
interactions: [],
|
|
1295
|
+
findingsByPersona: [],
|
|
1296
|
+
});
|
|
1297
|
+
}
|
|
1298
|
+
const personaStats = new Map();
|
|
1299
|
+
const allScenarioResults = [];
|
|
1300
|
+
// Aggregate results from each persona
|
|
1301
|
+
for (const result of personaResults) {
|
|
1302
|
+
personaStats.set(result.persona.id, result.stats);
|
|
1303
|
+
allScenarioResults.push(...result.scenarioResults);
|
|
1304
|
+
// Merge tool interactions
|
|
1305
|
+
for (const [toolName, interactions] of result.toolInteractions) {
|
|
1306
|
+
const toolData = toolInteractionsMap.get(toolName);
|
|
1307
|
+
if (toolData) {
|
|
1308
|
+
toolData.interactions.push(...interactions);
|
|
1309
|
+
}
|
|
1310
|
+
}
|
|
1311
|
+
// Merge tool findings
|
|
1312
|
+
for (const [toolName, findings] of result.toolFindings) {
|
|
1313
|
+
const toolData = toolInteractionsMap.get(toolName);
|
|
1314
|
+
if (toolData) {
|
|
1315
|
+
toolData.findingsByPersona.push(findings);
|
|
1316
|
+
}
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
return { toolInteractionsMap, personaStats, allScenarioResults };
|
|
1320
|
+
}
|
|
1321
|
+
/**
|
|
1322
|
+
* Interview a single tool in check mode (parallel-safe).
|
|
1323
|
+
* Designed for parallel tool testing with minimal overhead.
|
|
1324
|
+
*
|
|
1325
|
+
* @param client - MCP client for tool calls
|
|
1326
|
+
* @param tool - Tool to test
|
|
1327
|
+
* @param toolCallMutex - Mutex for serializing tool calls (shared resource)
|
|
1328
|
+
* @returns ToolCheckResult with interactions and stats
|
|
1329
|
+
*/
|
|
1330
|
+
async interviewToolInCheckMode(client, tool, toolCallMutex, statefulRunner, dependencyInfo, statefulConfig) {
|
|
1331
|
+
const interactions = [];
|
|
1332
|
+
const scenarioResults = [];
|
|
1333
|
+
let questionsAsked = 0;
|
|
1334
|
+
let toolCallCount = 0;
|
|
1335
|
+
let errorCount = 0;
|
|
1336
|
+
const maxChainLength = statefulConfig?.maxChainLength ?? Number.POSITIVE_INFINITY;
|
|
1337
|
+
const allowStateful = !!statefulRunner && (dependencyInfo?.sequencePosition ?? 0) < maxChainLength;
|
|
1338
|
+
const externalDecision = this.resolveExternalServiceDecision(tool);
|
|
1339
|
+
if (externalDecision.action === 'skip') {
|
|
1340
|
+
this.skippedTools.add(tool.name);
|
|
1341
|
+
return {
|
|
1342
|
+
toolName: tool.name,
|
|
1343
|
+
interactions: [],
|
|
1344
|
+
scenarioResults,
|
|
1345
|
+
questionsAsked,
|
|
1346
|
+
toolCallCount,
|
|
1347
|
+
errorCount,
|
|
1348
|
+
skipped: true,
|
|
1349
|
+
skipReason: externalDecision.reason,
|
|
1350
|
+
dependencyInfo,
|
|
1351
|
+
};
|
|
1352
|
+
}
|
|
1353
|
+
// Check for custom scenarios for this tool
|
|
1354
|
+
const customScenarios = this.getScenariosForTool(tool.name);
|
|
1355
|
+
// Build questions list - custom scenarios or fallback questions
|
|
1356
|
+
let questions = [];
|
|
1357
|
+
if (customScenarios.length > 0) {
|
|
1358
|
+
// Execute custom scenarios
|
|
1359
|
+
await toolCallMutex.acquire();
|
|
1360
|
+
try {
|
|
1361
|
+
const results = await this.executeToolScenarios(client, tool.name, customScenarios);
|
|
1362
|
+
scenarioResults.push(...results);
|
|
1363
|
+
toolCallCount += results.length;
|
|
1364
|
+
errorCount += results.filter(r => !r.passed).length;
|
|
1365
|
+
}
|
|
1366
|
+
finally {
|
|
1367
|
+
toolCallMutex.release();
|
|
1368
|
+
}
|
|
1369
|
+
// Convert scenarios to interview questions
|
|
1370
|
+
questions = customScenarios.map(s => this.scenarioToQuestion(s));
|
|
1371
|
+
}
|
|
1372
|
+
else {
|
|
1373
|
+
// No custom scenarios - use fallback questions (check mode, no LLM)
|
|
1374
|
+
// We need an orchestrator for fallback questions, but we won't use LLM
|
|
1375
|
+
// Get fallback questions directly
|
|
1376
|
+
questions = this.getFallbackQuestionsForTool(tool, this.config.skipErrorTests)
|
|
1377
|
+
.slice(0, this.config.maxQuestionsPerTool);
|
|
1378
|
+
}
|
|
1379
|
+
// Execute warmup runs if configured (helps reduce cold-start timing variance)
|
|
1380
|
+
// Warmup runs are not recorded in interactions
|
|
1381
|
+
const warmupRuns = this.config.warmupRuns ?? 1;
|
|
1382
|
+
if (warmupRuns > 0 && questions.length > 0) {
|
|
1383
|
+
const warmupQuestion = questions[0]; // Use first question for warmup
|
|
1384
|
+
await toolCallMutex.acquire();
|
|
1385
|
+
try {
|
|
1386
|
+
for (let i = 0; i < warmupRuns; i++) {
|
|
1387
|
+
try {
|
|
1388
|
+
await this.callToolWithPolicies(client, tool, warmupQuestion.args, externalDecision);
|
|
1389
|
+
}
|
|
1390
|
+
catch {
|
|
1391
|
+
// Ignore warmup errors - we just want to warm up the system
|
|
1392
|
+
}
|
|
1393
|
+
}
|
|
1394
|
+
}
|
|
1395
|
+
finally {
|
|
1396
|
+
toolCallMutex.release();
|
|
1397
|
+
}
|
|
1398
|
+
this.logger.debug({ tool: tool.name, warmupRuns }, 'Warmup runs complete');
|
|
1399
|
+
}
|
|
1400
|
+
// Ask each question
|
|
1401
|
+
for (const question of questions) {
|
|
1402
|
+
const interactionStart = Date.now();
|
|
1403
|
+
let response = null;
|
|
1404
|
+
let error = null;
|
|
1405
|
+
let toolExecutionMs = 0;
|
|
1406
|
+
let assertionResults;
|
|
1407
|
+
let assertionsPassed;
|
|
1408
|
+
let mocked = false;
|
|
1409
|
+
let mockService;
|
|
1410
|
+
const expectedOutcome = this.inferExpectedOutcome(question);
|
|
1411
|
+
const shouldUseState = allowStateful && expectedOutcome !== 'error';
|
|
1412
|
+
const statefulArgs = shouldUseState && statefulRunner
|
|
1413
|
+
? statefulRunner.applyStateToQuestion(tool.name, question)
|
|
1414
|
+
: { args: { ...question.args }, usedKeys: [] };
|
|
1415
|
+
const resolvedQuestion = {
|
|
1416
|
+
...question,
|
|
1417
|
+
args: statefulArgs.args,
|
|
1418
|
+
metadata: {
|
|
1419
|
+
...question.metadata,
|
|
1420
|
+
stateful: {
|
|
1421
|
+
usedKeys: statefulArgs.usedKeys,
|
|
1422
|
+
},
|
|
1423
|
+
},
|
|
1424
|
+
};
|
|
1425
|
+
// Acquire mutex for tool calls (shared MCP client)
|
|
1426
|
+
await toolCallMutex.acquire();
|
|
1427
|
+
try {
|
|
1428
|
+
const result = await this.callToolWithPolicies(client, tool, resolvedQuestion.args, externalDecision);
|
|
1429
|
+
response = result.response;
|
|
1430
|
+
error = result.error;
|
|
1431
|
+
toolExecutionMs = result.toolExecutionMs;
|
|
1432
|
+
mocked = !!result.mocked;
|
|
1433
|
+
mockService = result.mockService;
|
|
1434
|
+
if (!result.skipped) {
|
|
1435
|
+
toolCallCount++;
|
|
1436
|
+
if (error || response?.isError) {
|
|
1437
|
+
errorCount++;
|
|
1438
|
+
}
|
|
1439
|
+
}
|
|
1440
|
+
}
|
|
1441
|
+
finally {
|
|
1442
|
+
toolCallMutex.release();
|
|
1443
|
+
}
|
|
1444
|
+
// Generate simple analysis (no LLM in check mode)
|
|
1445
|
+
const analysis = this.generateSimpleAnalysis(error, !!response, 'Tool call succeeded.');
|
|
1446
|
+
const outcomeAssessment = this.assessOutcome(resolvedQuestion, response, error);
|
|
1447
|
+
if (this.config.assertions?.enabled && outcomeAssessment.expected === 'success' && response && !response.isError) {
|
|
1448
|
+
let schema = this.responseSchemas.get(tool.name);
|
|
1449
|
+
if (!schema && this.config.assertions?.infer) {
|
|
1450
|
+
const inferred = inferResponseSchema(response);
|
|
1451
|
+
if (inferred) {
|
|
1452
|
+
schema = inferred;
|
|
1453
|
+
this.responseSchemas.set(tool.name, inferred);
|
|
1454
|
+
}
|
|
1455
|
+
}
|
|
1456
|
+
if (schema) {
|
|
1457
|
+
assertionResults = validateResponseAssertions(response, schema);
|
|
1458
|
+
assertionsPassed = assertionResults.every((r) => r.passed);
|
|
1459
|
+
}
|
|
1460
|
+
}
|
|
1461
|
+
if (allowStateful && response && !response.isError && statefulRunner) {
|
|
1462
|
+
const providedKeys = statefulRunner.recordResponse(tool, response);
|
|
1463
|
+
resolvedQuestion.metadata = {
|
|
1464
|
+
...resolvedQuestion.metadata,
|
|
1465
|
+
stateful: {
|
|
1466
|
+
...(resolvedQuestion.metadata?.stateful ?? {}),
|
|
1467
|
+
providedKeys,
|
|
1468
|
+
},
|
|
1469
|
+
};
|
|
1470
|
+
}
|
|
1471
|
+
const interaction = {
|
|
1472
|
+
toolName: tool.name,
|
|
1473
|
+
question: resolvedQuestion,
|
|
1474
|
+
response,
|
|
1475
|
+
error,
|
|
1476
|
+
analysis,
|
|
1477
|
+
durationMs: Date.now() - interactionStart,
|
|
1478
|
+
toolExecutionMs,
|
|
1479
|
+
llmAnalysisMs: 0, // No LLM in check mode
|
|
1480
|
+
personaId: 'check_mode',
|
|
1481
|
+
outcomeAssessment,
|
|
1482
|
+
assertionResults,
|
|
1483
|
+
assertionsPassed,
|
|
1484
|
+
mocked,
|
|
1485
|
+
mockService,
|
|
1486
|
+
};
|
|
1487
|
+
interactions.push(interaction);
|
|
1488
|
+
questionsAsked++;
|
|
1489
|
+
}
|
|
1490
|
+
this.logger.debug({
|
|
1491
|
+
tool: tool.name,
|
|
1492
|
+
questionsAsked,
|
|
1493
|
+
toolCallCount,
|
|
1494
|
+
errorCount,
|
|
1495
|
+
}, 'Tool check complete');
|
|
1496
|
+
return {
|
|
1497
|
+
toolName: tool.name,
|
|
1498
|
+
interactions,
|
|
1499
|
+
scenarioResults,
|
|
1500
|
+
questionsAsked,
|
|
1501
|
+
toolCallCount,
|
|
1502
|
+
errorCount,
|
|
1503
|
+
mocked: interactions.some((i) => i.mocked),
|
|
1504
|
+
mockService: interactions.find((i) => i.mockService)?.mockService,
|
|
1505
|
+
responseSchema: this.responseSchemas.get(tool.name),
|
|
1506
|
+
dependencyInfo,
|
|
1507
|
+
};
|
|
1508
|
+
}
|
|
1509
|
+
/**
|
|
1510
|
+
* Get fallback questions for a tool without requiring an orchestrator.
|
|
1511
|
+
* Used in check mode when parallel tool testing is enabled.
|
|
1512
|
+
*
|
|
1513
|
+
* Uses the SchemaTestGenerator to produce comprehensive deterministic tests
|
|
1514
|
+
* including boundaries, type coercion, enum validation, and error handling.
|
|
1515
|
+
*/
|
|
1516
|
+
getFallbackQuestionsForTool(tool, skipErrorTests) {
|
|
1517
|
+
// Use the enhanced schema test generator for comprehensive coverage
|
|
1518
|
+
// Allow more tests in check mode since there's no LLM cost
|
|
1519
|
+
const maxTests = Math.max(this.config.maxQuestionsPerTool * 4, SCHEMA_TESTING.MAX_TESTS_PER_TOOL);
|
|
1520
|
+
return generateSchemaTests(tool, {
|
|
1521
|
+
skipErrorTests,
|
|
1522
|
+
maxTestsPerTool: maxTests,
|
|
1523
|
+
});
|
|
1524
|
+
}
|
|
1525
|
+
/**
|
|
1526
|
+
* Run parallel tool testing in check mode.
|
|
1527
|
+
* Tests all tools concurrently with a configurable worker limit.
|
|
1528
|
+
*
|
|
1529
|
+
* @param client - MCP client for tool calls
|
|
1530
|
+
* @param tools - Tools to test
|
|
1531
|
+
* @param onProgress - Progress callback
|
|
1532
|
+
* @returns Aggregated tool profiles
|
|
1533
|
+
*/
|
|
1534
|
+
async interviewToolsInParallel(client, tools, progress, onProgress, options) {
|
|
1535
|
+
// Use concurrency=1 for sequential execution when parallelTools is disabled
|
|
1536
|
+
const statefulEnabled = !!options?.statefulRunner;
|
|
1537
|
+
const concurrency = statefulEnabled
|
|
1538
|
+
? 1
|
|
1539
|
+
: this.config.parallelTools
|
|
1540
|
+
? (this.config.toolConcurrency ?? INTERVIEW.DEFAULT_TOOL_CONCURRENCY)
|
|
1541
|
+
: 1;
|
|
1542
|
+
const toolCallMutex = createMutex(); // Shared mutex for serializing MCP client calls
|
|
1543
|
+
this.logger.info({
|
|
1544
|
+
toolCount: tools.length,
|
|
1545
|
+
concurrency,
|
|
1546
|
+
parallel: this.config.parallelTools,
|
|
1547
|
+
}, 'Running check mode tool testing');
|
|
1548
|
+
// Create tasks for each tool
|
|
1549
|
+
const toolTasks = tools.map(tool => async () => {
|
|
1550
|
+
progress.currentTool = tool.name;
|
|
1551
|
+
onProgress?.(progress);
|
|
1552
|
+
const result = await this.interviewToolInCheckMode(client, tool, toolCallMutex, options?.statefulRunner, options?.dependencyMap?.get(tool.name), options?.statefulConfig);
|
|
1553
|
+
progress.toolsCompleted++;
|
|
1554
|
+
progress.questionsAsked += result.questionsAsked;
|
|
1555
|
+
progress.lastCompletedTool = this.buildToolProgressSummary(result);
|
|
1556
|
+
onProgress?.(progress);
|
|
1557
|
+
return result;
|
|
1558
|
+
});
|
|
1559
|
+
// Execute tools in parallel with concurrency limit
|
|
1560
|
+
const parallelResults = await parallelLimit(toolTasks, { concurrency });
|
|
1561
|
+
// Check for errors
|
|
1562
|
+
if (!parallelResults.allSucceeded) {
|
|
1563
|
+
for (const [index, error] of parallelResults.errors) {
|
|
1564
|
+
this.logger.error({
|
|
1565
|
+
tool: tools[index]?.name,
|
|
1566
|
+
error: error.message,
|
|
1567
|
+
}, 'Tool check failed');
|
|
1568
|
+
}
|
|
1569
|
+
}
|
|
1570
|
+
// Aggregate results
|
|
1571
|
+
const successfulResults = parallelResults.results.filter((r) => r !== undefined);
|
|
1572
|
+
const toolProfiles = [];
|
|
1573
|
+
const scenarioResults = [];
|
|
1574
|
+
let totalToolCallCount = 0;
|
|
1575
|
+
let totalErrorCount = 0;
|
|
1576
|
+
let totalQuestionsAsked = 0;
|
|
1577
|
+
for (const result of successfulResults) {
|
|
1578
|
+
const tool = tools.find(t => t.name === result.toolName);
|
|
1579
|
+
if (!tool)
|
|
1580
|
+
continue;
|
|
1581
|
+
// Classify errors to separate tool correctness from environment issues
|
|
1582
|
+
const errorClassification = this.classifyErrors(result.interactions, result.toolName, tool.description ?? '');
|
|
1583
|
+
const assertionSummary = summarizeAssertions(result.interactions);
|
|
1584
|
+
// Build minimal profile for check mode
|
|
1585
|
+
toolProfiles.push({
|
|
1586
|
+
name: result.toolName,
|
|
1587
|
+
description: tool.description ?? '',
|
|
1588
|
+
interactions: result.interactions,
|
|
1589
|
+
behavioralNotes: [],
|
|
1590
|
+
limitations: [],
|
|
1591
|
+
securityNotes: [],
|
|
1592
|
+
findingsByPersona: [],
|
|
1593
|
+
errorClassification,
|
|
1594
|
+
skipped: result.skipped,
|
|
1595
|
+
skipReason: result.skipReason,
|
|
1596
|
+
mocked: result.mocked,
|
|
1597
|
+
mockService: result.mockService,
|
|
1598
|
+
responseSchema: result.responseSchema,
|
|
1599
|
+
assertionSummary,
|
|
1600
|
+
dependencyInfo: result.dependencyInfo,
|
|
1601
|
+
});
|
|
1602
|
+
scenarioResults.push(...result.scenarioResults);
|
|
1603
|
+
totalToolCallCount += result.toolCallCount;
|
|
1604
|
+
totalErrorCount += result.errorCount;
|
|
1605
|
+
totalQuestionsAsked += result.questionsAsked;
|
|
1606
|
+
}
|
|
1607
|
+
this.logger.info({
|
|
1608
|
+
toolCount: toolProfiles.length,
|
|
1609
|
+
totalToolCallCount,
|
|
1610
|
+
totalErrorCount,
|
|
1611
|
+
}, 'Parallel tool testing complete');
|
|
1612
|
+
return {
|
|
1613
|
+
toolProfiles,
|
|
1614
|
+
scenarioResults,
|
|
1615
|
+
totalToolCallCount,
|
|
1616
|
+
totalErrorCount,
|
|
1617
|
+
totalQuestionsAsked,
|
|
1618
|
+
};
|
|
1619
|
+
}
|
|
1620
|
+
buildToolProgressSummary(result) {
|
|
1621
|
+
const interactions = result.interactions.filter(i => !i.mocked);
|
|
1622
|
+
const totalTests = interactions.length;
|
|
1623
|
+
let passedTests = 0;
|
|
1624
|
+
let validationTotal = 0;
|
|
1625
|
+
let validationPassed = 0;
|
|
1626
|
+
let totalDuration = 0;
|
|
1627
|
+
for (const interaction of interactions) {
|
|
1628
|
+
totalDuration += interaction.durationMs;
|
|
1629
|
+
const assessment = interaction.outcomeAssessment;
|
|
1630
|
+
if (assessment) {
|
|
1631
|
+
if (assessment.correct) {
|
|
1632
|
+
passedTests += 1;
|
|
1633
|
+
}
|
|
1634
|
+
if (assessment.expected === 'error') {
|
|
1635
|
+
validationTotal += 1;
|
|
1636
|
+
if (assessment.correct) {
|
|
1637
|
+
validationPassed += 1;
|
|
1638
|
+
}
|
|
1639
|
+
}
|
|
1640
|
+
}
|
|
1641
|
+
else {
|
|
1642
|
+
const hasError = interaction.error || interaction.response?.isError;
|
|
1643
|
+
if (!hasError) {
|
|
1644
|
+
passedTests += 1;
|
|
1645
|
+
}
|
|
1646
|
+
}
|
|
1647
|
+
}
|
|
1648
|
+
const avgMs = totalTests > 0 ? Math.round(totalDuration / totalTests) : 0;
|
|
1649
|
+
return {
|
|
1650
|
+
toolName: result.toolName,
|
|
1651
|
+
totalTests,
|
|
1652
|
+
passedTests,
|
|
1653
|
+
validationTotal,
|
|
1654
|
+
validationPassed,
|
|
1655
|
+
avgMs,
|
|
1656
|
+
skipped: result.skipped,
|
|
1657
|
+
skipReason: result.skipReason,
|
|
1658
|
+
mocked: result.mocked,
|
|
1659
|
+
mockService: result.mockService,
|
|
1660
|
+
};
|
|
1661
|
+
}
|
|
1662
|
+
/**
|
|
1663
|
+
* Convert a TestScenario to an InterviewQuestion.
|
|
1664
|
+
*/
|
|
1665
|
+
scenarioToQuestion(scenario) {
|
|
1666
|
+
return {
|
|
1667
|
+
description: scenario.description,
|
|
1668
|
+
category: scenario.category,
|
|
1669
|
+
args: scenario.args,
|
|
1670
|
+
};
|
|
1671
|
+
}
|
|
1672
|
+
/**
|
|
1673
|
+
* Get custom scenarios for a specific tool.
|
|
1674
|
+
*/
|
|
1675
|
+
getScenariosForTool(toolName) {
|
|
1676
|
+
const scenarios = this.config.customScenarios?.toolScenarios ?? [];
|
|
1677
|
+
return scenarios.filter(s => s.tool === toolName && !s.skip);
|
|
1678
|
+
}
|
|
1679
|
+
/**
|
|
1680
|
+
* Get custom scenarios for a specific prompt.
|
|
1681
|
+
*/
|
|
1682
|
+
getScenariosForPrompt(promptName) {
|
|
1683
|
+
const scenarios = this.config.customScenarios?.promptScenarios ?? [];
|
|
1684
|
+
return scenarios.filter(s => s.prompt === promptName && !s.skip);
|
|
1685
|
+
}
|
|
1686
|
+
/**
|
|
1687
|
+
* Execute custom test scenarios for a tool.
|
|
1688
|
+
* Returns scenario results with assertion evaluations.
|
|
1689
|
+
*/
|
|
1690
|
+
async executeToolScenarios(client, toolName, scenarios) {
|
|
1691
|
+
const results = [];
|
|
1692
|
+
const tool = { name: toolName, description: '' };
|
|
1693
|
+
for (const scenario of scenarios) {
|
|
1694
|
+
if (scenario.skip) {
|
|
1695
|
+
continue;
|
|
1696
|
+
}
|
|
1697
|
+
const startTime = Date.now();
|
|
1698
|
+
let response = null;
|
|
1699
|
+
let error;
|
|
1700
|
+
let isError = false;
|
|
1701
|
+
try {
|
|
1702
|
+
const result = await this.callToolWithPolicies(client, tool, scenario.args);
|
|
1703
|
+
if (result.skipped) {
|
|
1704
|
+
error = result.skipReason ?? 'Skipped: external service not configured';
|
|
1705
|
+
isError = true;
|
|
1706
|
+
}
|
|
1707
|
+
else {
|
|
1708
|
+
response = result.response;
|
|
1709
|
+
isError = response?.isError ?? false;
|
|
1710
|
+
if (isError) {
|
|
1711
|
+
const errorContent = response?.content?.find(c => c.type === 'text');
|
|
1712
|
+
if (errorContent && 'text' in errorContent) {
|
|
1713
|
+
error = String(errorContent.text);
|
|
1714
|
+
}
|
|
1715
|
+
}
|
|
1716
|
+
if (result.error) {
|
|
1717
|
+
error = result.error;
|
|
1718
|
+
isError = true;
|
|
1719
|
+
}
|
|
1720
|
+
}
|
|
1721
|
+
}
|
|
1722
|
+
catch (e) {
|
|
1723
|
+
error = e instanceof Error ? e.message : String(e);
|
|
1724
|
+
isError = true;
|
|
1725
|
+
}
|
|
1726
|
+
// Evaluate assertions if provided
|
|
1727
|
+
const assertionResults = scenario.assertions
|
|
1728
|
+
? evaluateAssertions(scenario.assertions, response, isError)
|
|
1729
|
+
: [];
|
|
1730
|
+
// Scenario passes if no error (or expected error) and all assertions pass
|
|
1731
|
+
const allAssertionsPassed = assertionResults.every(r => r.passed);
|
|
1732
|
+
const passed = allAssertionsPassed && (!isError || scenario.category === 'error_handling');
|
|
1733
|
+
const result = {
|
|
1734
|
+
scenario,
|
|
1735
|
+
passed,
|
|
1736
|
+
assertionResults,
|
|
1737
|
+
error,
|
|
1738
|
+
response,
|
|
1739
|
+
durationMs: Date.now() - startTime,
|
|
1740
|
+
};
|
|
1741
|
+
results.push(result);
|
|
1742
|
+
this.logger.debug({
|
|
1743
|
+
tool: toolName,
|
|
1744
|
+
scenario: scenario.description,
|
|
1745
|
+
passed,
|
|
1746
|
+
assertions: assertionResults.length,
|
|
1747
|
+
}, 'Scenario executed');
|
|
1748
|
+
}
|
|
1749
|
+
return results;
|
|
1750
|
+
}
|
|
1751
|
+
/**
|
|
1752
|
+
* Execute custom test scenarios for a prompt.
|
|
1753
|
+
* Returns scenario results with assertion evaluations.
|
|
1754
|
+
*/
|
|
1755
|
+
async executePromptScenarios(client, promptName, scenarios) {
|
|
1756
|
+
const results = [];
|
|
1757
|
+
for (const scenario of scenarios) {
|
|
1758
|
+
if (scenario.skip) {
|
|
1759
|
+
continue;
|
|
1760
|
+
}
|
|
1761
|
+
const startTime = Date.now();
|
|
1762
|
+
let response = null;
|
|
1763
|
+
let error;
|
|
1764
|
+
try {
|
|
1765
|
+
response = await client.getPrompt(promptName, scenario.args);
|
|
1766
|
+
}
|
|
1767
|
+
catch (e) {
|
|
1768
|
+
error = e instanceof Error ? e.message : String(e);
|
|
1769
|
+
}
|
|
1770
|
+
// Evaluate assertions if provided
|
|
1771
|
+
const assertionResults = scenario.assertions
|
|
1772
|
+
? evaluateAssertions(scenario.assertions, response, !!error)
|
|
1773
|
+
: [];
|
|
1774
|
+
const allAssertionsPassed = assertionResults.every(r => r.passed);
|
|
1775
|
+
// Check if this scenario expects an error (has an assertion checking for 'error' to exist)
|
|
1776
|
+
const expectsError = scenario.assertions?.some(a => a.path === 'error' && a.condition === 'exists') ?? false;
|
|
1777
|
+
// Scenario passes if assertions pass AND (no error OR scenario expects error)
|
|
1778
|
+
const passed = allAssertionsPassed && (!error || expectsError);
|
|
1779
|
+
const result = {
|
|
1780
|
+
scenario,
|
|
1781
|
+
passed,
|
|
1782
|
+
assertionResults,
|
|
1783
|
+
error,
|
|
1784
|
+
response,
|
|
1785
|
+
durationMs: Date.now() - startTime,
|
|
1786
|
+
};
|
|
1787
|
+
results.push(result);
|
|
1788
|
+
this.logger.debug({
|
|
1789
|
+
prompt: promptName,
|
|
1790
|
+
scenario: scenario.description,
|
|
1791
|
+
passed,
|
|
1792
|
+
assertions: assertionResults.length,
|
|
1793
|
+
}, 'Prompt scenario executed');
|
|
1794
|
+
}
|
|
1795
|
+
return results;
|
|
1796
|
+
}
|
|
1797
|
+
/**
|
|
1798
|
+
* Execute workflow discovery and/or execution.
|
|
1799
|
+
* Discovers workflows using LLM if enabled, loads from file if provided,
|
|
1800
|
+
* and executes all workflows against the MCP server.
|
|
1801
|
+
*/
|
|
1802
|
+
async executeWorkflows(client, discovery, workflowConfig, progress, onProgress) {
|
|
1803
|
+
const allWorkflows = [];
|
|
1804
|
+
let discoveredCount = 0;
|
|
1805
|
+
let loadedCount = 0;
|
|
1806
|
+
// Add user-provided workflows
|
|
1807
|
+
if (workflowConfig.workflows && workflowConfig.workflows.length > 0) {
|
|
1808
|
+
allWorkflows.push(...workflowConfig.workflows);
|
|
1809
|
+
loadedCount = workflowConfig.workflows.length;
|
|
1810
|
+
this.logger.info({ count: loadedCount }, 'Using workflows loaded from file');
|
|
1811
|
+
}
|
|
1812
|
+
// Discover workflows using LLM if enabled (requires LLM - skip in check mode)
|
|
1813
|
+
if (workflowConfig.discoverWorkflows && discovery.tools.length >= 2 && this.llm) {
|
|
1814
|
+
this.logger.info('Discovering workflows using LLM analysis');
|
|
1815
|
+
const discoverer = new WorkflowDiscoverer(this.llm, {
|
|
1816
|
+
maxWorkflows: workflowConfig.maxDiscoveredWorkflows ?? WORKFLOW.MAX_DISCOVERED_WORKFLOWS,
|
|
1817
|
+
minSteps: WORKFLOW.MIN_WORKFLOW_STEPS,
|
|
1818
|
+
maxSteps: WORKFLOW.MAX_WORKFLOW_STEPS,
|
|
1819
|
+
});
|
|
1820
|
+
try {
|
|
1821
|
+
const discovered = await discoverer.discover(discovery.tools);
|
|
1822
|
+
if (discovered.length > 0) {
|
|
1823
|
+
allWorkflows.push(...discovered);
|
|
1824
|
+
discoveredCount = discovered.length;
|
|
1825
|
+
this.logger.info({
|
|
1826
|
+
count: discoveredCount,
|
|
1827
|
+
workflows: discovered.map(w => w.name),
|
|
1828
|
+
}, 'Discovered workflows');
|
|
1829
|
+
}
|
|
1830
|
+
else {
|
|
1831
|
+
this.logger.info('No workflows discovered from tool analysis');
|
|
1832
|
+
}
|
|
1833
|
+
}
|
|
1834
|
+
catch (error) {
|
|
1835
|
+
this.logger.warn({
|
|
1836
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1837
|
+
}, 'Workflow discovery failed');
|
|
1838
|
+
}
|
|
1839
|
+
}
|
|
1840
|
+
// Execute all workflows
|
|
1841
|
+
const results = [];
|
|
1842
|
+
// Execute workflows (requires LLM for analysis - skip in check mode unless analyzeSteps is disabled)
|
|
1843
|
+
if (allWorkflows.length > 0 && !workflowConfig.skipWorkflowExecution && this.llm) {
|
|
1844
|
+
this.logger.info({ count: allWorkflows.length }, 'Executing workflows');
|
|
1845
|
+
progress.totalWorkflows = allWorkflows.length;
|
|
1846
|
+
progress.workflowsCompleted = 0;
|
|
1847
|
+
onProgress?.(progress);
|
|
1848
|
+
const stepTimeout = workflowConfig.stepTimeout ?? WORKFLOW.STEP_TIMEOUT;
|
|
1849
|
+
const timeouts = workflowConfig.timeouts ?? {
|
|
1850
|
+
toolCall: stepTimeout,
|
|
1851
|
+
stateSnapshot: WORKFLOW.STATE_SNAPSHOT_TIMEOUT,
|
|
1852
|
+
probeTool: WORKFLOW.PROBE_TOOL_TIMEOUT,
|
|
1853
|
+
llmAnalysis: WORKFLOW.LLM_ANALYSIS_TIMEOUT,
|
|
1854
|
+
llmSummary: WORKFLOW.LLM_SUMMARY_TIMEOUT,
|
|
1855
|
+
};
|
|
1856
|
+
const executor = new WorkflowExecutor(client, this.llm, discovery.tools, {
|
|
1857
|
+
stepTimeout,
|
|
1858
|
+
analyzeSteps: !this.config.customScenariosOnly,
|
|
1859
|
+
generateSummary: !this.config.customScenariosOnly,
|
|
1860
|
+
stateTracking: workflowConfig.enableStateTracking
|
|
1861
|
+
? {
|
|
1862
|
+
enabled: true,
|
|
1863
|
+
snapshotBefore: true,
|
|
1864
|
+
snapshotAfter: true,
|
|
1865
|
+
snapshotAfterEachStep: false,
|
|
1866
|
+
}
|
|
1867
|
+
: undefined,
|
|
1868
|
+
timeouts,
|
|
1869
|
+
});
|
|
1870
|
+
for (const workflow of allWorkflows) {
|
|
1871
|
+
progress.currentWorkflow = workflow.name;
|
|
1872
|
+
onProgress?.(progress);
|
|
1873
|
+
this.logger.debug({
|
|
1874
|
+
workflowId: workflow.id,
|
|
1875
|
+
workflowName: workflow.name,
|
|
1876
|
+
stepCount: workflow.steps.length,
|
|
1877
|
+
}, 'Executing workflow');
|
|
1878
|
+
try {
|
|
1879
|
+
const result = await executor.execute(workflow);
|
|
1880
|
+
results.push(result);
|
|
1881
|
+
this.logger.info({
|
|
1882
|
+
workflowId: workflow.id,
|
|
1883
|
+
success: result.success,
|
|
1884
|
+
durationMs: result.durationMs,
|
|
1885
|
+
}, 'Workflow execution complete');
|
|
1886
|
+
}
|
|
1887
|
+
catch (error) {
|
|
1888
|
+
this.logger.error({
|
|
1889
|
+
workflowId: workflow.id,
|
|
1890
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1891
|
+
}, 'Workflow execution failed');
|
|
1892
|
+
// Create a failed result
|
|
1893
|
+
results.push({
|
|
1894
|
+
workflow,
|
|
1895
|
+
steps: [],
|
|
1896
|
+
success: false,
|
|
1897
|
+
failureReason: error instanceof Error ? error.message : String(error),
|
|
1898
|
+
durationMs: 0,
|
|
1899
|
+
dataFlow: [],
|
|
1900
|
+
});
|
|
1901
|
+
}
|
|
1902
|
+
progress.workflowsCompleted = (progress.workflowsCompleted ?? 0) + 1;
|
|
1903
|
+
onProgress?.(progress);
|
|
1904
|
+
}
|
|
1905
|
+
}
|
|
1906
|
+
// Build summary
|
|
1907
|
+
const successfulCount = results.filter(r => r.success).length;
|
|
1908
|
+
const summary = {
|
|
1909
|
+
workflowCount: results.length,
|
|
1910
|
+
successfulCount,
|
|
1911
|
+
failedCount: results.length - successfulCount,
|
|
1912
|
+
discoveredCount,
|
|
1913
|
+
loadedCount,
|
|
1914
|
+
};
|
|
1915
|
+
this.logger.info({
|
|
1916
|
+
total: summary.workflowCount,
|
|
1917
|
+
successful: summary.successfulCount,
|
|
1918
|
+
failed: summary.failedCount,
|
|
1919
|
+
discovered: summary.discoveredCount,
|
|
1920
|
+
loaded: summary.loadedCount,
|
|
1921
|
+
}, 'Workflow execution summary');
|
|
1922
|
+
return { results, summary };
|
|
1923
|
+
}
|
|
1924
|
+
}
|
|
1925
|
+
/**
 * Tally assertion outcomes across a list of interactions.
 *
 * Interactions with a truthy `mocked` flag are ignored. The `assertionResults`
 * of the remaining interactions are pooled (a missing list counts as empty),
 * and each pooled entry is counted as passed when its `passed` field is truthy.
 *
 * @param interactions - Interactions to inspect.
 * @returns `{ total, passed, failed }`, or `undefined` when there is nothing to count.
 */
function summarizeAssertions(interactions) {
    const isNotMocked = (interaction) => !interaction.mocked;
    const assertionsOf = (interaction) => interaction.assertionResults ?? [];
    const pooled = interactions.filter(isNotMocked).flatMap(assertionsOf);
    const total = pooled.length;
    if (total === 0) {
        return undefined;
    }
    let passed = 0;
    for (const outcome of pooled) {
        if (outcome.passed) {
            passed += 1;
        }
    }
    return { total, passed, failed: total - passed };
}
|
|
1939
|
+
//# sourceMappingURL=interviewer.js.map
|