@dotsetlabs/bellwether 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (403) hide show
  1. package/CHANGELOG.md +291 -0
  2. package/LICENSE +21 -0
  3. package/README.md +739 -0
  4. package/dist/auth/credentials.d.ts +64 -0
  5. package/dist/auth/credentials.js +218 -0
  6. package/dist/auth/index.d.ts +6 -0
  7. package/dist/auth/index.js +6 -0
  8. package/dist/auth/keychain.d.ts +64 -0
  9. package/dist/auth/keychain.js +268 -0
  10. package/dist/baseline/ab-testing.d.ts +80 -0
  11. package/dist/baseline/ab-testing.js +236 -0
  12. package/dist/baseline/ai-compatibility-scorer.d.ts +95 -0
  13. package/dist/baseline/ai-compatibility-scorer.js +606 -0
  14. package/dist/baseline/calibration.d.ts +77 -0
  15. package/dist/baseline/calibration.js +136 -0
  16. package/dist/baseline/category-matching.d.ts +85 -0
  17. package/dist/baseline/category-matching.js +289 -0
  18. package/dist/baseline/change-impact-analyzer.d.ts +98 -0
  19. package/dist/baseline/change-impact-analyzer.js +592 -0
  20. package/dist/baseline/comparator.d.ts +64 -0
  21. package/dist/baseline/comparator.js +916 -0
  22. package/dist/baseline/confidence.d.ts +55 -0
  23. package/dist/baseline/confidence.js +122 -0
  24. package/dist/baseline/converter.d.ts +61 -0
  25. package/dist/baseline/converter.js +585 -0
  26. package/dist/baseline/dependency-analyzer.d.ts +89 -0
  27. package/dist/baseline/dependency-analyzer.js +567 -0
  28. package/dist/baseline/deprecation-tracker.d.ts +133 -0
  29. package/dist/baseline/deprecation-tracker.js +322 -0
  30. package/dist/baseline/diff.d.ts +55 -0
  31. package/dist/baseline/diff.js +1584 -0
  32. package/dist/baseline/documentation-scorer.d.ts +205 -0
  33. package/dist/baseline/documentation-scorer.js +466 -0
  34. package/dist/baseline/embeddings.d.ts +118 -0
  35. package/dist/baseline/embeddings.js +251 -0
  36. package/dist/baseline/error-analyzer.d.ts +198 -0
  37. package/dist/baseline/error-analyzer.js +721 -0
  38. package/dist/baseline/evaluation/evaluator.d.ts +42 -0
  39. package/dist/baseline/evaluation/evaluator.js +323 -0
  40. package/dist/baseline/evaluation/expanded-dataset.d.ts +45 -0
  41. package/dist/baseline/evaluation/expanded-dataset.js +1164 -0
  42. package/dist/baseline/evaluation/golden-dataset.d.ts +58 -0
  43. package/dist/baseline/evaluation/golden-dataset.js +717 -0
  44. package/dist/baseline/evaluation/index.d.ts +15 -0
  45. package/dist/baseline/evaluation/index.js +15 -0
  46. package/dist/baseline/evaluation/types.d.ts +186 -0
  47. package/dist/baseline/evaluation/types.js +8 -0
  48. package/dist/baseline/external-dependency-detector.d.ts +181 -0
  49. package/dist/baseline/external-dependency-detector.js +524 -0
  50. package/dist/baseline/golden-output.d.ts +162 -0
  51. package/dist/baseline/golden-output.js +636 -0
  52. package/dist/baseline/health-scorer.d.ts +174 -0
  53. package/dist/baseline/health-scorer.js +451 -0
  54. package/dist/baseline/incremental-checker.d.ts +97 -0
  55. package/dist/baseline/incremental-checker.js +174 -0
  56. package/dist/baseline/index.d.ts +31 -0
  57. package/dist/baseline/index.js +42 -0
  58. package/dist/baseline/migration-generator.d.ts +137 -0
  59. package/dist/baseline/migration-generator.js +554 -0
  60. package/dist/baseline/migrations.d.ts +60 -0
  61. package/dist/baseline/migrations.js +197 -0
  62. package/dist/baseline/performance-tracker.d.ts +214 -0
  63. package/dist/baseline/performance-tracker.js +577 -0
  64. package/dist/baseline/pr-comment-generator.d.ts +117 -0
  65. package/dist/baseline/pr-comment-generator.js +546 -0
  66. package/dist/baseline/response-fingerprint.d.ts +127 -0
  67. package/dist/baseline/response-fingerprint.js +728 -0
  68. package/dist/baseline/response-schema-tracker.d.ts +129 -0
  69. package/dist/baseline/response-schema-tracker.js +420 -0
  70. package/dist/baseline/risk-scorer.d.ts +54 -0
  71. package/dist/baseline/risk-scorer.js +434 -0
  72. package/dist/baseline/saver.d.ts +89 -0
  73. package/dist/baseline/saver.js +554 -0
  74. package/dist/baseline/scenario-generator.d.ts +151 -0
  75. package/dist/baseline/scenario-generator.js +905 -0
  76. package/dist/baseline/schema-compare.d.ts +86 -0
  77. package/dist/baseline/schema-compare.js +557 -0
  78. package/dist/baseline/schema-evolution.d.ts +189 -0
  79. package/dist/baseline/schema-evolution.js +467 -0
  80. package/dist/baseline/semantic.d.ts +203 -0
  81. package/dist/baseline/semantic.js +908 -0
  82. package/dist/baseline/synonyms.d.ts +60 -0
  83. package/dist/baseline/synonyms.js +386 -0
  84. package/dist/baseline/telemetry.d.ts +165 -0
  85. package/dist/baseline/telemetry.js +294 -0
  86. package/dist/baseline/test-pruner.d.ts +120 -0
  87. package/dist/baseline/test-pruner.js +387 -0
  88. package/dist/baseline/types.d.ts +449 -0
  89. package/dist/baseline/types.js +5 -0
  90. package/dist/baseline/version.d.ts +138 -0
  91. package/dist/baseline/version.js +206 -0
  92. package/dist/cache/index.d.ts +5 -0
  93. package/dist/cache/index.js +5 -0
  94. package/dist/cache/response-cache.d.ts +151 -0
  95. package/dist/cache/response-cache.js +287 -0
  96. package/dist/ci/index.d.ts +60 -0
  97. package/dist/ci/index.js +342 -0
  98. package/dist/cli/commands/auth.d.ts +12 -0
  99. package/dist/cli/commands/auth.js +352 -0
  100. package/dist/cli/commands/badge.d.ts +3 -0
  101. package/dist/cli/commands/badge.js +74 -0
  102. package/dist/cli/commands/baseline-accept.d.ts +15 -0
  103. package/dist/cli/commands/baseline-accept.js +178 -0
  104. package/dist/cli/commands/baseline-migrate.d.ts +12 -0
  105. package/dist/cli/commands/baseline-migrate.js +164 -0
  106. package/dist/cli/commands/baseline.d.ts +14 -0
  107. package/dist/cli/commands/baseline.js +449 -0
  108. package/dist/cli/commands/beta.d.ts +10 -0
  109. package/dist/cli/commands/beta.js +231 -0
  110. package/dist/cli/commands/check.d.ts +11 -0
  111. package/dist/cli/commands/check.js +820 -0
  112. package/dist/cli/commands/cloud/badge.d.ts +3 -0
  113. package/dist/cli/commands/cloud/badge.js +74 -0
  114. package/dist/cli/commands/cloud/diff.d.ts +6 -0
  115. package/dist/cli/commands/cloud/diff.js +79 -0
  116. package/dist/cli/commands/cloud/history.d.ts +6 -0
  117. package/dist/cli/commands/cloud/history.js +102 -0
  118. package/dist/cli/commands/cloud/link.d.ts +9 -0
  119. package/dist/cli/commands/cloud/link.js +119 -0
  120. package/dist/cli/commands/cloud/login.d.ts +7 -0
  121. package/dist/cli/commands/cloud/login.js +499 -0
  122. package/dist/cli/commands/cloud/projects.d.ts +6 -0
  123. package/dist/cli/commands/cloud/projects.js +44 -0
  124. package/dist/cli/commands/cloud/shared.d.ts +7 -0
  125. package/dist/cli/commands/cloud/shared.js +42 -0
  126. package/dist/cli/commands/cloud/teams.d.ts +8 -0
  127. package/dist/cli/commands/cloud/teams.js +169 -0
  128. package/dist/cli/commands/cloud/upload.d.ts +8 -0
  129. package/dist/cli/commands/cloud/upload.js +181 -0
  130. package/dist/cli/commands/contract.d.ts +11 -0
  131. package/dist/cli/commands/contract.js +280 -0
  132. package/dist/cli/commands/discover.d.ts +3 -0
  133. package/dist/cli/commands/discover.js +82 -0
  134. package/dist/cli/commands/eval.d.ts +9 -0
  135. package/dist/cli/commands/eval.js +187 -0
  136. package/dist/cli/commands/explore.d.ts +11 -0
  137. package/dist/cli/commands/explore.js +437 -0
  138. package/dist/cli/commands/feedback.d.ts +9 -0
  139. package/dist/cli/commands/feedback.js +174 -0
  140. package/dist/cli/commands/golden.d.ts +12 -0
  141. package/dist/cli/commands/golden.js +407 -0
  142. package/dist/cli/commands/history.d.ts +10 -0
  143. package/dist/cli/commands/history.js +202 -0
  144. package/dist/cli/commands/init.d.ts +9 -0
  145. package/dist/cli/commands/init.js +219 -0
  146. package/dist/cli/commands/interview.d.ts +3 -0
  147. package/dist/cli/commands/interview.js +903 -0
  148. package/dist/cli/commands/link.d.ts +10 -0
  149. package/dist/cli/commands/link.js +169 -0
  150. package/dist/cli/commands/login.d.ts +7 -0
  151. package/dist/cli/commands/login.js +499 -0
  152. package/dist/cli/commands/preset.d.ts +33 -0
  153. package/dist/cli/commands/preset.js +297 -0
  154. package/dist/cli/commands/profile.d.ts +33 -0
  155. package/dist/cli/commands/profile.js +286 -0
  156. package/dist/cli/commands/registry.d.ts +11 -0
  157. package/dist/cli/commands/registry.js +146 -0
  158. package/dist/cli/commands/shared.d.ts +79 -0
  159. package/dist/cli/commands/shared.js +196 -0
  160. package/dist/cli/commands/teams.d.ts +8 -0
  161. package/dist/cli/commands/teams.js +169 -0
  162. package/dist/cli/commands/test.d.ts +9 -0
  163. package/dist/cli/commands/test.js +500 -0
  164. package/dist/cli/commands/upload.d.ts +8 -0
  165. package/dist/cli/commands/upload.js +223 -0
  166. package/dist/cli/commands/validate-config.d.ts +6 -0
  167. package/dist/cli/commands/validate-config.js +35 -0
  168. package/dist/cli/commands/verify.d.ts +11 -0
  169. package/dist/cli/commands/verify.js +283 -0
  170. package/dist/cli/commands/watch.d.ts +12 -0
  171. package/dist/cli/commands/watch.js +253 -0
  172. package/dist/cli/index.d.ts +3 -0
  173. package/dist/cli/index.js +178 -0
  174. package/dist/cli/interactive.d.ts +47 -0
  175. package/dist/cli/interactive.js +216 -0
  176. package/dist/cli/output/terminal-reporter.d.ts +19 -0
  177. package/dist/cli/output/terminal-reporter.js +104 -0
  178. package/dist/cli/output.d.ts +226 -0
  179. package/dist/cli/output.js +438 -0
  180. package/dist/cli/utils/env.d.ts +5 -0
  181. package/dist/cli/utils/env.js +14 -0
  182. package/dist/cli/utils/progress.d.ts +59 -0
  183. package/dist/cli/utils/progress.js +206 -0
  184. package/dist/cli/utils/server-context.d.ts +10 -0
  185. package/dist/cli/utils/server-context.js +36 -0
  186. package/dist/cloud/auth.d.ts +144 -0
  187. package/dist/cloud/auth.js +374 -0
  188. package/dist/cloud/client.d.ts +24 -0
  189. package/dist/cloud/client.js +65 -0
  190. package/dist/cloud/http-client.d.ts +38 -0
  191. package/dist/cloud/http-client.js +215 -0
  192. package/dist/cloud/index.d.ts +23 -0
  193. package/dist/cloud/index.js +25 -0
  194. package/dist/cloud/mock-client.d.ts +107 -0
  195. package/dist/cloud/mock-client.js +545 -0
  196. package/dist/cloud/types.d.ts +515 -0
  197. package/dist/cloud/types.js +15 -0
  198. package/dist/config/defaults.d.ts +160 -0
  199. package/dist/config/defaults.js +169 -0
  200. package/dist/config/loader.d.ts +24 -0
  201. package/dist/config/loader.js +122 -0
  202. package/dist/config/template.d.ts +42 -0
  203. package/dist/config/template.js +647 -0
  204. package/dist/config/validator.d.ts +2112 -0
  205. package/dist/config/validator.js +658 -0
  206. package/dist/constants/cloud.d.ts +107 -0
  207. package/dist/constants/cloud.js +110 -0
  208. package/dist/constants/core.d.ts +521 -0
  209. package/dist/constants/core.js +556 -0
  210. package/dist/constants/testing.d.ts +1283 -0
  211. package/dist/constants/testing.js +1568 -0
  212. package/dist/constants.d.ts +10 -0
  213. package/dist/constants.js +10 -0
  214. package/dist/contract/index.d.ts +6 -0
  215. package/dist/contract/index.js +5 -0
  216. package/dist/contract/validator.d.ts +177 -0
  217. package/dist/contract/validator.js +574 -0
  218. package/dist/cost/index.d.ts +6 -0
  219. package/dist/cost/index.js +5 -0
  220. package/dist/cost/tracker.d.ts +134 -0
  221. package/dist/cost/tracker.js +313 -0
  222. package/dist/discovery/discovery.d.ts +16 -0
  223. package/dist/discovery/discovery.js +173 -0
  224. package/dist/discovery/types.d.ts +51 -0
  225. package/dist/discovery/types.js +2 -0
  226. package/dist/docs/agents.d.ts +3 -0
  227. package/dist/docs/agents.js +995 -0
  228. package/dist/docs/contract.d.ts +51 -0
  229. package/dist/docs/contract.js +1681 -0
  230. package/dist/docs/generator.d.ts +4 -0
  231. package/dist/docs/generator.js +4 -0
  232. package/dist/docs/html-reporter.d.ts +9 -0
  233. package/dist/docs/html-reporter.js +757 -0
  234. package/dist/docs/index.d.ts +10 -0
  235. package/dist/docs/index.js +11 -0
  236. package/dist/docs/junit-reporter.d.ts +18 -0
  237. package/dist/docs/junit-reporter.js +210 -0
  238. package/dist/docs/report.d.ts +14 -0
  239. package/dist/docs/report.js +44 -0
  240. package/dist/docs/sarif-reporter.d.ts +19 -0
  241. package/dist/docs/sarif-reporter.js +335 -0
  242. package/dist/docs/shared.d.ts +35 -0
  243. package/dist/docs/shared.js +162 -0
  244. package/dist/docs/templates.d.ts +12 -0
  245. package/dist/docs/templates.js +76 -0
  246. package/dist/errors/index.d.ts +6 -0
  247. package/dist/errors/index.js +6 -0
  248. package/dist/errors/retry.d.ts +92 -0
  249. package/dist/errors/retry.js +323 -0
  250. package/dist/errors/types.d.ts +321 -0
  251. package/dist/errors/types.js +584 -0
  252. package/dist/index.d.ts +32 -0
  253. package/dist/index.js +32 -0
  254. package/dist/interview/dependency-resolver.d.ts +11 -0
  255. package/dist/interview/dependency-resolver.js +32 -0
  256. package/dist/interview/interviewer.d.ts +232 -0
  257. package/dist/interview/interviewer.js +1939 -0
  258. package/dist/interview/mock-response-generator.d.ts +7 -0
  259. package/dist/interview/mock-response-generator.js +102 -0
  260. package/dist/interview/orchestrator.d.ts +237 -0
  261. package/dist/interview/orchestrator.js +1296 -0
  262. package/dist/interview/rate-limiter.d.ts +15 -0
  263. package/dist/interview/rate-limiter.js +55 -0
  264. package/dist/interview/response-validator.d.ts +10 -0
  265. package/dist/interview/response-validator.js +132 -0
  266. package/dist/interview/schema-inferrer.d.ts +8 -0
  267. package/dist/interview/schema-inferrer.js +71 -0
  268. package/dist/interview/schema-test-generator.d.ts +71 -0
  269. package/dist/interview/schema-test-generator.js +834 -0
  270. package/dist/interview/smart-value-generator.d.ts +155 -0
  271. package/dist/interview/smart-value-generator.js +554 -0
  272. package/dist/interview/stateful-test-runner.d.ts +19 -0
  273. package/dist/interview/stateful-test-runner.js +106 -0
  274. package/dist/interview/types.d.ts +561 -0
  275. package/dist/interview/types.js +2 -0
  276. package/dist/llm/anthropic.d.ts +41 -0
  277. package/dist/llm/anthropic.js +355 -0
  278. package/dist/llm/client.d.ts +123 -0
  279. package/dist/llm/client.js +42 -0
  280. package/dist/llm/factory.d.ts +38 -0
  281. package/dist/llm/factory.js +145 -0
  282. package/dist/llm/fallback.d.ts +140 -0
  283. package/dist/llm/fallback.js +379 -0
  284. package/dist/llm/index.d.ts +18 -0
  285. package/dist/llm/index.js +15 -0
  286. package/dist/llm/ollama.d.ts +37 -0
  287. package/dist/llm/ollama.js +330 -0
  288. package/dist/llm/openai.d.ts +25 -0
  289. package/dist/llm/openai.js +320 -0
  290. package/dist/llm/token-budget.d.ts +161 -0
  291. package/dist/llm/token-budget.js +395 -0
  292. package/dist/logging/logger.d.ts +70 -0
  293. package/dist/logging/logger.js +130 -0
  294. package/dist/metrics/collector.d.ts +106 -0
  295. package/dist/metrics/collector.js +547 -0
  296. package/dist/metrics/index.d.ts +7 -0
  297. package/dist/metrics/index.js +7 -0
  298. package/dist/metrics/prometheus.d.ts +20 -0
  299. package/dist/metrics/prometheus.js +241 -0
  300. package/dist/metrics/types.d.ts +209 -0
  301. package/dist/metrics/types.js +5 -0
  302. package/dist/persona/builtins.d.ts +54 -0
  303. package/dist/persona/builtins.js +219 -0
  304. package/dist/persona/index.d.ts +8 -0
  305. package/dist/persona/index.js +8 -0
  306. package/dist/persona/loader.d.ts +30 -0
  307. package/dist/persona/loader.js +190 -0
  308. package/dist/persona/types.d.ts +144 -0
  309. package/dist/persona/types.js +5 -0
  310. package/dist/persona/validation.d.ts +94 -0
  311. package/dist/persona/validation.js +332 -0
  312. package/dist/prompts/index.d.ts +5 -0
  313. package/dist/prompts/index.js +5 -0
  314. package/dist/prompts/templates.d.ts +180 -0
  315. package/dist/prompts/templates.js +431 -0
  316. package/dist/registry/client.d.ts +49 -0
  317. package/dist/registry/client.js +191 -0
  318. package/dist/registry/index.d.ts +7 -0
  319. package/dist/registry/index.js +6 -0
  320. package/dist/registry/types.d.ts +140 -0
  321. package/dist/registry/types.js +6 -0
  322. package/dist/scenarios/evaluator.d.ts +43 -0
  323. package/dist/scenarios/evaluator.js +206 -0
  324. package/dist/scenarios/index.d.ts +10 -0
  325. package/dist/scenarios/index.js +9 -0
  326. package/dist/scenarios/loader.d.ts +20 -0
  327. package/dist/scenarios/loader.js +285 -0
  328. package/dist/scenarios/types.d.ts +153 -0
  329. package/dist/scenarios/types.js +8 -0
  330. package/dist/security/index.d.ts +17 -0
  331. package/dist/security/index.js +18 -0
  332. package/dist/security/payloads.d.ts +61 -0
  333. package/dist/security/payloads.js +268 -0
  334. package/dist/security/security-tester.d.ts +42 -0
  335. package/dist/security/security-tester.js +582 -0
  336. package/dist/security/types.d.ts +166 -0
  337. package/dist/security/types.js +8 -0
  338. package/dist/transport/base-transport.d.ts +59 -0
  339. package/dist/transport/base-transport.js +38 -0
  340. package/dist/transport/http-transport.d.ts +67 -0
  341. package/dist/transport/http-transport.js +238 -0
  342. package/dist/transport/mcp-client.d.ts +141 -0
  343. package/dist/transport/mcp-client.js +496 -0
  344. package/dist/transport/sse-transport.d.ts +88 -0
  345. package/dist/transport/sse-transport.js +316 -0
  346. package/dist/transport/stdio-transport.d.ts +43 -0
  347. package/dist/transport/stdio-transport.js +238 -0
  348. package/dist/transport/types.d.ts +125 -0
  349. package/dist/transport/types.js +16 -0
  350. package/dist/utils/concurrency.d.ts +123 -0
  351. package/dist/utils/concurrency.js +213 -0
  352. package/dist/utils/formatters.d.ts +16 -0
  353. package/dist/utils/formatters.js +37 -0
  354. package/dist/utils/index.d.ts +8 -0
  355. package/dist/utils/index.js +8 -0
  356. package/dist/utils/jsonpath.d.ts +87 -0
  357. package/dist/utils/jsonpath.js +326 -0
  358. package/dist/utils/markdown.d.ts +113 -0
  359. package/dist/utils/markdown.js +265 -0
  360. package/dist/utils/network.d.ts +14 -0
  361. package/dist/utils/network.js +17 -0
  362. package/dist/utils/sanitize.d.ts +92 -0
  363. package/dist/utils/sanitize.js +191 -0
  364. package/dist/utils/semantic.d.ts +194 -0
  365. package/dist/utils/semantic.js +1051 -0
  366. package/dist/utils/smart-truncate.d.ts +94 -0
  367. package/dist/utils/smart-truncate.js +361 -0
  368. package/dist/utils/timeout.d.ts +153 -0
  369. package/dist/utils/timeout.js +205 -0
  370. package/dist/utils/yaml-parser.d.ts +58 -0
  371. package/dist/utils/yaml-parser.js +86 -0
  372. package/dist/validation/index.d.ts +32 -0
  373. package/dist/validation/index.js +32 -0
  374. package/dist/validation/semantic-test-generator.d.ts +50 -0
  375. package/dist/validation/semantic-test-generator.js +176 -0
  376. package/dist/validation/semantic-types.d.ts +66 -0
  377. package/dist/validation/semantic-types.js +94 -0
  378. package/dist/validation/semantic-validator.d.ts +38 -0
  379. package/dist/validation/semantic-validator.js +340 -0
  380. package/dist/verification/index.d.ts +6 -0
  381. package/dist/verification/index.js +5 -0
  382. package/dist/verification/types.d.ts +133 -0
  383. package/dist/verification/types.js +5 -0
  384. package/dist/verification/verifier.d.ts +30 -0
  385. package/dist/verification/verifier.js +309 -0
  386. package/dist/version.d.ts +19 -0
  387. package/dist/version.js +48 -0
  388. package/dist/workflow/auto-generator.d.ts +27 -0
  389. package/dist/workflow/auto-generator.js +513 -0
  390. package/dist/workflow/discovery.d.ts +40 -0
  391. package/dist/workflow/discovery.js +195 -0
  392. package/dist/workflow/executor.d.ts +82 -0
  393. package/dist/workflow/executor.js +611 -0
  394. package/dist/workflow/index.d.ts +10 -0
  395. package/dist/workflow/index.js +10 -0
  396. package/dist/workflow/loader.d.ts +24 -0
  397. package/dist/workflow/loader.js +194 -0
  398. package/dist/workflow/state-tracker.d.ts +98 -0
  399. package/dist/workflow/state-tracker.js +424 -0
  400. package/dist/workflow/types.d.ts +337 -0
  401. package/dist/workflow/types.js +5 -0
  402. package/package.json +94 -0
  403. package/schemas/bellwether-check.schema.json +651 -0
@@ -0,0 +1,1939 @@
1
+ import { Orchestrator } from './orchestrator.js';
2
+ import { categorizeErrorSource, detectExternalServiceFromTool, getExternalServiceStatus, } from '../baseline/external-dependency-detector.js';
3
+ import { DEFAULT_PERSONA } from '../persona/builtins.js';
4
+ import { getLogger, startTiming } from '../logging/logger.js';
5
+ import { evaluateAssertions } from '../scenarios/evaluator.js';
6
+ import { withTimeout, DEFAULT_TIMEOUTS, parallelLimit, createMutex } from '../utils/index.js';
7
+ import { INTERVIEW, WORKFLOW, DISPLAY_LIMITS, SCHEMA_TESTING, OUTCOME_ASSESSMENT } from '../constants.js';
8
+ import { generateSchemaTests } from './schema-test-generator.js';
9
+ import { WorkflowDiscoverer } from '../workflow/discovery.js';
10
+ import { WorkflowExecutor } from '../workflow/executor.js';
11
+ import { RateLimiter, calculateBackoffMs, isRateLimitError } from './rate-limiter.js';
12
+ import { inferResponseSchema } from './schema-inferrer.js';
13
+ import { validateResponseAssertions } from './response-validator.js';
14
+ import { StatefulTestRunner } from './stateful-test-runner.js';
15
+ import { resolveToolDependencies, getDependencyOrder } from './dependency-resolver.js';
16
+ import { generateMockResponse } from './mock-response-generator.js';
/**
 * Baseline interview settings.
 * The Interviewer constructor spreads caller-supplied config over this object,
 * so any key the caller provides replaces the value listed here.
 */
export const DEFAULT_CONFIG = {
    // Per-tool question cap, taken from the shared INTERVIEW constants.
    maxQuestionsPerTool: INTERVIEW.MAX_QUESTIONS_PER_TOOL,
    // Tool timeout, taken from the shared INTERVIEW constants.
    timeout: INTERVIEW.TOOL_TIMEOUT,
    // Error-path tests are not skipped unless the caller opts out.
    skipErrorTests: false,
};
/**
 * Personas used when the caller supplies none (or an empty list).
 * Only the single built-in default persona is included, keeping the default
 * run fast and cost-effective.
 * Use --security or --personas to add more personas.
 */
export const DEFAULT_PERSONAS = [DEFAULT_PERSONA];
31
+ /**
32
+ * Interviewer conducts the interview process using the orchestrator.
33
+ * Supports streaming output for real-time feedback during LLM operations.
34
+ * Supports parallel persona execution for improved performance.
35
+ * Supports caching tool responses and LLM analysis for efficiency.
36
+ *
37
+ * Two modes of operation:
38
+ * - Check mode: No LLM required, uses fallback questions and simple analysis
39
+ * - Explore mode: LLM required for question generation and behavioral analysis
40
+ */
41
+ export class Interviewer {
42
+ llm;
43
+ config;
44
+ personas;
45
+ logger = getLogger('interviewer');
46
+ serverContext;
47
+ cache;
48
+ rateLimiter;
49
+ responseSchemas = new Map();
50
+ rateLimitEvents = new Map();
51
+ rateLimitRetries = 0;
52
+ externalServiceStatuses = new Map();
53
+ skippedTools = new Set();
54
+ mockedTools = new Set();
55
+ constructor(llm, config) {
56
+ this.llm = llm;
57
+ this.config = { ...DEFAULT_CONFIG, ...config };
58
+ // Validate: if no LLM provided, must be in check mode
59
+ if (!llm && !this.config.checkMode) {
60
+ throw new Error('LLM client is required for explore mode. Use checkMode: true for check mode.');
61
+ }
62
+ // Use multiple personas by default for better coverage
63
+ // Fall back to DEFAULT_PERSONAS if no personas provided or empty array
64
+ const providedPersonas = config?.personas;
65
+ this.personas = (providedPersonas && providedPersonas.length > 0) ? providedPersonas : DEFAULT_PERSONAS;
66
+ // Store cache reference for tool response and analysis caching
67
+ this.cache = config?.cache;
68
+ if (this.config.rateLimit?.enabled) {
69
+ this.rateLimiter = new RateLimiter(this.config.rateLimit);
70
+ }
71
+ }
72
+ /**
73
+ * Create an orchestrator with streaming and caching enabled if configured.
74
+ * Throws an error if called in check mode since orchestrator requires LLM.
75
+ */
76
+ createOrchestrator(persona) {
77
+ if (!this.llm) {
78
+ throw new Error('Cannot create orchestrator in check mode - LLM client is required');
79
+ }
80
+ const orchestrator = new Orchestrator(this.llm, persona, this.serverContext, this.cache);
81
+ // Enable streaming if configured
82
+ if (this.config.enableStreaming && this.config.streamingCallbacks) {
83
+ orchestrator.enableStreaming(this.config.streamingCallbacks);
84
+ }
85
+ return orchestrator;
86
+ }
87
+ /**
88
+ * Generate simple analysis for check/fast mode.
89
+ * Avoids LLM calls by providing basic success/error messages.
90
+ */
91
+ generateSimpleAnalysis(error, hasResponse, successMessage) {
92
+ if (error) {
93
+ return `Error: ${error}`;
94
+ }
95
+ if (hasResponse) {
96
+ return successMessage;
97
+ }
98
+ return 'No response received.';
99
+ }
100
+ /**
101
+ * Assess whether the tool interaction outcome matched expectations.
102
+ */
103
+ assessOutcome(question, response, error) {
104
+ const expected = this.inferExpectedOutcome(question);
105
+ const actual = error || response?.isError ? 'error' : 'success';
106
+ const correct = expected === 'either' || expected === actual;
107
+ const isValidationSuccess = expected === 'error' && actual === 'error';
108
+ return {
109
+ expected,
110
+ actual,
111
+ correct,
112
+ isValidationSuccess,
113
+ };
114
+ }
115
+ /**
116
+ * Infer expected outcome when not explicitly provided.
117
+ */
118
+ inferExpectedOutcome(question) {
119
+ if (question.expectedOutcome)
120
+ return question.expectedOutcome;
121
+ if (OUTCOME_ASSESSMENT.EXPECTS_ERROR_CATEGORIES.includes(question.category)) {
122
+ return 'error';
123
+ }
124
+ if (OUTCOME_ASSESSMENT.EXPECTS_SUCCESS_CATEGORIES.includes(question.category)) {
125
+ return 'success';
126
+ }
127
+ if (OUTCOME_ASSESSMENT.EITHER_OUTCOME_CATEGORIES.includes(question.category)) {
128
+ return 'either';
129
+ }
130
+ if (OUTCOME_ASSESSMENT.EXPECTS_ERROR_PATTERNS.some((pattern) => pattern.test(question.description))) {
131
+ return 'error';
132
+ }
133
+ return 'success';
134
+ }
135
+ extractErrorMessage(response, error) {
136
+ if (error)
137
+ return error;
138
+ const errorContent = response?.content?.find((c) => c.type === 'text');
139
+ if (errorContent && 'text' in errorContent) {
140
+ return String(errorContent.text);
141
+ }
142
+ return null;
143
+ }
144
+ resolveExternalServiceDecision(tool) {
145
+ const externalConfig = this.config.externalServices;
146
+ if (!externalConfig) {
147
+ return { action: 'allow' };
148
+ }
149
+ const detected = detectExternalServiceFromTool(tool.name, tool.description);
150
+ if (!detected) {
151
+ return { action: 'allow' };
152
+ }
153
+ const status = getExternalServiceStatus(detected.serviceName, externalConfig);
154
+ this.externalServiceStatuses.set(detected.serviceName, status);
155
+ if (status.configured) {
156
+ return { action: 'allow', serviceName: detected.serviceName };
157
+ }
158
+ const missing = status.missingCredentials.length > 0
159
+ ? `Missing: ${status.missingCredentials.join(', ')}`
160
+ : 'Service not configured';
161
+ if (externalConfig.mode === 'fail') {
162
+ throw new Error(`External service "${detected.displayName}" is not configured. ${missing}`);
163
+ }
164
+ if (externalConfig.mode === 'mock' && status.mockAvailable) {
165
+ return {
166
+ action: 'mock',
167
+ serviceName: detected.serviceName,
168
+ reason: missing,
169
+ };
170
+ }
171
+ return {
172
+ action: 'skip',
173
+ serviceName: detected.serviceName,
174
+ reason: missing,
175
+ };
176
+ }
177
+ recordRateLimitEvent(toolName) {
178
+ const current = this.rateLimitEvents.get(toolName) ?? 0;
179
+ this.rateLimitEvents.set(toolName, current + 1);
180
+ }
181
+ async callToolWithPolicies(client, tool, args, decisionOverride) {
182
+ const decision = decisionOverride ?? this.resolveExternalServiceDecision(tool);
183
+ if (decision.action === 'skip') {
184
+ this.skippedTools.add(tool.name);
185
+ return {
186
+ response: null,
187
+ error: null,
188
+ skipped: true,
189
+ skipReason: decision.reason,
190
+ toolExecutionMs: 0,
191
+ };
192
+ }
193
+ if (decision.action === 'mock') {
194
+ if (decision.serviceName) {
195
+ this.mockedTools.add(tool.name);
196
+ return {
197
+ response: generateMockResponse(tool, decision.serviceName),
198
+ error: null,
199
+ mocked: true,
200
+ mockService: decision.serviceName,
201
+ toolExecutionMs: 0,
202
+ };
203
+ }
204
+ this.skippedTools.add(tool.name);
205
+ return {
206
+ response: null,
207
+ error: null,
208
+ skipped: true,
209
+ skipReason: 'Mock response unavailable',
210
+ toolExecutionMs: 0,
211
+ };
212
+ }
213
+ const rateLimitEnabled = this.config.rateLimit?.enabled ?? false;
214
+ let attempts = 0;
215
+ let lastError = null;
216
+ let toolExecutionMs = 0;
217
+ while (attempts <= (this.config.rateLimit?.maxRetries ?? 0)) {
218
+ if (this.rateLimiter) {
219
+ await this.rateLimiter.acquire();
220
+ }
221
+ const toolCallStart = Date.now();
222
+ try {
223
+ const response = await client.callTool(tool.name, args);
224
+ toolExecutionMs = Date.now() - toolCallStart;
225
+ const errorMessage = response.isError ? this.extractErrorMessage(response, null) : null;
226
+ if (rateLimitEnabled && response.isError && isRateLimitError(errorMessage)) {
227
+ this.recordRateLimitEvent(tool.name);
228
+ this.rateLimitRetries += 1;
229
+ attempts += 1;
230
+ const backoff = calculateBackoffMs(attempts, this.config.rateLimit?.backoffStrategy ?? 'exponential');
231
+ await new Promise((resolve) => setTimeout(resolve, backoff));
232
+ lastError = errorMessage ?? 'Rate limit exceeded';
233
+ continue;
234
+ }
235
+ return { response, error: errorMessage, toolExecutionMs };
236
+ }
237
+ catch (error) {
238
+ toolExecutionMs = Date.now() - toolCallStart;
239
+ const message = error instanceof Error ? error.message : String(error);
240
+ if (rateLimitEnabled && isRateLimitError(message)) {
241
+ this.recordRateLimitEvent(tool.name);
242
+ this.rateLimitRetries += 1;
243
+ attempts += 1;
244
+ const backoff = calculateBackoffMs(attempts, this.config.rateLimit?.backoffStrategy ?? 'exponential');
245
+ await new Promise((resolve) => setTimeout(resolve, backoff));
246
+ lastError = message;
247
+ continue;
248
+ }
249
+ return { response: null, error: message, toolExecutionMs };
250
+ }
251
+ }
252
+ return { response: null, error: lastError ?? 'Rate limit exceeded', toolExecutionMs };
253
+ }
254
+ /**
255
+ * Check if we're in fast/check mode (no LLM calls).
256
+ */
257
+ isCheckMode() {
258
+ return this.config.customScenariosOnly || this.config.checkMode || false;
259
+ }
260
+ /**
261
+ * Extract server context by probing discovery tools.
262
+ * Looks for tools like list_allowed_directories to understand constraints.
263
+ */
264
+ async extractServerContext(client, discovery) {
265
+ const context = {
266
+ allowedDirectories: [],
267
+ allowedHosts: [],
268
+ constraints: [],
269
+ hints: [],
270
+ };
271
+ // Look for tools that reveal server constraints
272
+ for (const toolName of INTERVIEW.CONSTRAINT_DISCOVERY_TOOLS) {
273
+ const tool = discovery.tools.find(t => t.name === toolName);
274
+ if (tool) {
275
+ try {
276
+ const result = await client.callTool(toolName, {});
277
+ if (result?.content) {
278
+ const textContent = result.content.find(c => c.type === 'text');
279
+ if (textContent && 'text' in textContent) {
280
+ const text = String(textContent.text);
281
+ // Parse allowed directories from response
282
+ const dirs = this.parseAllowedDirectories(text);
283
+ if (dirs.length > 0) {
284
+ context.allowedDirectories = dirs;
285
+ this.logger.info({ dirs }, 'Extracted allowed directories from server');
286
+ }
287
+ }
288
+ }
289
+ }
290
+ catch (error) {
291
+ this.logger.debug({
292
+ toolName,
293
+ error: error instanceof Error ? error.message : String(error),
294
+ }, 'Tool probe failed during context extraction');
295
+ }
296
+ }
297
+ }
298
+ // Extract hints and hosts from tool descriptions
299
+ for (const tool of discovery.tools) {
300
+ if (tool.description) {
301
+ const desc = tool.description.toLowerCase();
302
+ // Look for path restrictions mentioned in descriptions
303
+ if (desc.includes('allowed director') || desc.includes('within allowed')) {
304
+ context.hints?.push(`${tool.name}: operates within allowed directories only`);
305
+ }
306
+ if (desc.includes('only works within')) {
307
+ const match = tool.description.match(/only works within (.+?)(?:\.|$)/i);
308
+ if (match) {
309
+ context.hints?.push(`${tool.name}: ${match[0]}`);
310
+ }
311
+ }
312
+ // Extract allowed hosts/URLs from descriptions
313
+ const urlMatch = tool.description.match(/https?:\/\/[^\s"'<>]+/gi);
314
+ if (urlMatch) {
315
+ for (const url of urlMatch) {
316
+ try {
317
+ const parsed = new URL(url);
318
+ const baseUrl = `${parsed.protocol}//${parsed.host}`;
319
+ if (!context.allowedHosts?.includes(baseUrl)) {
320
+ context.allowedHosts?.push(baseUrl);
321
+ }
322
+ }
323
+ catch {
324
+ // Invalid URL, skip
325
+ }
326
+ }
327
+ }
328
+ }
329
+ }
330
+ // If we didn't find explicit directories but have hints, try to infer from CLI args
331
+ // This will be populated by the interview command based on server args
332
+ if (context.allowedDirectories?.length === 0) {
333
+ // Default fallback - will be overridden if server args specify directories
334
+ context.constraints?.push('Server may have directory restrictions - watch for access denied errors');
335
+ }
336
+ return context;
337
+ }
338
+ /**
339
+ * Parse allowed directories from tool response text.
340
+ */
341
+ parseAllowedDirectories(text) {
342
+ const dirs = [];
343
+ // Try to parse as JSON array
344
+ try {
345
+ const parsed = JSON.parse(text);
346
+ if (Array.isArray(parsed)) {
347
+ return parsed.filter(d => typeof d === 'string' && d.startsWith('/'));
348
+ }
349
+ }
350
+ catch (error) {
351
+ this.logger.debug({
352
+ error: error instanceof Error ? error.message : String(error),
353
+ textPreview: text.substring(0, 100),
354
+ }, 'Directory list not JSON, trying line-by-line parsing');
355
+ }
356
+ // Parse line by line looking for paths
357
+ const lines = text.split('\n');
358
+ for (const line of lines) {
359
+ const trimmed = line.trim();
360
+ // Match absolute paths
361
+ if (trimmed.startsWith('/') && !trimmed.includes(' ')) {
362
+ dirs.push(trimmed);
363
+ }
364
+ // Match "Allowed: /path" format
365
+ const match = trimmed.match(/allowed[:\s]+(.+)/i);
366
+ if (match) {
367
+ const path = match[1].trim();
368
+ if (path.startsWith('/')) {
369
+ dirs.push(path);
370
+ }
371
+ }
372
+ }
373
+ return [...new Set(dirs)]; // Dedupe
374
+ }
375
+ /**
376
+ * Set server context directly (e.g., from CLI arguments).
377
+ */
378
+ setServerContext(context) {
379
+ this.serverContext = context;
380
+ }
381
+ /**
382
+ * Run a complete interview on a connected MCP server.
383
+ * Supports multiple personas - runs each persona's interview and aggregates findings.
384
+ */
385
+ async interview(client, discovery, onProgress) {
386
+ const done = startTiming(this.logger, 'interview');
387
+ const startTime = new Date();
388
+ this.logger.info({
389
+ serverName: discovery.serverInfo.name,
390
+ toolCount: discovery.tools.length,
391
+ personaCount: this.personas.length,
392
+ }, 'Starting interview');
393
+ // Extract server context if not already set
394
+ if (!this.serverContext) {
395
+ this.serverContext = await this.extractServerContext(client, discovery);
396
+ }
397
+ // Track stats per persona
398
+ const personaStats = new Map();
399
+ for (const persona of this.personas) {
400
+ personaStats.set(persona.id, {
401
+ id: persona.id,
402
+ name: persona.name,
403
+ questionsAsked: 0,
404
+ toolCallCount: 0,
405
+ errorCount: 0,
406
+ });
407
+ }
408
+ const progress = {
409
+ phase: 'starting',
410
+ personasCompleted: 0,
411
+ totalPersonas: this.personas.length,
412
+ toolsCompleted: 0,
413
+ totalTools: discovery.tools.length,
414
+ questionsAsked: 0,
415
+ promptsCompleted: 0,
416
+ totalPrompts: discovery.prompts.length,
417
+ resourcesCompleted: 0,
418
+ totalResources: (discovery.resources ?? []).length,
419
+ };
420
+ onProgress?.(progress);
421
+ // Aggregate interactions by tool across all personas
422
+ const toolInteractionsMap = new Map();
423
+ // Initialize map for each tool
424
+ for (const tool of discovery.tools) {
425
+ toolInteractionsMap.set(tool.name, {
426
+ interactions: [],
427
+ findingsByPersona: [],
428
+ });
429
+ }
430
+ // Track all scenario results
431
+ let allScenarioResults = [];
432
+ let checkModeResult = null;
433
+ // Interview with each persona
434
+ progress.phase = 'interviewing';
435
+ // Check if parallel execution is enabled
436
+ const useParallel = this.config.parallelPersonas && this.personas.length > 1;
437
+ if (useParallel) {
438
+ // Parallel persona execution
439
+ const concurrency = this.config.personaConcurrency ?? INTERVIEW.DEFAULT_PERSONA_CONCURRENCY;
440
+ const toolCallMutex = createMutex(); // Shared mutex for serializing MCP tool calls
441
+ this.logger.info({
442
+ personaCount: this.personas.length,
443
+ concurrency,
444
+ }, 'Running persona interviews in parallel');
445
+ // Create tasks for each persona
446
+ const personaTasks = this.personas.map(persona => async () => {
447
+ progress.currentPersona = persona.name;
448
+ onProgress?.(progress);
449
+ const result = await this.interviewPersona(client, discovery, persona, toolCallMutex);
450
+ progress.personasCompleted++;
451
+ progress.questionsAsked += result.stats.questionsAsked;
452
+ onProgress?.(progress);
453
+ return result;
454
+ });
455
+ // Execute personas in parallel with concurrency limit
456
+ const parallelResults = await parallelLimit(personaTasks, { concurrency });
457
+ // Check for errors
458
+ if (!parallelResults.allSucceeded) {
459
+ for (const [index, error] of parallelResults.errors) {
460
+ this.logger.error({
461
+ persona: this.personas[index]?.name,
462
+ error: error.message,
463
+ }, 'Persona interview failed');
464
+ }
465
+ }
466
+ // Aggregate results
467
+ const successfulResults = parallelResults.results.filter((r) => r !== undefined);
468
+ const aggregated = this.aggregateParallelResults(successfulResults, discovery);
469
+ // Update tracking maps
470
+ for (const [toolName, data] of aggregated.toolInteractionsMap) {
471
+ const existing = toolInteractionsMap.get(toolName);
472
+ if (existing) {
473
+ existing.interactions = data.interactions;
474
+ existing.findingsByPersona = data.findingsByPersona;
475
+ }
476
+ }
477
+ // Update persona stats
478
+ for (const [personaId, stats] of aggregated.personaStats) {
479
+ personaStats.set(personaId, stats);
480
+ }
481
+ allScenarioResults = aggregated.allScenarioResults;
482
+ }
483
+ else if (this.config.checkMode) {
484
+ // Check mode tool testing (parallel or sequential based on config)
485
+ // This path doesn't require an LLM - uses fallback questions and simple analysis
486
+ const statefulConfig = this.config.statefulTesting;
487
+ const statefulEnabled = statefulConfig?.enabled ?? false;
488
+ const dependencies = statefulEnabled ? resolveToolDependencies(discovery.tools) : [];
489
+ const dependencyMap = new Map(dependencies.map((d) => [d.tool, d]));
490
+ const toolMap = new Map(discovery.tools.map((tool) => [tool.name, tool]));
491
+ const orderedTools = statefulEnabled
492
+ ? getDependencyOrder(dependencies)
493
+ .map((name) => toolMap.get(name))
494
+ .filter((tool) => !!tool)
495
+ : discovery.tools;
496
+ const effectiveConcurrency = statefulEnabled
497
+ ? 1
498
+ : this.config.parallelTools
499
+ ? (this.config.toolConcurrency ?? INTERVIEW.DEFAULT_TOOL_CONCURRENCY)
500
+ : 1; // Sequential when parallelTools is disabled
501
+ if (statefulEnabled) {
502
+ this.logger.info({ toolCount: orderedTools.length }, 'Stateful testing enabled');
503
+ }
504
+ this.logger.info({ parallel: this.config.parallelTools && !statefulEnabled, concurrency: effectiveConcurrency }, 'Using check mode tool testing');
505
+ const statefulRunner = statefulEnabled
506
+ ? new StatefulTestRunner({ shareOutputs: statefulConfig?.shareOutputsBetweenTools ?? true })
507
+ : undefined;
508
+ const parallelResult = await this.interviewToolsInParallel(client, orderedTools, progress, onProgress, {
509
+ statefulRunner,
510
+ dependencyMap,
511
+ statefulConfig,
512
+ });
513
+ checkModeResult = parallelResult;
514
+ // Update tool interactions map with parallel results
515
+ for (const profile of parallelResult.toolProfiles) {
516
+ const toolData = toolInteractionsMap.get(profile.name);
517
+ if (toolData) {
518
+ toolData.interactions = profile.interactions;
519
+ toolData.findingsByPersona = [{
520
+ personaId: 'check_mode',
521
+ personaName: 'Check Mode',
522
+ behavioralNotes: [],
523
+ limitations: [],
524
+ securityNotes: [],
525
+ }];
526
+ }
527
+ }
528
+ // Update persona stats with aggregated counts
529
+ const checkModeStats = personaStats.get(this.personas[0].id);
530
+ if (checkModeStats) {
531
+ checkModeStats.questionsAsked = parallelResult.totalQuestionsAsked;
532
+ checkModeStats.toolCallCount = parallelResult.totalToolCallCount;
533
+ checkModeStats.errorCount = parallelResult.totalErrorCount;
534
+ }
535
+ allScenarioResults = parallelResult.scenarioResults;
536
+ }
537
+ else {
538
+ // Sequential persona execution (original behavior)
539
+ for (const persona of this.personas) {
540
+ progress.currentPersona = persona.name;
541
+ onProgress?.(progress);
542
+ // Create orchestrator with server context and streaming if enabled
543
+ const orchestrator = this.createOrchestrator(persona);
544
+ const stats = personaStats.get(persona.id);
545
+ // Interview each tool with this persona
546
+ for (const tool of discovery.tools) {
547
+ progress.currentTool = tool.name;
548
+ onProgress?.(progress);
549
+ const personaInteractions = [];
550
+ const previousErrors = [];
551
+ // Check for custom scenarios for this tool
552
+ const customScenarios = this.getScenariosForTool(tool.name);
553
+ // If customScenariosOnly and we have scenarios, skip LLM generation
554
+ let questions = [];
555
+ if (customScenarios.length > 0) {
556
+ // Execute custom scenarios
557
+ const scenarioResults = await this.executeToolScenarios(client, tool.name, customScenarios);
558
+ allScenarioResults.push(...scenarioResults);
559
+ // Convert scenarios to interview questions for integration with profiling
560
+ questions = customScenarios.map(s => this.scenarioToQuestion(s));
561
+ // If not custom-only mode, also generate LLM questions (skip in fast CI mode)
562
+ if (!this.config.customScenariosOnly && !this.config.checkMode) {
563
+ const llmQuestions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
564
+ questions = [...questions, ...llmQuestions];
565
+ }
566
+ }
567
+ else if (!this.config.customScenariosOnly) {
568
+ // No custom scenarios - generate questions
569
+ if (this.config.checkMode) {
570
+ // Fast CI mode: use fallback questions (no LLM call)
571
+ questions = orchestrator.getFallbackQuestions(tool, this.config.skipErrorTests)
572
+ .slice(0, this.config.maxQuestionsPerTool);
573
+ }
574
+ else {
575
+ // Normal mode: generate LLM questions
576
+ questions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
577
+ }
578
+ }
579
+ // If customScenariosOnly and no scenarios for this tool, skip it
580
+ // Ask each question with retry logic
581
+ for (const question of questions) {
582
+ const { interaction, hadError } = await this.executeWithRetry(client, tool, question, orchestrator, persona.id, stats);
583
+ personaInteractions.push(interaction);
584
+ // Track errors for learning
585
+ if (hadError && interaction.error) {
586
+ previousErrors.push({
587
+ args: question.args,
588
+ error: interaction.error,
589
+ });
590
+ // If we have multiple failures, regenerate remaining questions with error context
591
+ // Skip in scenarios-only mode and fast CI mode
592
+ if (!this.config.customScenariosOnly && !this.config.checkMode &&
593
+ previousErrors.length >= 2 && personaInteractions.length < questions.length) {
594
+ const remaining = this.config.maxQuestionsPerTool - personaInteractions.length;
595
+ if (remaining > 0) {
596
+ this.logger.debug({ tool: tool.name, errors: previousErrors.length }, 'Regenerating questions after errors');
597
+ const newQuestions = await orchestrator.generateQuestions(tool, remaining, this.config.skipErrorTests, previousErrors);
598
+ // Replace remaining questions with newly generated ones
599
+ questions = [...questions.slice(0, personaInteractions.length), ...newQuestions];
600
+ }
601
+ }
602
+ }
603
+ stats.questionsAsked++;
604
+ progress.questionsAsked++;
605
+ onProgress?.(progress);
606
+ }
607
+ // Synthesize this persona's findings for this tool
608
+ // Skip LLM synthesis in scenarios-only mode and fast CI mode
609
+ let personaProfile;
610
+ if (this.config.customScenariosOnly || this.config.checkMode) {
611
+ // Check mode: minimal profile, no misleading error counts
612
+ personaProfile = {
613
+ behavioralNotes: [],
614
+ limitations: [],
615
+ securityNotes: [],
616
+ };
617
+ }
618
+ else {
619
+ personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map(i => ({
620
+ question: i.question,
621
+ response: i.response,
622
+ error: i.error,
623
+ analysis: i.analysis,
624
+ })));
625
+ }
626
+ // Store findings
627
+ const toolData = toolInteractionsMap.get(tool.name);
628
+ toolData.interactions.push(...personaInteractions);
629
+ toolData.findingsByPersona.push({
630
+ personaId: persona.id,
631
+ personaName: persona.name,
632
+ behavioralNotes: personaProfile.behavioralNotes,
633
+ limitations: personaProfile.limitations,
634
+ securityNotes: personaProfile.securityNotes,
635
+ });
636
+ progress.toolsCompleted++;
637
+ onProgress?.(progress);
638
+ }
639
+ progress.personasCompleted++;
640
+ // Reset tool count for next persona
641
+ progress.toolsCompleted = 0;
642
+ onProgress?.(progress);
643
+ }
644
+ }
645
+ // Build aggregated tool profiles
646
+ let toolProfiles = [];
647
+ if (this.config.checkMode && checkModeResult) {
648
+ toolProfiles = checkModeResult.toolProfiles;
649
+ }
650
+ else {
651
+ for (const tool of discovery.tools) {
652
+ const toolData = toolInteractionsMap.get(tool.name);
653
+ // Aggregate findings across personas (deduplicate)
654
+ const aggregatedProfile = this.aggregateFindings(tool.name, tool.description ?? '', toolData);
655
+ toolProfiles.push(aggregatedProfile);
656
+ }
657
+ }
658
+ // Interview prompts (if server has prompts capability)
659
+ const promptProfiles = [];
660
+ if (discovery.prompts.length > 0) {
661
+ this.logger.info({ promptCount: discovery.prompts.length }, 'Interviewing prompts');
662
+ // Update phase for prompts
663
+ progress.phase = 'prompts';
664
+ progress.promptsCompleted = 0;
665
+ onProgress?.(progress);
666
+ // Only create orchestrator if NOT in check mode (requires LLM)
667
+ const primaryOrchestrator = this.isCheckMode() ? null : this.createOrchestrator(this.personas[0]);
668
+ for (const prompt of discovery.prompts) {
669
+ progress.currentTool = `prompt:${prompt.name}`;
670
+ onProgress?.(progress);
671
+ const promptInteractions = [];
672
+ // Check for custom scenarios for this prompt
673
+ const customScenarios = this.getScenariosForPrompt(prompt.name);
674
+ // Build questions list - custom scenarios + LLM-generated (unless customScenariosOnly)
675
+ let questions = [];
676
+ if (customScenarios.length > 0) {
677
+ // Execute custom prompt scenarios
678
+ const scenarioResults = await this.executePromptScenarios(client, prompt.name, customScenarios);
679
+ allScenarioResults.push(...scenarioResults);
680
+ // Convert scenarios to prompt questions for profiling
681
+ questions = customScenarios.map(s => ({
682
+ description: s.description,
683
+ args: s.args,
684
+ }));
685
+ // If not custom-only mode and not fast CI mode, also generate LLM questions
686
+ if (!this.config.customScenariosOnly && !this.config.checkMode && primaryOrchestrator) {
687
+ const llmQuestions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
688
+ questions = [...questions, ...llmQuestions];
689
+ }
690
+ }
691
+ else if (!this.config.customScenariosOnly && !this.config.checkMode && primaryOrchestrator) {
692
+ // No custom scenarios - generate LLM questions as usual
693
+ questions = await primaryOrchestrator.generatePromptQuestions(prompt, 2);
694
+ }
695
+ else if (this.config.checkMode) {
696
+ // Fast CI mode: use simple fallback question for prompt
697
+ questions = [{ description: 'Basic prompt test', args: {} }];
698
+ }
699
+ // If customScenariosOnly and no scenarios for this prompt, skip it
700
+ for (const question of questions) {
701
+ const interactionStart = Date.now();
702
+ let response = null;
703
+ let error = null;
704
+ try {
705
+ response = await client.getPrompt(prompt.name, question.args);
706
+ }
707
+ catch (e) {
708
+ error = e instanceof Error ? e.message : String(e);
709
+ }
710
+ // Skip LLM analysis in scenarios-only mode and fast CI mode
711
+ let analysis;
712
+ if (this.isCheckMode() || !primaryOrchestrator) {
713
+ analysis = this.generateSimpleAnalysis(error, !!response, 'Prompt call succeeded.');
714
+ }
715
+ else {
716
+ analysis = await primaryOrchestrator.analyzePromptResponse(prompt, question, response, error);
717
+ }
718
+ promptInteractions.push({
719
+ promptName: prompt.name,
720
+ question,
721
+ response,
722
+ error,
723
+ analysis,
724
+ durationMs: Date.now() - interactionStart,
725
+ });
726
+ progress.questionsAsked++;
727
+ onProgress?.(progress);
728
+ }
729
+ // Synthesize prompt profile
730
+ // Skip LLM synthesis in scenarios-only mode and fast CI mode
731
+ let profile;
732
+ if (this.config.customScenariosOnly || this.config.checkMode || !primaryOrchestrator) {
733
+ // Check mode: minimal profile, no misleading error counts
734
+ profile = {
735
+ name: prompt.name,
736
+ description: prompt.description || prompt.name,
737
+ arguments: prompt.arguments || [],
738
+ behavioralNotes: [],
739
+ limitations: [],
740
+ };
741
+ }
742
+ else {
743
+ profile = await primaryOrchestrator.synthesizePromptProfile(prompt, promptInteractions.map(i => ({
744
+ question: i.question,
745
+ response: i.response,
746
+ error: i.error,
747
+ analysis: i.analysis,
748
+ })));
749
+ }
750
+ promptProfiles.push({
751
+ ...profile,
752
+ interactions: promptInteractions,
753
+ });
754
+ // Update prompt progress
755
+ progress.promptsCompleted = (progress.promptsCompleted ?? 0) + 1;
756
+ onProgress?.(progress);
757
+ }
758
+ }
759
+ // Interview resources (if server has resources capability)
760
+ // Skip in scenarios-only mode since there's no resource scenario format
761
+ const resourceProfiles = [];
762
+ let resourceReadCount = 0;
763
+ const discoveredResources = discovery.resources ?? [];
764
+ if (discoveredResources.length > 0 && !this.config.customScenariosOnly) {
765
+ this.logger.info({ resourceCount: discoveredResources.length }, 'Interviewing resources');
766
+ // Update phase for resources
767
+ progress.phase = 'resources';
768
+ progress.resourcesCompleted = 0;
769
+ onProgress?.(progress);
770
+ // Only create orchestrator if NOT in check mode (requires LLM)
771
+ const primaryOrchestrator = this.isCheckMode() ? null : this.createOrchestrator(this.personas[0]);
772
+ for (const resource of discoveredResources) {
773
+ progress.currentTool = `resource:${resource.name}`;
774
+ onProgress?.(progress);
775
+ const resourceInteractions = [];
776
+ // Generate resource questions (skip LLM in fast CI mode)
777
+ let questions;
778
+ if (this.config.checkMode || !primaryOrchestrator) {
779
+ // Fast CI mode: use simple fallback question
780
+ questions = [{ description: 'Basic resource read test', category: 'happy_path' }];
781
+ }
782
+ else {
783
+ questions = await primaryOrchestrator.generateResourceQuestions(resource, 2);
784
+ }
785
+ for (const question of questions) {
786
+ const interactionStart = Date.now();
787
+ let response = null;
788
+ let error = null;
789
+ try {
790
+ // Apply timeout to resource read to prevent indefinite hangs
791
+ response = await withTimeout(client.readResource(resource.uri), this.config.resourceTimeout ?? DEFAULT_TIMEOUTS.resourceRead, `Resource read: ${resource.uri}`);
792
+ resourceReadCount++;
793
+ }
794
+ catch (e) {
795
+ error = e instanceof Error ? e.message : String(e);
796
+ resourceReadCount++;
797
+ }
798
+ // Skip LLM analysis in fast CI mode
799
+ let analysis;
800
+ if (this.isCheckMode() || !primaryOrchestrator) {
801
+ analysis = this.generateSimpleAnalysis(error, !!response, 'Resource read succeeded.');
802
+ }
803
+ else {
804
+ analysis = await primaryOrchestrator.analyzeResourceResponse(resource, question, response, error);
805
+ }
806
+ resourceInteractions.push({
807
+ resourceUri: resource.uri,
808
+ resourceName: resource.name,
809
+ question,
810
+ response,
811
+ error,
812
+ analysis,
813
+ durationMs: Date.now() - interactionStart,
814
+ });
815
+ progress.questionsAsked++;
816
+ onProgress?.(progress);
817
+ }
818
+ // Synthesize resource profile (skip LLM in fast CI mode)
819
+ let profile;
820
+ if (this.config.checkMode || !primaryOrchestrator) {
821
+ // Check mode: minimal profile, no misleading error counts
822
+ profile = {
823
+ name: resource.name,
824
+ uri: resource.uri,
825
+ description: resource.description || resource.name,
826
+ mimeType: resource.mimeType,
827
+ behavioralNotes: [],
828
+ limitations: [],
829
+ };
830
+ }
831
+ else {
832
+ profile = await primaryOrchestrator.synthesizeResourceProfile(resource, resourceInteractions.map(i => ({
833
+ question: i.question,
834
+ response: i.response,
835
+ error: i.error,
836
+ analysis: i.analysis,
837
+ })));
838
+ }
839
+ // Extract content preview from first successful read
840
+ let contentPreview;
841
+ const successfulRead = resourceInteractions.find(i => i.response && !i.error);
842
+ if (successfulRead?.response?.contents?.[0]) {
843
+ const content = successfulRead.response.contents[0];
844
+ if (content.text) {
845
+ contentPreview = content.text.length > DISPLAY_LIMITS.CONTENT_TEXT_PREVIEW
846
+ ? content.text.substring(0, DISPLAY_LIMITS.CONTENT_TEXT_PREVIEW) + '...'
847
+ : content.text;
848
+ }
849
+ else if (content.blob) {
850
+ contentPreview = `[Binary data: ${content.blob.length} bytes base64]`;
851
+ }
852
+ }
853
+ resourceProfiles.push({
854
+ ...profile,
855
+ interactions: resourceInteractions,
856
+ contentPreview,
857
+ });
858
+ // Update resource progress
859
+ progress.resourcesCompleted = (progress.resourcesCompleted ?? 0) + 1;
860
+ onProgress?.(progress);
861
+ }
862
+ }
863
+ // Execute workflows if configured
864
+ let workflowResults;
865
+ let workflowSummary;
866
+ const workflowConfig = this.config.workflowConfig;
867
+ if (workflowConfig && (workflowConfig.workflows?.length || workflowConfig.discoverWorkflows)) {
868
+ progress.phase = 'workflows';
869
+ onProgress?.(progress);
870
+ const { results, summary } = await this.executeWorkflows(client, discovery, workflowConfig, progress, onProgress);
871
+ workflowResults = results.length > 0 ? results : undefined;
872
+ workflowSummary = summary;
873
+ }
874
+ // Synthesize overall findings (use first persona's orchestrator for synthesis)
875
+ // Skip LLM synthesis in scenarios-only mode and fast CI mode
876
+ progress.phase = 'synthesizing';
877
+ onProgress?.(progress);
878
+ let overall;
879
+ if (this.config.customScenariosOnly || this.config.checkMode) {
880
+ // Check mode: simple summary focused on verification, not pass/fail
881
+ const serverName = discovery.serverInfo.name || 'This MCP server';
882
+ overall = {
883
+ summary: `${serverName} provides ${toolProfiles.length} tool(s) for MCP integration.`,
884
+ limitations: [],
885
+ recommendations: [],
886
+ };
887
+ }
888
+ else {
889
+ const primaryOrchestrator = this.createOrchestrator(this.personas[0]);
890
+ overall = await primaryOrchestrator.synthesizeOverall(discovery, toolProfiles);
891
+ }
892
+ // Calculate totals
893
+ let totalToolCallCount = 0;
894
+ let totalErrorCount = 0;
895
+ for (const stats of personaStats.values()) {
896
+ totalToolCallCount += stats.toolCallCount;
897
+ totalErrorCount += stats.errorCount;
898
+ }
899
+ const endTime = new Date();
900
+ const allInteractions = toolProfiles.flatMap((p) => p.interactions);
901
+ const assertionSummary = summarizeAssertions(allInteractions);
902
+ const rateLimitSummary = this.rateLimitEvents.size > 0
903
+ ? {
904
+ totalEvents: Array.from(this.rateLimitEvents.values()).reduce((sum, v) => sum + v, 0),
905
+ totalRetries: this.rateLimitRetries,
906
+ tools: Array.from(this.rateLimitEvents.keys()),
907
+ }
908
+ : undefined;
909
+ const externalServicesSummary = this.externalServiceStatuses.size > 0
910
+ ? {
911
+ mode: this.config.externalServices?.mode ?? 'skip',
912
+ unconfiguredServices: Array.from(this.externalServiceStatuses.values())
913
+ .filter((s) => !s.configured)
914
+ .map((s) => s.service),
915
+ skippedTools: Array.from(this.skippedTools),
916
+ mockedTools: Array.from(this.mockedTools),
917
+ }
918
+ : undefined;
919
+ const statefulSummary = this.config.statefulTesting?.enabled
920
+ ? {
921
+ enabled: true,
922
+ toolCount: toolProfiles.length,
923
+ dependencyCount: toolProfiles.reduce((sum, profile) => sum + (profile.dependencyInfo?.dependsOn.length ?? 0), 0),
924
+ maxChainLength: this.config.statefulTesting?.maxChainLength ?? 0,
925
+ }
926
+ : undefined;
927
+ const metadata = {
928
+ startTime,
929
+ endTime,
930
+ durationMs: endTime.getTime() - startTime.getTime(),
931
+ toolCallCount: totalToolCallCount,
932
+ resourceReadCount: resourceReadCount > 0 ? resourceReadCount : undefined,
933
+ errorCount: totalErrorCount,
934
+ model: this.config.checkMode ? 'check' : this.config.model,
935
+ personas: Array.from(personaStats.values()),
936
+ workflows: workflowSummary,
937
+ serverCommand: this.config.serverCommand,
938
+ rateLimit: rateLimitSummary,
939
+ externalServices: externalServicesSummary,
940
+ assertions: assertionSummary,
941
+ statefulTesting: statefulSummary,
942
+ };
943
+ progress.phase = 'complete';
944
+ onProgress?.(progress);
945
+ this.logger.info({
946
+ toolsProfiled: toolProfiles.length,
947
+ totalToolCalls: totalToolCallCount,
948
+ totalErrors: totalErrorCount,
949
+ durationMs: metadata.durationMs,
950
+ }, 'Interview complete');
951
+ done();
952
+ return {
953
+ discovery,
954
+ toolProfiles,
955
+ promptProfiles: promptProfiles.length > 0 ? promptProfiles : undefined,
956
+ resourceProfiles: resourceProfiles.length > 0 ? resourceProfiles : undefined,
957
+ workflowResults,
958
+ scenarioResults: allScenarioResults.length > 0 ? allScenarioResults : undefined,
959
+ summary: overall.summary,
960
+ limitations: overall.limitations,
961
+ recommendations: overall.recommendations,
962
+ metadata,
963
+ };
964
+ }
965
+ /**
966
+ * Classify errors from interactions to separate tool correctness from environment issues.
967
+ */
968
+ classifyErrors(interactions, toolName, toolDescription) {
969
+ let externalServiceErrors = 0;
970
+ let environmentErrors = 0;
971
+ let codeBugErrors = 0;
972
+ let unknownErrors = 0;
973
+ const detectedServices = new Set();
974
+ for (const interaction of interactions) {
975
+ if (interaction.error) {
976
+ const analysis = categorizeErrorSource(interaction.error, toolName, toolDescription);
977
+ switch (analysis.source) {
978
+ case 'external_dependency':
979
+ externalServiceErrors++;
980
+ if (analysis.dependency?.displayName) {
981
+ detectedServices.add(analysis.dependency.displayName);
982
+ }
983
+ break;
984
+ case 'environment':
985
+ environmentErrors++;
986
+ break;
987
+ case 'code_bug':
988
+ codeBugErrors++;
989
+ break;
990
+ default:
991
+ unknownErrors++;
992
+ }
993
+ }
994
+ }
995
+ return {
996
+ externalServiceErrors,
997
+ environmentErrors,
998
+ codeBugErrors,
999
+ unknownErrors,
1000
+ detectedServices: detectedServices.size > 0 ? Array.from(detectedServices) : undefined,
1001
+ };
1002
+ }
1003
+ /**
1004
+ * Aggregate findings from multiple personas into a single tool profile.
1005
+ */
1006
+ aggregateFindings(toolName, description, data) {
1007
+ // Collect all notes, deduplicating similar content
1008
+ const behavioralNotes = new Set();
1009
+ const limitations = new Set();
1010
+ const securityNotes = new Set();
1011
+ for (const findings of data.findingsByPersona) {
1012
+ for (const note of findings.behavioralNotes) {
1013
+ behavioralNotes.add(note);
1014
+ }
1015
+ for (const limitation of findings.limitations) {
1016
+ limitations.add(limitation);
1017
+ }
1018
+ for (const note of findings.securityNotes) {
1019
+ securityNotes.add(note);
1020
+ }
1021
+ }
1022
+ // Classify errors to separate tool correctness from environment issues
1023
+ const errorClassification = this.classifyErrors(data.interactions, toolName, description);
1024
+ return {
1025
+ name: toolName,
1026
+ description,
1027
+ interactions: data.interactions,
1028
+ behavioralNotes: Array.from(behavioralNotes),
1029
+ limitations: Array.from(limitations),
1030
+ securityNotes: Array.from(securityNotes),
1031
+ findingsByPersona: data.findingsByPersona,
1032
+ errorClassification,
1033
+ };
1034
+ }
1035
+ /**
1036
+ * Execute a tool call with retry logic for recoverable errors.
1037
+ * Learns from errors and can update server context based on error messages.
1038
+ * Uses caching to avoid redundant tool calls with identical arguments.
1039
+ */
1040
+ async executeWithRetry(client, tool, question, orchestrator, personaId, stats) {
1041
+ const interactionStart = Date.now();
1042
+ let response = null;
1043
+ let error = null;
1044
+ let hadError = false;
1045
+ let fromCache = false;
1046
+ let toolExecutionMs = 0;
1047
+ let llmAnalysisMs = 0;
1048
+ let mocked = false;
1049
+ let mockService;
1050
+ // Check cache for tool response (same tool + same args = same response)
1051
+ if (this.cache) {
1052
+ const cachedResponse = this.cache.getToolResponse(tool.name, question.args);
1053
+ if (cachedResponse) {
1054
+ response = cachedResponse;
1055
+ fromCache = true;
1056
+ this.logger.debug({ toolName: tool.name, args: question.args }, 'Tool response served from cache');
1057
+ stats.toolCallCount++; // Still count as a tool call for metrics
1058
+ if (response.isError) {
1059
+ stats.errorCount++;
1060
+ hadError = true;
1061
+ const errorContent = response.content?.find(c => c.type === 'text');
1062
+ if (errorContent && 'text' in errorContent) {
1063
+ error = String(errorContent.text);
1064
+ }
1065
+ }
1066
+ }
1067
+ }
1068
+ // Make actual tool call if not cached
1069
+ if (!fromCache) {
1070
+ const result = await this.callToolWithPolicies(client, tool, question.args);
1071
+ response = result.response;
1072
+ error = result.error;
1073
+ toolExecutionMs = result.toolExecutionMs;
1074
+ mocked = !!result.mocked;
1075
+ mockService = result.mockService;
1076
+ if (result.skipped) {
1077
+ error = result.skipReason ?? 'Skipped: external service not configured';
1078
+ hadError = true;
1079
+ }
1080
+ else {
1081
+ stats.toolCallCount++;
1082
+ if (error || response?.isError) {
1083
+ stats.errorCount++;
1084
+ hadError = true;
1085
+ if (error) {
1086
+ this.learnFromError(error, orchestrator);
1087
+ }
1088
+ }
1089
+ else if (this.cache && response) {
1090
+ // Cache successful responses for reuse by other personas
1091
+ // Don't cache errors as they may be transient
1092
+ this.cache.setToolResponse(tool.name, question.args, response);
1093
+ this.logger.debug({ toolName: tool.name, args: question.args }, 'Tool response cached');
1094
+ }
1095
+ }
1096
+ }
1097
+ // Analyze the response with this persona's perspective
1098
+ // Skip LLM analysis in scenarios-only mode and fast CI mode
1099
+ let analysis;
1100
+ const llmAnalysisStart = Date.now();
1101
+ if (this.isCheckMode()) {
1102
+ // In fast mode, generate simple analysis (no LLM call)
1103
+ analysis = this.generateSimpleAnalysis(error, !!response, 'Tool call succeeded.');
1104
+ llmAnalysisMs = 0; // No LLM call in fast mode
1105
+ }
1106
+ else {
1107
+ const analysisTool = { name: tool.name, description: tool.description ?? '' };
1108
+ analysis = await orchestrator.analyzeResponse(analysisTool, question, response, error);
1109
+ llmAnalysisMs = Date.now() - llmAnalysisStart;
1110
+ }
1111
+ const interaction = {
1112
+ toolName: tool.name,
1113
+ question,
1114
+ response,
1115
+ error,
1116
+ analysis,
1117
+ durationMs: Date.now() - interactionStart,
1118
+ toolExecutionMs: fromCache ? 0 : toolExecutionMs,
1119
+ llmAnalysisMs,
1120
+ personaId,
1121
+ outcomeAssessment: this.assessOutcome(question, response, error),
1122
+ mocked,
1123
+ mockService,
1124
+ };
1125
+ return { interaction, hadError };
1126
+ }
1127
+ /**
1128
+ * Learn server constraints from error messages.
1129
+ * Updates server context with discovered restrictions.
1130
+ */
1131
+ learnFromError(error, orchestrator) {
1132
+ // Extract allowed directories from error messages
1133
+ const pathMatch = error.match(/access denied|not allowed|outside.*(?:allowed|permitted).*?([/\\][^\s"']+)/i);
1134
+ if (pathMatch) {
1135
+ // Error mentions a path restriction
1136
+ const constraint = `Path access restricted: ${error.substring(0, DISPLAY_LIMITS.ERROR_CONSTRAINT_LENGTH)}`;
1137
+ const currentContext = orchestrator.getServerContext() ?? { constraints: [] };
1138
+ if (!currentContext.constraints?.includes(constraint)) {
1139
+ currentContext.constraints = [...(currentContext.constraints ?? []), constraint];
1140
+ orchestrator.setServerContext(currentContext);
1141
+ }
1142
+ }
1143
+ // Extract allowed directories explicitly mentioned
1144
+ const allowedMatch = error.match(/allowed director(?:y|ies)[:\s]+([^\n]+)/i);
1145
+ if (allowedMatch) {
1146
+ const dirs = allowedMatch[1].split(/[,\s]+/).filter(d => d.startsWith('/'));
1147
+ if (dirs.length > 0) {
1148
+ const currentContext = orchestrator.getServerContext() ?? { allowedDirectories: [] };
1149
+ const existingDirs = currentContext.allowedDirectories ?? [];
1150
+ const newDirs = [...new Set([...existingDirs, ...dirs])];
1151
+ if (newDirs.length > existingDirs.length) {
1152
+ currentContext.allowedDirectories = newDirs;
1153
+ orchestrator.setServerContext(currentContext);
1154
+ this.logger.debug({ dirs: newDirs }, 'Learned allowed directories from error');
1155
+ }
1156
+ }
1157
+ }
1158
+ }
1159
+ /**
1160
+ * Interview all tools with a single persona.
1161
+ * Designed for parallel execution across personas.
1162
+ *
1163
+ * @param client - MCP client for tool calls
1164
+ * @param discovery - Discovery result with available tools
1165
+ * @param persona - Persona to use for this interview
1166
+ * @param toolCallMutex - Mutex for serializing tool calls (shared resource)
1167
+ * @returns PersonaInterviewData with all interactions and findings
1168
+ */
1169
+ async interviewPersona(client, discovery, persona, toolCallMutex) {
1170
+ const orchestrator = this.createOrchestrator(persona);
1171
+ const stats = {
1172
+ id: persona.id,
1173
+ name: persona.name,
1174
+ questionsAsked: 0,
1175
+ toolCallCount: 0,
1176
+ errorCount: 0,
1177
+ };
1178
+ const toolInteractions = new Map();
1179
+ const toolFindings = new Map();
1180
+ const scenarioResults = [];
1181
+ // Interview each tool with this persona
1182
+ for (const tool of discovery.tools) {
1183
+ const personaInteractions = [];
1184
+ const previousErrors = [];
1185
+ // Check for custom scenarios for this tool
1186
+ const customScenarios = this.getScenariosForTool(tool.name);
1187
+ // Build questions list
1188
+ let questions = [];
1189
+ if (customScenarios.length > 0) {
1190
+ // Execute custom scenarios (need mutex for tool calls)
1191
+ await toolCallMutex.acquire();
1192
+ try {
1193
+ const results = await this.executeToolScenarios(client, tool.name, customScenarios);
1194
+ scenarioResults.push(...results);
1195
+ }
1196
+ finally {
1197
+ toolCallMutex.release();
1198
+ }
1199
+ // Convert scenarios to interview questions
1200
+ questions = customScenarios.map(s => this.scenarioToQuestion(s));
1201
+ // If not custom-only mode, also generate LLM questions
1202
+ if (!this.config.customScenariosOnly) {
1203
+ const llmQuestions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
1204
+ questions = [...questions, ...llmQuestions];
1205
+ }
1206
+ }
1207
+ else if (!this.config.customScenariosOnly) {
1208
+ // No custom scenarios - generate LLM questions as usual
1209
+ questions = await orchestrator.generateQuestions(tool, this.config.maxQuestionsPerTool, this.config.skipErrorTests);
1210
+ }
1211
+ // Ask each question with retry logic
1212
+ for (const question of questions) {
1213
+ // Acquire mutex for tool calls (shared MCP client)
1214
+ await toolCallMutex.acquire();
1215
+ let interaction;
1216
+ let hadError;
1217
+ try {
1218
+ const result = await this.executeWithRetry(client, tool, question, orchestrator, persona.id, stats);
1219
+ interaction = result.interaction;
1220
+ hadError = result.hadError;
1221
+ }
1222
+ finally {
1223
+ toolCallMutex.release();
1224
+ }
1225
+ personaInteractions.push(interaction);
1226
+ // Track errors for learning
1227
+ if (hadError && interaction.error) {
1228
+ previousErrors.push({
1229
+ args: question.args,
1230
+ error: interaction.error,
1231
+ });
1232
+ // If we have multiple failures, regenerate remaining questions
1233
+ if (!this.config.customScenariosOnly &&
1234
+ previousErrors.length >= 2 && personaInteractions.length < questions.length) {
1235
+ const remaining = this.config.maxQuestionsPerTool - personaInteractions.length;
1236
+ if (remaining > 0) {
1237
+ this.logger.debug({ tool: tool.name, errors: previousErrors.length }, 'Regenerating questions after errors');
1238
+ const newQuestions = await orchestrator.generateQuestions(tool, remaining, this.config.skipErrorTests, previousErrors);
1239
+ questions = [...questions.slice(0, personaInteractions.length), ...newQuestions];
1240
+ }
1241
+ }
1242
+ }
1243
+ stats.questionsAsked++;
1244
+ }
1245
+ // Synthesize this persona's findings for this tool
1246
+ let personaProfile;
1247
+ if (this.config.customScenariosOnly) {
1248
+ // Scenarios-only mode: minimal profile, no misleading error counts
1249
+ personaProfile = {
1250
+ behavioralNotes: [],
1251
+ limitations: [],
1252
+ securityNotes: [],
1253
+ };
1254
+ }
1255
+ else {
1256
+ personaProfile = await orchestrator.synthesizeToolProfile(tool, personaInteractions.map(i => ({
1257
+ question: i.question,
1258
+ response: i.response,
1259
+ error: i.error,
1260
+ analysis: i.analysis,
1261
+ })));
1262
+ }
1263
+ // Store interactions and findings
1264
+ toolInteractions.set(tool.name, personaInteractions);
1265
+ toolFindings.set(tool.name, {
1266
+ personaId: persona.id,
1267
+ personaName: persona.name,
1268
+ behavioralNotes: personaProfile.behavioralNotes,
1269
+ limitations: personaProfile.limitations,
1270
+ securityNotes: personaProfile.securityNotes,
1271
+ });
1272
+ }
1273
+ this.logger.debug({
1274
+ persona: persona.name,
1275
+ toolCount: discovery.tools.length,
1276
+ questionsAsked: stats.questionsAsked,
1277
+ }, 'Persona interview complete');
1278
+ return {
1279
+ persona,
1280
+ stats,
1281
+ toolInteractions,
1282
+ toolFindings,
1283
+ scenarioResults,
1284
+ };
1285
+ }
1286
+ /**
1287
+ * Aggregate results from parallel persona interviews.
1288
+ */
1289
+ aggregateParallelResults(personaResults, discovery) {
1290
+ const toolInteractionsMap = new Map();
1291
+ // Initialize map for each tool
1292
+ for (const tool of discovery.tools) {
1293
+ toolInteractionsMap.set(tool.name, {
1294
+ interactions: [],
1295
+ findingsByPersona: [],
1296
+ });
1297
+ }
1298
+ const personaStats = new Map();
1299
+ const allScenarioResults = [];
1300
+ // Aggregate results from each persona
1301
+ for (const result of personaResults) {
1302
+ personaStats.set(result.persona.id, result.stats);
1303
+ allScenarioResults.push(...result.scenarioResults);
1304
+ // Merge tool interactions
1305
+ for (const [toolName, interactions] of result.toolInteractions) {
1306
+ const toolData = toolInteractionsMap.get(toolName);
1307
+ if (toolData) {
1308
+ toolData.interactions.push(...interactions);
1309
+ }
1310
+ }
1311
+ // Merge tool findings
1312
+ for (const [toolName, findings] of result.toolFindings) {
1313
+ const toolData = toolInteractionsMap.get(toolName);
1314
+ if (toolData) {
1315
+ toolData.findingsByPersona.push(findings);
1316
+ }
1317
+ }
1318
+ }
1319
+ return { toolInteractionsMap, personaStats, allScenarioResults };
1320
+ }
1321
+ /**
1322
+ * Interview a single tool in check mode (parallel-safe).
1323
+ * Designed for parallel tool testing with minimal overhead.
1324
+ *
1325
+ * @param client - MCP client for tool calls
1326
+ * @param tool - Tool to test
1327
+ * @param toolCallMutex - Mutex for serializing tool calls (shared resource)
1328
+ * @returns ToolCheckResult with interactions and stats
1329
+ */
1330
+ async interviewToolInCheckMode(client, tool, toolCallMutex, statefulRunner, dependencyInfo, statefulConfig) {
1331
+ const interactions = [];
1332
+ const scenarioResults = [];
1333
+ let questionsAsked = 0;
1334
+ let toolCallCount = 0;
1335
+ let errorCount = 0;
1336
+ const maxChainLength = statefulConfig?.maxChainLength ?? Number.POSITIVE_INFINITY;
1337
+ const allowStateful = !!statefulRunner && (dependencyInfo?.sequencePosition ?? 0) < maxChainLength;
1338
+ const externalDecision = this.resolveExternalServiceDecision(tool);
1339
+ if (externalDecision.action === 'skip') {
1340
+ this.skippedTools.add(tool.name);
1341
+ return {
1342
+ toolName: tool.name,
1343
+ interactions: [],
1344
+ scenarioResults,
1345
+ questionsAsked,
1346
+ toolCallCount,
1347
+ errorCount,
1348
+ skipped: true,
1349
+ skipReason: externalDecision.reason,
1350
+ dependencyInfo,
1351
+ };
1352
+ }
1353
+ // Check for custom scenarios for this tool
1354
+ const customScenarios = this.getScenariosForTool(tool.name);
1355
+ // Build questions list - custom scenarios or fallback questions
1356
+ let questions = [];
1357
+ if (customScenarios.length > 0) {
1358
+ // Execute custom scenarios
1359
+ await toolCallMutex.acquire();
1360
+ try {
1361
+ const results = await this.executeToolScenarios(client, tool.name, customScenarios);
1362
+ scenarioResults.push(...results);
1363
+ toolCallCount += results.length;
1364
+ errorCount += results.filter(r => !r.passed).length;
1365
+ }
1366
+ finally {
1367
+ toolCallMutex.release();
1368
+ }
1369
+ // Convert scenarios to interview questions
1370
+ questions = customScenarios.map(s => this.scenarioToQuestion(s));
1371
+ }
1372
+ else {
1373
+ // No custom scenarios - use fallback questions (check mode, no LLM)
1374
+ // We need an orchestrator for fallback questions, but we won't use LLM
1375
+ // Get fallback questions directly
1376
+ questions = this.getFallbackQuestionsForTool(tool, this.config.skipErrorTests)
1377
+ .slice(0, this.config.maxQuestionsPerTool);
1378
+ }
1379
+ // Execute warmup runs if configured (helps reduce cold-start timing variance)
1380
+ // Warmup runs are not recorded in interactions
1381
+ const warmupRuns = this.config.warmupRuns ?? 1;
1382
+ if (warmupRuns > 0 && questions.length > 0) {
1383
+ const warmupQuestion = questions[0]; // Use first question for warmup
1384
+ await toolCallMutex.acquire();
1385
+ try {
1386
+ for (let i = 0; i < warmupRuns; i++) {
1387
+ try {
1388
+ await this.callToolWithPolicies(client, tool, warmupQuestion.args, externalDecision);
1389
+ }
1390
+ catch {
1391
+ // Ignore warmup errors - we just want to warm up the system
1392
+ }
1393
+ }
1394
+ }
1395
+ finally {
1396
+ toolCallMutex.release();
1397
+ }
1398
+ this.logger.debug({ tool: tool.name, warmupRuns }, 'Warmup runs complete');
1399
+ }
1400
+ // Ask each question
1401
+ for (const question of questions) {
1402
+ const interactionStart = Date.now();
1403
+ let response = null;
1404
+ let error = null;
1405
+ let toolExecutionMs = 0;
1406
+ let assertionResults;
1407
+ let assertionsPassed;
1408
+ let mocked = false;
1409
+ let mockService;
1410
+ const expectedOutcome = this.inferExpectedOutcome(question);
1411
+ const shouldUseState = allowStateful && expectedOutcome !== 'error';
1412
+ const statefulArgs = shouldUseState && statefulRunner
1413
+ ? statefulRunner.applyStateToQuestion(tool.name, question)
1414
+ : { args: { ...question.args }, usedKeys: [] };
1415
+ const resolvedQuestion = {
1416
+ ...question,
1417
+ args: statefulArgs.args,
1418
+ metadata: {
1419
+ ...question.metadata,
1420
+ stateful: {
1421
+ usedKeys: statefulArgs.usedKeys,
1422
+ },
1423
+ },
1424
+ };
1425
+ // Acquire mutex for tool calls (shared MCP client)
1426
+ await toolCallMutex.acquire();
1427
+ try {
1428
+ const result = await this.callToolWithPolicies(client, tool, resolvedQuestion.args, externalDecision);
1429
+ response = result.response;
1430
+ error = result.error;
1431
+ toolExecutionMs = result.toolExecutionMs;
1432
+ mocked = !!result.mocked;
1433
+ mockService = result.mockService;
1434
+ if (!result.skipped) {
1435
+ toolCallCount++;
1436
+ if (error || response?.isError) {
1437
+ errorCount++;
1438
+ }
1439
+ }
1440
+ }
1441
+ finally {
1442
+ toolCallMutex.release();
1443
+ }
1444
+ // Generate simple analysis (no LLM in check mode)
1445
+ const analysis = this.generateSimpleAnalysis(error, !!response, 'Tool call succeeded.');
1446
+ const outcomeAssessment = this.assessOutcome(resolvedQuestion, response, error);
1447
+ if (this.config.assertions?.enabled && outcomeAssessment.expected === 'success' && response && !response.isError) {
1448
+ let schema = this.responseSchemas.get(tool.name);
1449
+ if (!schema && this.config.assertions?.infer) {
1450
+ const inferred = inferResponseSchema(response);
1451
+ if (inferred) {
1452
+ schema = inferred;
1453
+ this.responseSchemas.set(tool.name, inferred);
1454
+ }
1455
+ }
1456
+ if (schema) {
1457
+ assertionResults = validateResponseAssertions(response, schema);
1458
+ assertionsPassed = assertionResults.every((r) => r.passed);
1459
+ }
1460
+ }
1461
+ if (allowStateful && response && !response.isError && statefulRunner) {
1462
+ const providedKeys = statefulRunner.recordResponse(tool, response);
1463
+ resolvedQuestion.metadata = {
1464
+ ...resolvedQuestion.metadata,
1465
+ stateful: {
1466
+ ...(resolvedQuestion.metadata?.stateful ?? {}),
1467
+ providedKeys,
1468
+ },
1469
+ };
1470
+ }
1471
+ const interaction = {
1472
+ toolName: tool.name,
1473
+ question: resolvedQuestion,
1474
+ response,
1475
+ error,
1476
+ analysis,
1477
+ durationMs: Date.now() - interactionStart,
1478
+ toolExecutionMs,
1479
+ llmAnalysisMs: 0, // No LLM in check mode
1480
+ personaId: 'check_mode',
1481
+ outcomeAssessment,
1482
+ assertionResults,
1483
+ assertionsPassed,
1484
+ mocked,
1485
+ mockService,
1486
+ };
1487
+ interactions.push(interaction);
1488
+ questionsAsked++;
1489
+ }
1490
+ this.logger.debug({
1491
+ tool: tool.name,
1492
+ questionsAsked,
1493
+ toolCallCount,
1494
+ errorCount,
1495
+ }, 'Tool check complete');
1496
+ return {
1497
+ toolName: tool.name,
1498
+ interactions,
1499
+ scenarioResults,
1500
+ questionsAsked,
1501
+ toolCallCount,
1502
+ errorCount,
1503
+ mocked: interactions.some((i) => i.mocked),
1504
+ mockService: interactions.find((i) => i.mockService)?.mockService,
1505
+ responseSchema: this.responseSchemas.get(tool.name),
1506
+ dependencyInfo,
1507
+ };
1508
+ }
1509
+ /**
1510
+ * Get fallback questions for a tool without requiring an orchestrator.
1511
+ * Used in check mode when parallel tool testing is enabled.
1512
+ *
1513
+ * Uses the SchemaTestGenerator to produce comprehensive deterministic tests
1514
+ * including boundaries, type coercion, enum validation, and error handling.
1515
+ */
1516
+ getFallbackQuestionsForTool(tool, skipErrorTests) {
1517
+ // Use the enhanced schema test generator for comprehensive coverage
1518
+ // Allow more tests in check mode since there's no LLM cost
1519
+ const maxTests = Math.max(this.config.maxQuestionsPerTool * 4, SCHEMA_TESTING.MAX_TESTS_PER_TOOL);
1520
+ return generateSchemaTests(tool, {
1521
+ skipErrorTests,
1522
+ maxTestsPerTool: maxTests,
1523
+ });
1524
+ }
1525
+ /**
1526
+ * Run parallel tool testing in check mode.
1527
+ * Tests all tools concurrently with a configurable worker limit.
1528
+ *
1529
+ * @param client - MCP client for tool calls
1530
+ * @param tools - Tools to test
1531
+ * @param onProgress - Progress callback
1532
+ * @returns Aggregated tool profiles
1533
+ */
1534
+ async interviewToolsInParallel(client, tools, progress, onProgress, options) {
1535
+ // Use concurrency=1 for sequential execution when parallelTools is disabled
1536
+ const statefulEnabled = !!options?.statefulRunner;
1537
+ const concurrency = statefulEnabled
1538
+ ? 1
1539
+ : this.config.parallelTools
1540
+ ? (this.config.toolConcurrency ?? INTERVIEW.DEFAULT_TOOL_CONCURRENCY)
1541
+ : 1;
1542
+ const toolCallMutex = createMutex(); // Shared mutex for serializing MCP client calls
1543
+ this.logger.info({
1544
+ toolCount: tools.length,
1545
+ concurrency,
1546
+ parallel: this.config.parallelTools,
1547
+ }, 'Running check mode tool testing');
1548
+ // Create tasks for each tool
1549
+ const toolTasks = tools.map(tool => async () => {
1550
+ progress.currentTool = tool.name;
1551
+ onProgress?.(progress);
1552
+ const result = await this.interviewToolInCheckMode(client, tool, toolCallMutex, options?.statefulRunner, options?.dependencyMap?.get(tool.name), options?.statefulConfig);
1553
+ progress.toolsCompleted++;
1554
+ progress.questionsAsked += result.questionsAsked;
1555
+ progress.lastCompletedTool = this.buildToolProgressSummary(result);
1556
+ onProgress?.(progress);
1557
+ return result;
1558
+ });
1559
+ // Execute tools in parallel with concurrency limit
1560
+ const parallelResults = await parallelLimit(toolTasks, { concurrency });
1561
+ // Check for errors
1562
+ if (!parallelResults.allSucceeded) {
1563
+ for (const [index, error] of parallelResults.errors) {
1564
+ this.logger.error({
1565
+ tool: tools[index]?.name,
1566
+ error: error.message,
1567
+ }, 'Tool check failed');
1568
+ }
1569
+ }
1570
+ // Aggregate results
1571
+ const successfulResults = parallelResults.results.filter((r) => r !== undefined);
1572
+ const toolProfiles = [];
1573
+ const scenarioResults = [];
1574
+ let totalToolCallCount = 0;
1575
+ let totalErrorCount = 0;
1576
+ let totalQuestionsAsked = 0;
1577
+ for (const result of successfulResults) {
1578
+ const tool = tools.find(t => t.name === result.toolName);
1579
+ if (!tool)
1580
+ continue;
1581
+ // Classify errors to separate tool correctness from environment issues
1582
+ const errorClassification = this.classifyErrors(result.interactions, result.toolName, tool.description ?? '');
1583
+ const assertionSummary = summarizeAssertions(result.interactions);
1584
+ // Build minimal profile for check mode
1585
+ toolProfiles.push({
1586
+ name: result.toolName,
1587
+ description: tool.description ?? '',
1588
+ interactions: result.interactions,
1589
+ behavioralNotes: [],
1590
+ limitations: [],
1591
+ securityNotes: [],
1592
+ findingsByPersona: [],
1593
+ errorClassification,
1594
+ skipped: result.skipped,
1595
+ skipReason: result.skipReason,
1596
+ mocked: result.mocked,
1597
+ mockService: result.mockService,
1598
+ responseSchema: result.responseSchema,
1599
+ assertionSummary,
1600
+ dependencyInfo: result.dependencyInfo,
1601
+ });
1602
+ scenarioResults.push(...result.scenarioResults);
1603
+ totalToolCallCount += result.toolCallCount;
1604
+ totalErrorCount += result.errorCount;
1605
+ totalQuestionsAsked += result.questionsAsked;
1606
+ }
1607
+ this.logger.info({
1608
+ toolCount: toolProfiles.length,
1609
+ totalToolCallCount,
1610
+ totalErrorCount,
1611
+ }, 'Parallel tool testing complete');
1612
+ return {
1613
+ toolProfiles,
1614
+ scenarioResults,
1615
+ totalToolCallCount,
1616
+ totalErrorCount,
1617
+ totalQuestionsAsked,
1618
+ };
1619
+ }
1620
+ buildToolProgressSummary(result) {
1621
+ const interactions = result.interactions.filter(i => !i.mocked);
1622
+ const totalTests = interactions.length;
1623
+ let passedTests = 0;
1624
+ let validationTotal = 0;
1625
+ let validationPassed = 0;
1626
+ let totalDuration = 0;
1627
+ for (const interaction of interactions) {
1628
+ totalDuration += interaction.durationMs;
1629
+ const assessment = interaction.outcomeAssessment;
1630
+ if (assessment) {
1631
+ if (assessment.correct) {
1632
+ passedTests += 1;
1633
+ }
1634
+ if (assessment.expected === 'error') {
1635
+ validationTotal += 1;
1636
+ if (assessment.correct) {
1637
+ validationPassed += 1;
1638
+ }
1639
+ }
1640
+ }
1641
+ else {
1642
+ const hasError = interaction.error || interaction.response?.isError;
1643
+ if (!hasError) {
1644
+ passedTests += 1;
1645
+ }
1646
+ }
1647
+ }
1648
+ const avgMs = totalTests > 0 ? Math.round(totalDuration / totalTests) : 0;
1649
+ return {
1650
+ toolName: result.toolName,
1651
+ totalTests,
1652
+ passedTests,
1653
+ validationTotal,
1654
+ validationPassed,
1655
+ avgMs,
1656
+ skipped: result.skipped,
1657
+ skipReason: result.skipReason,
1658
+ mocked: result.mocked,
1659
+ mockService: result.mockService,
1660
+ };
1661
+ }
1662
+ /**
1663
+ * Convert a TestScenario to an InterviewQuestion.
1664
+ */
1665
+ scenarioToQuestion(scenario) {
1666
+ return {
1667
+ description: scenario.description,
1668
+ category: scenario.category,
1669
+ args: scenario.args,
1670
+ };
1671
+ }
1672
+ /**
1673
+ * Get custom scenarios for a specific tool.
1674
+ */
1675
+ getScenariosForTool(toolName) {
1676
+ const scenarios = this.config.customScenarios?.toolScenarios ?? [];
1677
+ return scenarios.filter(s => s.tool === toolName && !s.skip);
1678
+ }
1679
+ /**
1680
+ * Get custom scenarios for a specific prompt.
1681
+ */
1682
+ getScenariosForPrompt(promptName) {
1683
+ const scenarios = this.config.customScenarios?.promptScenarios ?? [];
1684
+ return scenarios.filter(s => s.prompt === promptName && !s.skip);
1685
+ }
1686
+ /**
1687
+ * Execute custom test scenarios for a tool.
1688
+ * Returns scenario results with assertion evaluations.
1689
+ */
1690
+ async executeToolScenarios(client, toolName, scenarios) {
1691
+ const results = [];
1692
+ const tool = { name: toolName, description: '' };
1693
+ for (const scenario of scenarios) {
1694
+ if (scenario.skip) {
1695
+ continue;
1696
+ }
1697
+ const startTime = Date.now();
1698
+ let response = null;
1699
+ let error;
1700
+ let isError = false;
1701
+ try {
1702
+ const result = await this.callToolWithPolicies(client, tool, scenario.args);
1703
+ if (result.skipped) {
1704
+ error = result.skipReason ?? 'Skipped: external service not configured';
1705
+ isError = true;
1706
+ }
1707
+ else {
1708
+ response = result.response;
1709
+ isError = response?.isError ?? false;
1710
+ if (isError) {
1711
+ const errorContent = response?.content?.find(c => c.type === 'text');
1712
+ if (errorContent && 'text' in errorContent) {
1713
+ error = String(errorContent.text);
1714
+ }
1715
+ }
1716
+ if (result.error) {
1717
+ error = result.error;
1718
+ isError = true;
1719
+ }
1720
+ }
1721
+ }
1722
+ catch (e) {
1723
+ error = e instanceof Error ? e.message : String(e);
1724
+ isError = true;
1725
+ }
1726
+ // Evaluate assertions if provided
1727
+ const assertionResults = scenario.assertions
1728
+ ? evaluateAssertions(scenario.assertions, response, isError)
1729
+ : [];
1730
+ // Scenario passes if no error (or expected error) and all assertions pass
1731
+ const allAssertionsPassed = assertionResults.every(r => r.passed);
1732
+ const passed = allAssertionsPassed && (!isError || scenario.category === 'error_handling');
1733
+ const result = {
1734
+ scenario,
1735
+ passed,
1736
+ assertionResults,
1737
+ error,
1738
+ response,
1739
+ durationMs: Date.now() - startTime,
1740
+ };
1741
+ results.push(result);
1742
+ this.logger.debug({
1743
+ tool: toolName,
1744
+ scenario: scenario.description,
1745
+ passed,
1746
+ assertions: assertionResults.length,
1747
+ }, 'Scenario executed');
1748
+ }
1749
+ return results;
1750
+ }
1751
+ /**
1752
+ * Execute custom test scenarios for a prompt.
1753
+ * Returns scenario results with assertion evaluations.
1754
+ */
1755
+ async executePromptScenarios(client, promptName, scenarios) {
1756
+ const results = [];
1757
+ for (const scenario of scenarios) {
1758
+ if (scenario.skip) {
1759
+ continue;
1760
+ }
1761
+ const startTime = Date.now();
1762
+ let response = null;
1763
+ let error;
1764
+ try {
1765
+ response = await client.getPrompt(promptName, scenario.args);
1766
+ }
1767
+ catch (e) {
1768
+ error = e instanceof Error ? e.message : String(e);
1769
+ }
1770
+ // Evaluate assertions if provided
1771
+ const assertionResults = scenario.assertions
1772
+ ? evaluateAssertions(scenario.assertions, response, !!error)
1773
+ : [];
1774
+ const allAssertionsPassed = assertionResults.every(r => r.passed);
1775
+ // Check if this scenario expects an error (has an assertion checking for 'error' to exist)
1776
+ const expectsError = scenario.assertions?.some(a => a.path === 'error' && a.condition === 'exists') ?? false;
1777
+ // Scenario passes if assertions pass AND (no error OR scenario expects error)
1778
+ const passed = allAssertionsPassed && (!error || expectsError);
1779
+ const result = {
1780
+ scenario,
1781
+ passed,
1782
+ assertionResults,
1783
+ error,
1784
+ response,
1785
+ durationMs: Date.now() - startTime,
1786
+ };
1787
+ results.push(result);
1788
+ this.logger.debug({
1789
+ prompt: promptName,
1790
+ scenario: scenario.description,
1791
+ passed,
1792
+ assertions: assertionResults.length,
1793
+ }, 'Prompt scenario executed');
1794
+ }
1795
+ return results;
1796
+ }
1797
+ /**
1798
+ * Execute workflow discovery and/or execution.
1799
+ * Discovers workflows using LLM if enabled, loads from file if provided,
1800
+ * and executes all workflows against the MCP server.
1801
+ */
1802
+ async executeWorkflows(client, discovery, workflowConfig, progress, onProgress) {
1803
+ const allWorkflows = [];
1804
+ let discoveredCount = 0;
1805
+ let loadedCount = 0;
1806
+ // Add user-provided workflows
1807
+ if (workflowConfig.workflows && workflowConfig.workflows.length > 0) {
1808
+ allWorkflows.push(...workflowConfig.workflows);
1809
+ loadedCount = workflowConfig.workflows.length;
1810
+ this.logger.info({ count: loadedCount }, 'Using workflows loaded from file');
1811
+ }
1812
+ // Discover workflows using LLM if enabled (requires LLM - skip in check mode)
1813
+ if (workflowConfig.discoverWorkflows && discovery.tools.length >= 2 && this.llm) {
1814
+ this.logger.info('Discovering workflows using LLM analysis');
1815
+ const discoverer = new WorkflowDiscoverer(this.llm, {
1816
+ maxWorkflows: workflowConfig.maxDiscoveredWorkflows ?? WORKFLOW.MAX_DISCOVERED_WORKFLOWS,
1817
+ minSteps: WORKFLOW.MIN_WORKFLOW_STEPS,
1818
+ maxSteps: WORKFLOW.MAX_WORKFLOW_STEPS,
1819
+ });
1820
+ try {
1821
+ const discovered = await discoverer.discover(discovery.tools);
1822
+ if (discovered.length > 0) {
1823
+ allWorkflows.push(...discovered);
1824
+ discoveredCount = discovered.length;
1825
+ this.logger.info({
1826
+ count: discoveredCount,
1827
+ workflows: discovered.map(w => w.name),
1828
+ }, 'Discovered workflows');
1829
+ }
1830
+ else {
1831
+ this.logger.info('No workflows discovered from tool analysis');
1832
+ }
1833
+ }
1834
+ catch (error) {
1835
+ this.logger.warn({
1836
+ error: error instanceof Error ? error.message : String(error),
1837
+ }, 'Workflow discovery failed');
1838
+ }
1839
+ }
1840
+ // Execute all workflows
1841
+ const results = [];
1842
+ // Execute workflows (requires LLM for analysis - skip in check mode unless analyzeSteps is disabled)
1843
+ if (allWorkflows.length > 0 && !workflowConfig.skipWorkflowExecution && this.llm) {
1844
+ this.logger.info({ count: allWorkflows.length }, 'Executing workflows');
1845
+ progress.totalWorkflows = allWorkflows.length;
1846
+ progress.workflowsCompleted = 0;
1847
+ onProgress?.(progress);
1848
+ const stepTimeout = workflowConfig.stepTimeout ?? WORKFLOW.STEP_TIMEOUT;
1849
+ const timeouts = workflowConfig.timeouts ?? {
1850
+ toolCall: stepTimeout,
1851
+ stateSnapshot: WORKFLOW.STATE_SNAPSHOT_TIMEOUT,
1852
+ probeTool: WORKFLOW.PROBE_TOOL_TIMEOUT,
1853
+ llmAnalysis: WORKFLOW.LLM_ANALYSIS_TIMEOUT,
1854
+ llmSummary: WORKFLOW.LLM_SUMMARY_TIMEOUT,
1855
+ };
1856
+ const executor = new WorkflowExecutor(client, this.llm, discovery.tools, {
1857
+ stepTimeout,
1858
+ analyzeSteps: !this.config.customScenariosOnly,
1859
+ generateSummary: !this.config.customScenariosOnly,
1860
+ stateTracking: workflowConfig.enableStateTracking
1861
+ ? {
1862
+ enabled: true,
1863
+ snapshotBefore: true,
1864
+ snapshotAfter: true,
1865
+ snapshotAfterEachStep: false,
1866
+ }
1867
+ : undefined,
1868
+ timeouts,
1869
+ });
1870
+ for (const workflow of allWorkflows) {
1871
+ progress.currentWorkflow = workflow.name;
1872
+ onProgress?.(progress);
1873
+ this.logger.debug({
1874
+ workflowId: workflow.id,
1875
+ workflowName: workflow.name,
1876
+ stepCount: workflow.steps.length,
1877
+ }, 'Executing workflow');
1878
+ try {
1879
+ const result = await executor.execute(workflow);
1880
+ results.push(result);
1881
+ this.logger.info({
1882
+ workflowId: workflow.id,
1883
+ success: result.success,
1884
+ durationMs: result.durationMs,
1885
+ }, 'Workflow execution complete');
1886
+ }
1887
+ catch (error) {
1888
+ this.logger.error({
1889
+ workflowId: workflow.id,
1890
+ error: error instanceof Error ? error.message : String(error),
1891
+ }, 'Workflow execution failed');
1892
+ // Create a failed result
1893
+ results.push({
1894
+ workflow,
1895
+ steps: [],
1896
+ success: false,
1897
+ failureReason: error instanceof Error ? error.message : String(error),
1898
+ durationMs: 0,
1899
+ dataFlow: [],
1900
+ });
1901
+ }
1902
+ progress.workflowsCompleted = (progress.workflowsCompleted ?? 0) + 1;
1903
+ onProgress?.(progress);
1904
+ }
1905
+ }
1906
+ // Build summary
1907
+ const successfulCount = results.filter(r => r.success).length;
1908
+ const summary = {
1909
+ workflowCount: results.length,
1910
+ successfulCount,
1911
+ failedCount: results.length - successfulCount,
1912
+ discoveredCount,
1913
+ loadedCount,
1914
+ };
1915
+ this.logger.info({
1916
+ total: summary.workflowCount,
1917
+ successful: summary.successfulCount,
1918
+ failed: summary.failedCount,
1919
+ discovered: summary.discoveredCount,
1920
+ loaded: summary.loadedCount,
1921
+ }, 'Workflow execution summary');
1922
+ return { results, summary };
1923
+ }
1924
+ }
1925
/**
 * Count assertion outcomes across a tool's non-mocked interactions.
 *
 * @param interactions - Interaction records; each may carry `mocked` and
 *   `assertionResults` (entries with a `passed` flag)
 * @returns `{ total, passed, failed }`, or `undefined` when no assertion
 *   results were found
 */
function summarizeAssertions(interactions) {
    const outcomes = interactions.flatMap((interaction) => (interaction.mocked ? [] : (interaction.assertionResults ?? [])));
    const total = outcomes.length;
    if (total === 0) {
        return undefined;
    }
    const passed = outcomes.reduce((count, outcome) => (outcome.passed ? count + 1 : count), 0);
    return { total, passed, failed: total - passed };
}
1939
+ //# sourceMappingURL=interviewer.js.map