webmcp-cli 1.0.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (347) hide show
  1. package/dist/agent/features/agent-simulator.d.ts +67 -0
  2. package/dist/agent/features/agent-simulator.js +368 -0
  3. package/dist/agent/features/agent-simulator.js.map +1 -0
  4. package/dist/agent/features/index.d.ts +8 -0
  5. package/dist/agent/features/index.js +9 -0
  6. package/dist/agent/features/index.js.map +1 -0
  7. package/dist/agent/features/simulation-judge.d.ts +78 -0
  8. package/dist/agent/features/simulation-judge.js +276 -0
  9. package/dist/agent/features/simulation-judge.js.map +1 -0
  10. package/dist/agent/features/test-case-generator.d.ts +35 -0
  11. package/dist/agent/features/test-case-generator.js +257 -0
  12. package/dist/agent/features/test-case-generator.js.map +1 -0
  13. package/dist/agent/index.d.ts +7 -0
  14. package/dist/agent/index.js +10 -0
  15. package/dist/agent/index.js.map +1 -0
  16. package/dist/agent/llm-client.d.ts +76 -0
  17. package/dist/agent/llm-client.js +198 -0
  18. package/dist/agent/llm-client.js.map +1 -0
  19. package/dist/audit/run-single-page-audit.d.ts +41 -0
  20. package/dist/audit/run-single-page-audit.js +103 -0
  21. package/dist/audit/run-single-page-audit.js.map +1 -0
  22. package/dist/bin/webmcp.d.ts +5 -0
  23. package/dist/bin/webmcp.js +14 -0
  24. package/dist/bin/webmcp.js.map +1 -0
  25. package/dist/browser/audit-runner.d.ts +30 -0
  26. package/dist/browser/audit-runner.js +77 -0
  27. package/dist/browser/audit-runner.js.map +1 -0
  28. package/dist/browser/index.d.ts +6 -0
  29. package/dist/browser/index.js +7 -0
  30. package/dist/browser/index.js.map +1 -0
  31. package/dist/browser/interceptor.d.ts +68 -0
  32. package/dist/browser/interceptor.js +257 -0
  33. package/dist/browser/interceptor.js.map +1 -0
  34. package/dist/browser/playwright.d.ts +98 -0
  35. package/dist/browser/playwright.js +158 -0
  36. package/dist/browser/playwright.js.map +1 -0
  37. package/dist/cli/commands/audit.d.ts +12 -0
  38. package/dist/cli/commands/audit.js +349 -0
  39. package/dist/cli/commands/audit.js.map +1 -0
  40. package/dist/cli/commands/interactive.d.ts +10 -0
  41. package/dist/cli/commands/interactive.js +34 -0
  42. package/dist/cli/commands/interactive.js.map +1 -0
  43. package/dist/cli/index.d.ts +17 -0
  44. package/dist/cli/index.js +84 -0
  45. package/dist/cli/index.js.map +1 -0
  46. package/dist/cli/options/parse-audit-options.d.ts +12 -0
  47. package/dist/cli/options/parse-audit-options.js +64 -0
  48. package/dist/cli/options/parse-audit-options.js.map +1 -0
  49. package/dist/core/constants.d.ts +102 -0
  50. package/dist/core/constants.js +214 -0
  51. package/dist/core/constants.js.map +1 -0
  52. package/dist/core/types/audit.d.ts +260 -0
  53. package/dist/core/types/audit.js +5 -0
  54. package/dist/core/types/audit.js.map +1 -0
  55. package/dist/core/types/index.d.ts +6 -0
  56. package/dist/core/types/index.js +7 -0
  57. package/dist/core/types/index.js.map +1 -0
  58. package/dist/core/types/rule.d.ts +190 -0
  59. package/dist/core/types/rule.js +26 -0
  60. package/dist/core/types/rule.js.map +1 -0
  61. package/dist/core/types/tool.d.ts +312 -0
  62. package/dist/core/types/tool.js +6 -0
  63. package/dist/core/types/tool.js.map +1 -0
  64. package/dist/detection/declarative.d.ts +27 -0
  65. package/dist/detection/declarative.js +343 -0
  66. package/dist/detection/declarative.js.map +1 -0
  67. package/dist/detection/imperative.d.ts +38 -0
  68. package/dist/detection/imperative.js +99 -0
  69. package/dist/detection/imperative.js.map +1 -0
  70. package/dist/detection/index.d.ts +5 -0
  71. package/dist/detection/index.js +6 -0
  72. package/dist/detection/index.js.map +1 -0
  73. package/dist/index.d.ts +12 -0
  74. package/dist/index.js +19 -0
  75. package/dist/index.js.map +1 -0
  76. package/dist/llm/advice-service.d.ts +38 -0
  77. package/dist/llm/advice-service.js +243 -0
  78. package/dist/llm/advice-service.js.map +1 -0
  79. package/dist/llm/evaluator.d.ts +89 -0
  80. package/dist/llm/evaluator.js +274 -0
  81. package/dist/llm/evaluator.js.map +1 -0
  82. package/dist/llm/index.d.ts +11 -0
  83. package/dist/llm/index.js +15 -0
  84. package/dist/llm/index.js.map +1 -0
  85. package/dist/llm/json-response.d.ts +12 -0
  86. package/dist/llm/json-response.js +67 -0
  87. package/dist/llm/json-response.js.map +1 -0
  88. package/dist/llm/providers/mock.d.ts +29 -0
  89. package/dist/llm/providers/mock.js +324 -0
  90. package/dist/llm/providers/mock.js.map +1 -0
  91. package/dist/llm/providers/openrouter.d.ts +53 -0
  92. package/dist/llm/providers/openrouter.js +321 -0
  93. package/dist/llm/providers/openrouter.js.map +1 -0
  94. package/dist/llm/request-cache.d.ts +28 -0
  95. package/dist/llm/request-cache.js +99 -0
  96. package/dist/llm/request-cache.js.map +1 -0
  97. package/dist/llm/types.d.ts +233 -0
  98. package/dist/llm/types.js +7 -0
  99. package/dist/llm/types.js.map +1 -0
  100. package/dist/rules/best-practices/BP-001.d.ts +11 -0
  101. package/dist/rules/best-practices/BP-001.js +56 -0
  102. package/dist/rules/best-practices/BP-001.js.map +1 -0
  103. package/dist/rules/best-practices/BP-002.d.ts +11 -0
  104. package/dist/rules/best-practices/BP-002.js +63 -0
  105. package/dist/rules/best-practices/BP-002.js.map +1 -0
  106. package/dist/rules/best-practices/BP-003.d.ts +11 -0
  107. package/dist/rules/best-practices/BP-003.js +68 -0
  108. package/dist/rules/best-practices/BP-003.js.map +1 -0
  109. package/dist/rules/coverage/COV-001.d.ts +8 -0
  110. package/dist/rules/coverage/COV-001.js +51 -0
  111. package/dist/rules/coverage/COV-001.js.map +1 -0
  112. package/dist/rules/description/DESC-003.d.ts +13 -0
  113. package/dist/rules/description/DESC-003.js +96 -0
  114. package/dist/rules/description/DESC-003.js.map +1 -0
  115. package/dist/rules/description/DESC-004.d.ts +8 -0
  116. package/dist/rules/description/DESC-004.js +61 -0
  117. package/dist/rules/description/DESC-004.js.map +1 -0
  118. package/dist/rules/description/DESC-005.d.ts +12 -0
  119. package/dist/rules/description/DESC-005.js +70 -0
  120. package/dist/rules/description/DESC-005.js.map +1 -0
  121. package/dist/rules/description/index.d.ts +4 -0
  122. package/dist/rules/description/index.js +5 -0
  123. package/dist/rules/description/index.js.map +1 -0
  124. package/dist/rules/implementation/IMP-001.d.ts +10 -0
  125. package/dist/rules/implementation/IMP-001.js +36 -0
  126. package/dist/rules/implementation/IMP-001.js.map +1 -0
  127. package/dist/rules/implementation/IMP-003.d.ts +9 -0
  128. package/dist/rules/implementation/IMP-003.js +45 -0
  129. package/dist/rules/implementation/IMP-003.js.map +1 -0
  130. package/dist/rules/implementation/IMP-004.d.ts +9 -0
  131. package/dist/rules/implementation/IMP-004.js +48 -0
  132. package/dist/rules/implementation/IMP-004.js.map +1 -0
  133. package/dist/rules/implementation/IMP-005.d.ts +9 -0
  134. package/dist/rules/implementation/IMP-005.js +54 -0
  135. package/dist/rules/implementation/IMP-005.js.map +1 -0
  136. package/dist/rules/implementation/IMP-007.d.ts +8 -0
  137. package/dist/rules/implementation/IMP-007.js +79 -0
  138. package/dist/rules/implementation/IMP-007.js.map +1 -0
  139. package/dist/rules/implementation/IMP-013.d.ts +9 -0
  140. package/dist/rules/implementation/IMP-013.js +55 -0
  141. package/dist/rules/implementation/IMP-013.js.map +1 -0
  142. package/dist/rules/implementation/index.d.ts +9 -0
  143. package/dist/rules/implementation/index.js +10 -0
  144. package/dist/rules/implementation/index.js.map +1 -0
  145. package/dist/rules/index.d.ts +51 -0
  146. package/dist/rules/index.js +100 -0
  147. package/dist/rules/index.js.map +1 -0
  148. package/dist/rules/llm/LLM-001.d.ts +14 -0
  149. package/dist/rules/llm/LLM-001.js +78 -0
  150. package/dist/rules/llm/LLM-001.js.map +1 -0
  151. package/dist/rules/llm/LLM-002.d.ts +14 -0
  152. package/dist/rules/llm/LLM-002.js +77 -0
  153. package/dist/rules/llm/LLM-002.js.map +1 -0
  154. package/dist/rules/llm/LLM-003.d.ts +16 -0
  155. package/dist/rules/llm/LLM-003.js +82 -0
  156. package/dist/rules/llm/LLM-003.js.map +1 -0
  157. package/dist/rules/llm/LLM-004.d.ts +14 -0
  158. package/dist/rules/llm/LLM-004.js +87 -0
  159. package/dist/rules/llm/LLM-004.js.map +1 -0
  160. package/dist/rules/llm/LLM-005.d.ts +16 -0
  161. package/dist/rules/llm/LLM-005.js +105 -0
  162. package/dist/rules/llm/LLM-005.js.map +1 -0
  163. package/dist/rules/llm/index.d.ts +10 -0
  164. package/dist/rules/llm/index.js +11 -0
  165. package/dist/rules/llm/index.js.map +1 -0
  166. package/dist/rules/runner.d.ts +54 -0
  167. package/dist/rules/runner.js +138 -0
  168. package/dist/rules/runner.js.map +1 -0
  169. package/dist/rules/schema/SCHEMA-001.d.ts +9 -0
  170. package/dist/rules/schema/SCHEMA-001.js +57 -0
  171. package/dist/rules/schema/SCHEMA-001.js.map +1 -0
  172. package/dist/rules/schema/SCHEMA-002.d.ts +9 -0
  173. package/dist/rules/schema/SCHEMA-002.js +59 -0
  174. package/dist/rules/schema/SCHEMA-002.js.map +1 -0
  175. package/dist/rules/schema/SCHEMA-003.d.ts +10 -0
  176. package/dist/rules/schema/SCHEMA-003.js +66 -0
  177. package/dist/rules/schema/SCHEMA-003.js.map +1 -0
  178. package/dist/rules/schema/SCHEMA-011.d.ts +10 -0
  179. package/dist/rules/schema/SCHEMA-011.js +62 -0
  180. package/dist/rules/schema/SCHEMA-011.js.map +1 -0
  181. package/dist/rules/security/SEC-001.d.ts +12 -0
  182. package/dist/rules/security/SEC-001.js +66 -0
  183. package/dist/rules/security/SEC-001.js.map +1 -0
  184. package/dist/rules/utils/keywords.d.ts +35 -0
  185. package/dist/rules/utils/keywords.js +100 -0
  186. package/dist/rules/utils/keywords.js.map +1 -0
  187. package/dist/scoring/calculator.d.ts +27 -0
  188. package/dist/scoring/calculator.js +194 -0
  189. package/dist/scoring/calculator.js.map +1 -0
  190. package/dist/scoring/grades.d.ts +34 -0
  191. package/dist/scoring/grades.js +167 -0
  192. package/dist/scoring/grades.js.map +1 -0
  193. package/dist/scoring/index.d.ts +5 -0
  194. package/dist/scoring/index.js +6 -0
  195. package/dist/scoring/index.js.map +1 -0
  196. package/dist/ui/banner.d.ts +21 -0
  197. package/dist/ui/banner.js +60 -0
  198. package/dist/ui/banner.js.map +1 -0
  199. package/dist/ui/design-tokens.d.ts +23 -0
  200. package/dist/ui/design-tokens.js +58 -0
  201. package/dist/ui/design-tokens.js.map +1 -0
  202. package/dist/ui/findings.d.ts +23 -0
  203. package/dist/ui/findings.js +190 -0
  204. package/dist/ui/findings.js.map +1 -0
  205. package/dist/ui/index.d.ts +9 -0
  206. package/dist/ui/index.js +10 -0
  207. package/dist/ui/index.js.map +1 -0
  208. package/dist/ui/ink/App.d.ts +14 -0
  209. package/dist/ui/ink/App.js +113 -0
  210. package/dist/ui/ink/App.js.map +1 -0
  211. package/dist/ui/ink/FullScreenLayout.d.ts +16 -0
  212. package/dist/ui/ink/FullScreenLayout.js +29 -0
  213. package/dist/ui/ink/FullScreenLayout.js.map +1 -0
  214. package/dist/ui/ink/InteractiveApp.d.ts +28 -0
  215. package/dist/ui/ink/InteractiveApp.js +229 -0
  216. package/dist/ui/ink/InteractiveApp.js.map +1 -0
  217. package/dist/ui/ink/RealAuditApp.d.ts +19 -0
  218. package/dist/ui/ink/RealAuditApp.js +170 -0
  219. package/dist/ui/ink/RealAuditApp.js.map +1 -0
  220. package/dist/ui/ink/components/AnimatedProgressBar.d.ts +20 -0
  221. package/dist/ui/ink/components/AnimatedProgressBar.js +46 -0
  222. package/dist/ui/ink/components/AnimatedProgressBar.js.map +1 -0
  223. package/dist/ui/ink/components/AsciiLogo.d.ts +12 -0
  224. package/dist/ui/ink/components/AsciiLogo.js +35 -0
  225. package/dist/ui/ink/components/AsciiLogo.js.map +1 -0
  226. package/dist/ui/ink/components/CategoryBars.d.ts +18 -0
  227. package/dist/ui/ink/components/CategoryBars.js +18 -0
  228. package/dist/ui/ink/components/CategoryBars.js.map +1 -0
  229. package/dist/ui/ink/components/FindingsTable.d.ts +18 -0
  230. package/dist/ui/ink/components/FindingsTable.js +19 -0
  231. package/dist/ui/ink/components/FindingsTable.js.map +1 -0
  232. package/dist/ui/ink/components/Footer.d.ts +15 -0
  233. package/dist/ui/ink/components/Footer.js +20 -0
  234. package/dist/ui/ink/components/Footer.js.map +1 -0
  235. package/dist/ui/ink/components/Header.d.ts +11 -0
  236. package/dist/ui/ink/components/Header.js +12 -0
  237. package/dist/ui/ink/components/Header.js.map +1 -0
  238. package/dist/ui/ink/components/LinkList.d.ts +17 -0
  239. package/dist/ui/ink/components/LinkList.js +44 -0
  240. package/dist/ui/ink/components/LinkList.js.map +1 -0
  241. package/dist/ui/ink/components/Navigation.d.ts +26 -0
  242. package/dist/ui/ink/components/Navigation.js +62 -0
  243. package/dist/ui/ink/components/Navigation.js.map +1 -0
  244. package/dist/ui/ink/components/ProgressBar.d.ts +15 -0
  245. package/dist/ui/ink/components/ProgressBar.js +14 -0
  246. package/dist/ui/ink/components/ProgressBar.js.map +1 -0
  247. package/dist/ui/ink/components/ScoreCard.d.ts +30 -0
  248. package/dist/ui/ink/components/ScoreCard.js +26 -0
  249. package/dist/ui/ink/components/ScoreCard.js.map +1 -0
  250. package/dist/ui/ink/components/SimulationResults.d.ts +33 -0
  251. package/dist/ui/ink/components/SimulationResults.js +23 -0
  252. package/dist/ui/ink/components/SimulationResults.js.map +1 -0
  253. package/dist/ui/ink/components/Spinner.d.ts +11 -0
  254. package/dist/ui/ink/components/Spinner.js +12 -0
  255. package/dist/ui/ink/components/Spinner.js.map +1 -0
  256. package/dist/ui/ink/components/ToolCard.d.ts +23 -0
  257. package/dist/ui/ink/components/ToolCard.js +20 -0
  258. package/dist/ui/ink/components/ToolCard.js.map +1 -0
  259. package/dist/ui/ink/components/shared/Badge.d.ts +21 -0
  260. package/dist/ui/ink/components/shared/Badge.js +39 -0
  261. package/dist/ui/ink/components/shared/Badge.js.map +1 -0
  262. package/dist/ui/ink/components/shared/Card.d.ts +18 -0
  263. package/dist/ui/ink/components/shared/Card.js +11 -0
  264. package/dist/ui/ink/components/shared/Card.js.map +1 -0
  265. package/dist/ui/ink/components/shared/HelpOverlay.d.ts +10 -0
  266. package/dist/ui/ink/components/shared/HelpOverlay.js +28 -0
  267. package/dist/ui/ink/components/shared/HelpOverlay.js.map +1 -0
  268. package/dist/ui/ink/components/shared/LoadingWithTimeout.d.ts +11 -0
  269. package/dist/ui/ink/components/shared/LoadingWithTimeout.js +21 -0
  270. package/dist/ui/ink/components/shared/LoadingWithTimeout.js.map +1 -0
  271. package/dist/ui/ink/components/shared/Menu.d.ts +23 -0
  272. package/dist/ui/ink/components/shared/Menu.js +43 -0
  273. package/dist/ui/ink/components/shared/Menu.js.map +1 -0
  274. package/dist/ui/ink/components/shared/Table.d.ts +23 -0
  275. package/dist/ui/ink/components/shared/Table.js +40 -0
  276. package/dist/ui/ink/components/shared/Table.js.map +1 -0
  277. package/dist/ui/ink/components/views/CrawlingView.d.ts +12 -0
  278. package/dist/ui/ink/components/views/CrawlingView.js +34 -0
  279. package/dist/ui/ink/components/views/CrawlingView.js.map +1 -0
  280. package/dist/ui/ink/components/views/DashboardView.d.ts +21 -0
  281. package/dist/ui/ink/components/views/DashboardView.js +51 -0
  282. package/dist/ui/ink/components/views/DashboardView.js.map +1 -0
  283. package/dist/ui/ink/components/views/FindingDetailView.d.ts +16 -0
  284. package/dist/ui/ink/components/views/FindingDetailView.js +34 -0
  285. package/dist/ui/ink/components/views/FindingDetailView.js.map +1 -0
  286. package/dist/ui/ink/components/views/FindingsView.d.ts +16 -0
  287. package/dist/ui/ink/components/views/FindingsView.js +79 -0
  288. package/dist/ui/ink/components/views/FindingsView.js.map +1 -0
  289. package/dist/ui/ink/components/views/OnboardingView.d.ts +12 -0
  290. package/dist/ui/ink/components/views/OnboardingView.js +40 -0
  291. package/dist/ui/ink/components/views/OnboardingView.js.map +1 -0
  292. package/dist/ui/ink/components/views/SimulationView.d.ts +17 -0
  293. package/dist/ui/ink/components/views/SimulationView.js +53 -0
  294. package/dist/ui/ink/components/views/SimulationView.js.map +1 -0
  295. package/dist/ui/ink/components/views/TestCaseDetailView.d.ts +11 -0
  296. package/dist/ui/ink/components/views/TestCaseDetailView.js +53 -0
  297. package/dist/ui/ink/components/views/TestCaseDetailView.js.map +1 -0
  298. package/dist/ui/ink/components/views/ToolDetailView.d.ts +15 -0
  299. package/dist/ui/ink/components/views/ToolDetailView.js +25 -0
  300. package/dist/ui/ink/components/views/ToolDetailView.js.map +1 -0
  301. package/dist/ui/ink/components/views/ToolsView.d.ts +15 -0
  302. package/dist/ui/ink/components/views/ToolsView.js +43 -0
  303. package/dist/ui/ink/components/views/ToolsView.js.map +1 -0
  304. package/dist/ui/ink/demo.d.ts +6 -0
  305. package/dist/ui/ink/demo.js +254 -0
  306. package/dist/ui/ink/demo.js.map +1 -0
  307. package/dist/ui/ink/hooks/useAnimation.d.ts +29 -0
  308. package/dist/ui/ink/hooks/useAnimation.js +89 -0
  309. package/dist/ui/ink/hooks/useAnimation.js.map +1 -0
  310. package/dist/ui/ink/hooks/useAudit.d.ts +69 -0
  311. package/dist/ui/ink/hooks/useAudit.js +99 -0
  312. package/dist/ui/ink/hooks/useAudit.js.map +1 -0
  313. package/dist/ui/ink/hooks/useCrawlAnimation.d.ts +19 -0
  314. package/dist/ui/ink/hooks/useCrawlAnimation.js +204 -0
  315. package/dist/ui/ink/hooks/useCrawlAnimation.js.map +1 -0
  316. package/dist/ui/ink/hooks/useKeyboardNav.d.ts +23 -0
  317. package/dist/ui/ink/hooks/useKeyboardNav.js +81 -0
  318. package/dist/ui/ink/hooks/useKeyboardNav.js.map +1 -0
  319. package/dist/ui/ink/hooks/useNavigation.d.ts +16 -0
  320. package/dist/ui/ink/hooks/useNavigation.js +42 -0
  321. package/dist/ui/ink/hooks/useNavigation.js.map +1 -0
  322. package/dist/ui/ink/hooks/useTerminalSize.d.ts +10 -0
  323. package/dist/ui/ink/hooks/useTerminalSize.js +29 -0
  324. package/dist/ui/ink/hooks/useTerminalSize.js.map +1 -0
  325. package/dist/ui/ink/index.d.ts +43 -0
  326. package/dist/ui/ink/index.js +50 -0
  327. package/dist/ui/ink/index.js.map +1 -0
  328. package/dist/ui/ink/render.d.ts +24 -0
  329. package/dist/ui/ink/render.js +14 -0
  330. package/dist/ui/ink/render.js.map +1 -0
  331. package/dist/ui/ink/theme.d.ts +37 -0
  332. package/dist/ui/ink/theme.js +38 -0
  333. package/dist/ui/ink/theme.js.map +1 -0
  334. package/dist/ui/ink/types.d.ts +77 -0
  335. package/dist/ui/ink/types.js +5 -0
  336. package/dist/ui/ink/types.js.map +1 -0
  337. package/dist/ui/score-display.d.ts +16 -0
  338. package/dist/ui/score-display.js +201 -0
  339. package/dist/ui/score-display.js.map +1 -0
  340. package/dist/ui/spinner.d.ts +45 -0
  341. package/dist/ui/spinner.js +112 -0
  342. package/dist/ui/spinner.js.map +1 -0
  343. package/dist/ui/utils.d.ts +13 -0
  344. package/dist/ui/utils.js +25 -0
  345. package/dist/ui/utils.js.map +1 -0
  346. package/package.json +61 -9
  347. package/index.js +0 -105
@@ -0,0 +1,276 @@
1
+ /**
2
+ * Simulation Judge
3
+ *
4
+ * Uses LLM to evaluate simulation results and provide structured scoring
5
+ * with actionable improvement suggestions.
6
+ */
7
+ import chalk from 'chalk';
8
+ import { LLMClient, createLLMClient } from '../llm-client.js';
9
+ import { parseJsonObject } from '../../llm/json-response.js';
10
+ // ═══════════════════════════════════════════════════════
11
+ // JUDGE PROMPTS
12
+ // ═══════════════════════════════════════════════════════
13
// System message sent with every judgment request: it sets the judge persona
// and lists the four qualities each evaluation is asked to have.
const JUDGE_SYSTEM_PROMPT = [
    "You are an expert judge evaluating an AI agent's performance when interacting with website tools via the WebMCP protocol.",
    '',
    'You evaluate with strict criteria but fair judgment. Your evaluation should be:',
    '1. SPECIFIC — Point to exact moments in the conversation',
    '2. ACTIONABLE — Every criticism comes with a fix',
    '3. BALANCED — Acknowledge what worked well, not just failures',
    '4. QUANTIFIED — Use numbers, percentages, and scores',
].join('\n');
20
/**
 * Assemble the user message for the judge model.
 *
 * The message lists the available tools, the test case expectations, a
 * transcript of every round, the final outcome, and then the fixed scoring
 * rubric together with the JSON shape the model must answer in.
 *
 * @param simulation - Simulation result; reads rounds, testCase, evaluation,
 *   finalSuccess, toolInvocations and totalRounds.
 * @param tools - Tool definitions; only name and description are used.
 * @returns The complete prompt text.
 */
function buildJudgmentPrompt(simulation, tools) {
    const { testCase, evaluation } = simulation;
    // One transcript entry per round; tool calls (if any) are appended to the agent line.
    const renderRound = (round) => {
        const reply = round.agentResponse;
        const spoken = reply.text || '(tool call)';
        let calls = '';
        if (reply.toolCalls.length > 0) {
            const rendered = reply.toolCalls.map((call) => `${call.toolName}(${JSON.stringify(call.toolInput)})`);
            calls = ` → called: ${rendered.join(', ')}`;
        }
        return `[USER] ${round.userMessage}\n[AGENT] ${spoken}${calls}`;
    };
    const transcript = simulation.rounds.map(renderRound).join('\n\n');
    const toolList = tools.map((tool) => ` ${tool.name}: ${tool.description}`).join('\n');
    const expected = testCase.expectedBehavior;
    // The failure line is an empty string when no failure reason was recorded.
    const failureLine = evaluation.failureReason ? ` Failure reason: ${evaluation.failureReason}` : '';
    // Part of the prompt that depends on this particular simulation.
    const context = [
        'TOOLS AVAILABLE:',
        toolList,
        '',
        'TEST CASE:',
        `Prompt: "${testCase.prompt}"`,
        `Expected tool: ${expected.toolName || 'none'}`,
        `Expected params: ${JSON.stringify(expected.expectedParams)}`,
        `Category: ${testCase.category}`,
        `Difficulty: ${testCase.difficulty}/5`,
        '',
        'CONVERSATION TRANSCRIPT:',
        transcript,
        '',
        'FINAL OUTCOME:',
        `Success: ${simulation.finalSuccess}`,
        `Tool invocations: ${simulation.toolInvocations}`,
        `Rounds used: ${simulation.totalRounds}`,
        failureLine,
    ].join('\n');
    // Fixed part: the six scoring criteria and the required JSON answer format.
    const rubric = `Judge this interaction on the following criteria:

1. TOOL SELECTION ACCURACY (0-100)
Did the agent select the correct tool(s) for the user's request?
Did it avoid selecting wrong or unnecessary tools?

2. PARAMETER ACCURACY (0-100)
Did the agent correctly extract and format parameters from the user's input?
Did it hallucinate any values not mentioned by the user?

3. CONVERSATION EFFICIENCY (0-100)
Did the agent complete the task in the minimum number of exchanges?
Were there unnecessary clarifying questions or redundant tool calls?

4. ERROR HANDLING (0-100)
When errors occurred, did the agent recover gracefully?
Did it provide helpful context to the user about what went wrong?

5. USER EXPERIENCE (0-100)
Was the agent's communication clear and natural?
Did it keep the user informed about what it was doing?

6. TOOL DEFINITION QUALITY (0-100)
Based on the agent's behavior, how well are the tools defined?
Did the tool descriptions/schemas cause any agent confusion?
This score reflects the TOOL QUALITY, not the agent's ability.

OUTPUT FORMAT (strict JSON):
{
"scores": {
"toolSelection": { "score": 85, "reasoning": "..." },
"parameterAccuracy": { "score": 72, "reasoning": "..." },
"conversationEfficiency": { "score": 60, "reasoning": "..." },
"errorHandling": { "score": 90, "reasoning": "..." },
"userExperience": { "score": 78, "reasoning": "..." },
"toolDefinitionQuality": { "score": 65, "reasoning": "..." }
},
"overallScore": 75,
"verdict": "PASS",
"keyMoments": [
{
"moment": "Round 1: Agent correctly identified search intent",
"impact": "positive",
"severity": "minor"
}
],
"toolImprovements": [
{
"tool": "search-flights",
"issue": "Description doesn't mention airport code format",
"suggestedFix": "Add 'Use 3-letter IATA airport codes' to description",
"estimatedImpact": "Would improve parameter accuracy"
}
],
"summary": "Overall assessment in 2-3 sentences"
}`;
    return `${context}\n\n${rubric}`;
}
100
+ // ═══════════════════════════════════════════════════════
101
+ // JUDGE FUNCTIONS
102
+ // ═══════════════════════════════════════════════════════
103
/**
 * Judge a single simulation result.
 *
 * The heuristic judgment from createFallbackJudgment is returned instead of an
 * LLM judgment when the client reports itself unavailable, when the chat
 * response has no text, or when the request or the parsing throws (a warning
 * is printed in the latter two cases).
 *
 * @param simulation - Simulation result to evaluate.
 * @param tools - Tool definitions included in the judge prompt.
 * @param [llm] - Client to use; createLLMClient() supplies one when omitted.
 * @returns Promise resolving to the judgment object.
 */
export async function judgeSimulation(simulation, tools, llm) {
    const judge = llm || createLLMClient();
    if (!judge.isAvailable()) {
        return createFallbackJudgment(simulation);
    }
    try {
        const messages = [
            { role: 'system', content: JUDGE_SYSTEM_PROMPT },
            { role: 'user', content: buildJudgmentPrompt(simulation, tools) },
        ];
        const reply = await judge.chat(messages);
        if (reply.text) {
            return parseJudgmentResponse(reply.text);
        }
        // An empty reply is routed through the catch block like any other failure.
        throw new Error('No response from LLM');
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : err;
        console.warn(chalk.yellow(`Judgment failed: ${reason}`));
        return createFallbackJudgment(simulation);
    }
}
126
/**
 * Judge multiple simulations and provide aggregate analysis.
 *
 * At most the first 10 simulations are judged (sequentially, via
 * judgeSimulation); totalSimulations and the summary still report the full
 * input count. passRate counts a PARTIAL verdict as half a pass, while the
 * percentage in the summary counts PASS verdicts only.
 *
 * @param simulations - Simulation results to evaluate; may be empty.
 * @param tools - Tool definitions forwarded to each judgment.
 * @param [llm] - Client shared by all judgments; createLLMClient() supplies one when omitted.
 * @returns Promise resolving to
 *   { totalSimulations, passRate, avgScore, failurePatterns, topImprovements, summary }.
 */
export async function judgeSimulations(simulations, tools, llm) {
    const maxJudged = 10;
    const maxReported = 5;
    const client = llm || createLLMClient();
    // Get individual judgments (sample if too many)
    const sampled = simulations.slice(0, maxJudged);
    const judgments = [];
    for (const sim of sampled) {
        judgments.push(await judgeSimulation(sim, tools, client));
    }
    // Aggregate results. With nothing judged every ratio is reported as 0
    // rather than the NaN that 0/0 would produce.
    const judgedCount = judgments.length;
    const passCount = judgments.filter(j => j.verdict === 'PASS').length;
    const partialCount = judgments.filter(j => j.verdict === 'PARTIAL').length;
    const scoreTotal = judgments.reduce((sum, j) => sum + j.overallScore, 0);
    const avgScore = judgedCount > 0 ? scoreTotal / judgedCount : 0;
    const passRate = judgedCount > 0 ? (passCount + partialCount * 0.5) / judgedCount : 0;
    const passPercent = judgedCount > 0 ? Math.round((passCount / judgedCount) * 100) : 0;
    // Single pass over all suggested improvements:
    //  - failureMap groups by issue text (count, affected tools, first suggested fix seen)
    //  - improvementMap keeps the first improvement per tool+issue pair
    const failureMap = new Map();
    const improvementMap = new Map();
    for (const judgment of judgments) {
        // A judgment without a usable improvement list contributes nothing.
        const improvements = Array.isArray(judgment.toolImprovements) ? judgment.toolImprovements : [];
        for (const improvement of improvements) {
            let entry = failureMap.get(improvement.issue);
            if (!entry) {
                entry = { count: 0, tools: new Set(), firstFix: improvement.suggestedFix };
                failureMap.set(improvement.issue, entry);
            }
            entry.count++;
            entry.tools.add(improvement.tool);
            const key = `${improvement.tool}:${improvement.issue}`;
            if (!improvementMap.has(key)) {
                improvementMap.set(key, improvement);
            }
        }
    }
    const failurePatterns = Array.from(failureMap.entries())
        .sort((a, b) => b[1].count - a[1].count)
        .slice(0, maxReported)
        .map(([pattern, data]) => ({
            pattern,
            frequency: data.count,
            affectedTools: Array.from(data.tools),
            suggestedFix: data.firstFix ?? 'Review tool definition',
        }));
    const topImprovements = Array.from(improvementMap.values()).slice(0, maxReported);
    const topPattern = failurePatterns[0];
    return {
        totalSimulations: simulations.length,
        passRate,
        avgScore,
        failurePatterns,
        topImprovements,
        summary: `Evaluated ${simulations.length} simulations. ` +
            `Pass rate: ${passPercent}%. ` +
            `Average score: ${Math.round(avgScore)}/100. ` +
            `${topPattern ? `Top issue: ${topPattern.pattern}` : 'No major issues found.'}`,
    };
}
192
/**
 * Parse the LLM judgment response and fill in defaults for missing fields.
 *
 * Defaults: createDefaultScores() for scores, 50 for overallScore, 'PARTIAL'
 * for verdict, empty arrays for keyMoments/toolImprovements, and
 * 'Evaluation complete.' for summary.
 *
 * @param response - Raw text returned by the judge model.
 * @returns The judgment object.
 */
function parseJudgmentResponse(response) {
    const parsed = parseJsonObject(response);
    // A score of 0 is legitimate, so only a missing or non-numeric value gets
    // the default. Numeric strings are converted so later sums stay numeric.
    const rawScore = typeof parsed.overallScore === 'string'
        ? Number.parseFloat(parsed.overallScore)
        : parsed.overallScore;
    const overallScore = typeof rawScore === 'number' && Number.isFinite(rawScore) ? rawScore : 50;
    return {
        scores: parsed.scores || createDefaultScores(),
        overallScore,
        verdict: parsed.verdict || 'PARTIAL',
        // Callers iterate these lists, so anything that is not an array is dropped.
        keyMoments: Array.isArray(parsed.keyMoments) ? parsed.keyMoments : [],
        toolImprovements: Array.isArray(parsed.toolImprovements) ? parsed.toolImprovements : [],
        summary: parsed.summary || 'Evaluation complete.',
    };
}
207
/**
 * Create fallback judgment when LLM is unavailable.
 *
 * Scores are derived from the metrics already stored on simulation.evaluation
 * (routingAccuracy, parameterAccuracy, roundEfficiency, overallScore,
 * hallucinationRate); errorHandling and userExperience use fixed heuristic
 * values.
 *
 * @param simulation - Simulation result; reads evaluation, testCase,
 *   totalRounds and finalSuccess.
 * @returns The judgment object.
 */
function createFallbackJudgment(simulation) {
    const metrics = simulation.evaluation;
    const succeeded = simulation.finalSuccess;
    // May be absent when the test case expects no tool call.
    const expectedTool = simulation.testCase.expectedBehavior.toolName;
    const toPercent = (ratio) => Math.round(ratio * 100);
    let verdict = 'FAIL';
    if (succeeded) {
        verdict = 'PASS';
    }
    else if (metrics.overallScore >= 50) {
        verdict = 'PARTIAL';
    }
    let summary;
    if (!succeeded) {
        summary = `Simulation failed: ${metrics.failureReason || 'Unknown reason'}`;
    }
    else if (expectedTool) {
        summary = `Simulation passed. Tool ${expectedTool} invoked correctly.`;
    }
    else {
        // Avoids the misleading "Tool undefined invoked correctly." text.
        summary = 'Simulation passed. No tool invocation was expected.';
    }
    return {
        scores: {
            toolSelection: {
                score: toPercent(metrics.routingAccuracy),
                reasoning: metrics.routingAccuracy === 1
                    ? 'Correct tool selected on first try'
                    : 'Tool selection issues detected',
            },
            parameterAccuracy: {
                score: toPercent(metrics.parameterAccuracy),
                reasoning: metrics.parameterAccuracy >= 0.8
                    ? 'Most parameters extracted correctly'
                    : 'Some parameters were incorrect or missing',
            },
            conversationEfficiency: {
                score: toPercent(metrics.roundEfficiency),
                reasoning: simulation.totalRounds === 1
                    ? 'Completed in single round'
                    : `Took ${simulation.totalRounds} rounds`,
            },
            errorHandling: {
                score: succeeded ? 80 : 40,
                reasoning: succeeded
                    ? 'Task completed successfully'
                    : 'Task did not complete successfully',
            },
            userExperience: {
                score: 70,
                reasoning: 'Heuristic evaluation - LLM not available for detailed analysis',
            },
            toolDefinitionQuality: {
                // Mean of routing and parameter accuracy, scaled to 0-100.
                score: Math.round((metrics.routingAccuracy + metrics.parameterAccuracy) * 50),
                reasoning: 'Based on agent performance metrics',
            },
        },
        overallScore: metrics.overallScore,
        verdict,
        keyMoments: metrics.failureReason ? [{
                moment: metrics.failureReason,
                impact: 'negative',
                severity: 'major',
            }] : [],
        toolImprovements: metrics.hallucinationRate > 0 ? [{
                tool: expectedTool || 'unknown',
                issue: 'Agent hallucinated parameters',
                suggestedFix: 'Add clearer parameter descriptions to schema',
                estimatedImpact: `Would reduce hallucination rate of ${toPercent(metrics.hallucinationRate)}%`,
            }] : [],
        summary,
    };
}
265
/**
 * Build the neutral score sheet used when a judgment carries no scores:
 * every criterion gets a score of 50 and the reasoning 'Unable to evaluate'.
 *
 * @returns A new object with one { score, reasoning } entry per criterion.
 */
function createDefaultScores() {
    const criteria = [
        'toolSelection',
        'parameterAccuracy',
        'conversationEfficiency',
        'errorHandling',
        'userExperience',
        'toolDefinitionQuality',
    ];
    // Each criterion gets its own entry object so callers can edit one without touching the others.
    return Object.fromEntries(criteria.map((name) => [name, { score: 50, reasoning: 'Unable to evaluate' }]));
}
275
+ export default judgeSimulation;
276
+ //# sourceMappingURL=simulation-judge.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"simulation-judge.js","sourceRoot":"","sources":["../../../src/agent/features/simulation-judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAG9D,OAAO,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAsD7D,0DAA0D;AAC1D,gBAAgB;AAChB,0DAA0D;AAE1D,MAAM,mBAAmB,GAAG;;;;;;qDAMyB,CAAC;AAEtD,SAAS,mBAAmB,CAC1B,UAA4B,EAC5B,KAAqB;IAErB,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAChD,UAAU,CAAC,CAAC,WAAW,aAAa,CAAC,CAAC,aAAa,CAAC,IAAI,IAAI,aAAa,GACvE,CAAC,CAAC,aAAa,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC;QAClC,CAAC,CAAC,cAAc,CAAC,CAAC,aAAa,CAAC,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAC/C,GAAG,EAAE,CAAC,QAAQ,IAAI,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,SAAS,CAAC,GAAG,CAClD,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;QAChB,CAAC,CAAC,EACN,EAAE,CACH,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEf,OAAO;EACP,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;aAG/C,UAAU,CAAC,QAAQ,CAAC,MAAM;mBACpB,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,IAAI,MAAM;qBACrD,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,cAAc,CAAC;cAC1E,UAAU,CAAC,QAAQ,CAAC,QAAQ;gBAC1B,UAAU,CAAC,QAAQ,CAAC,UAAU;;;EAG5C,eAAe;;;aAGJ,UAAU,CAAC,YAAY;sBACd,UAAU,CAAC,eAAe;iBAC/B,UAAU,CAAC,WAAW;EACrC,UAAU,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC,CAAC,qBAAqB,UAAU,CAAC,UAAU,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAyDrG,CAAC;AACH,CAAC;AAED,0DAA0D;AAC1D,kBAAkB;AAClB,0DAA0D;AAE1D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAA4B,EAC5B,KAAqB,EACrB,GAAe;IAEf,MAAM,MAAM,GAAG,GAAG,IAAI,eAAe,EAAE,CAAC;IAExC,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,EAAE,CAAC;QAC1B,OAAO,sBAAsB,CAAC,UAAU,CAAC,CAAC;IAC5C,CAAC;IAED,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC;YACjC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,mBAAmB,EAAE;YAChD,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,mBAAmB,CAAC,UAAU,EAAE,KAAK,CAAC,EAAE;SAClE,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CA
AC;QAED,OAAO,qBAAqB,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,oBAAoB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACjG,OAAO,sBAAsB,CAAC,UAAU,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,WAA+B,EAC/B,KAAqB,EACrB,GAAe;IAEf,MAAM,MAAM,GAAG,GAAG,IAAI,eAAe,EAAE,CAAC;IAExC,gDAAgD;IAChD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;IACpD,MAAM,OAAO,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;IAEjD,MAAM,SAAS,GAAyB,EAAE,CAAC;IAE3C,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC;QAC3D,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IAC3B,CAAC;IAED,oBAAoB;IACpB,MAAM,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACrE,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC,MAAM,CAAC;IAC3E,MAAM,QAAQ,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM,CAAC;IAE1F,2BAA2B;IAC3B,MAAM,UAAU,GAAG,IAAI,GAAG,EAAiD,CAAC;IAE5E,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,KAAK,MAAM,WAAW,IAAI,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACpD,MAAM,GAAG,GAAG,WAAW,CAAC,KAAK,CAAC;YAC9B,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzB,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;YACtD,CAAC;YACD,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC;YACnC,KAAK,CAAC,KAAK,EAAE,CAAC;YACd,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACpC,CAAC;IACH,CAAC;IAED,MAAM,eAAe,GAAqB,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;SACvE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;SACvC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;SACX,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,IAAI,CAAC,EAAE,EAAE;QACvB,MAAM,gBAAgB,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAC1C,CAAC,CAAC,g
BAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,OAAO,CAAC,CAClD,CAAC;QACF,MAAM,mBAAmB,GAAG,gBAAgB,EAAE,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,OAAO,CAAC,CAAC;QAC9F,OAAO;YACL,OAAO;YACP,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,aAAa,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC;YACrC,YAAY,EAAE,mBAAmB,EAAE,YAAY,IAAI,wBAAwB;SAC5E,CAAC;IACJ,CAAC,CAAC,CAAC;IAEL,8BAA8B;IAC9B,MAAM,cAAc,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC1D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,KAAK,MAAM,WAAW,IAAI,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACpD,MAAM,GAAG,GAAG,GAAG,WAAW,CAAC,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;YACvD,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC7B,cAAc,CAAC,GAAG,CAAC,GAAG,EAAE,WAAW,CAAC,CAAC;YACvC,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAExE,OAAO;QACL,gBAAgB,EAAE,WAAW,CAAC,MAAM;QACpC,QAAQ,EAAE,CAAC,SAAS,GAAG,YAAY,GAAG,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM;QAC7D,QAAQ;QACR,eAAe;QACf,eAAe;QACf,OAAO,EAAE,aAAa,WAAW,CAAC,MAAM,gBAAgB;YACtD,cAAc,IAAI,CAAC,KAAK,CAAC,CAAC,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,KAAK;YACnE,kBAAkB,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,QAAQ;YAC9C,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC,IAAI,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,cAAc,eAAe,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,wBAAwB,EAAE;KAChI,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,MAAM,MAAM,GAAG,eAAe,CAAqB,QAAQ,CAAC,CAAC;IAE7D,6BAA6B;IAC7B,OAAO;QACL,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,mBAAmB,EAAE;QAC9C,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,EAAE;QACvC,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,SAAS;QACpC,UAAU,EAAE,MAAM,CAAC,UAAU,IAAI,EAAE;QACnC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,IAAI,EAAE;QAC/C,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,sBAAsB;KAClD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,sBAAsB,CAAC,UAA4B;IAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,UAAU,CAAC;IAEpC,OAAO;QACL,MAAM,EAAE;YACN,aAAa,EAAE;gBACb,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,eAAe,GAAG,GAAG,CAAC;gBAC9C,SAAS,EAAE,KAAK,CAAC,eAAe,KAAK,CAAC;oBACpC,CAAC,CAAC,oCAAoC;oBACtC,CAAC,CAAC,gCAAgC;aACrC;YACD
,iBAAiB,EAAE;gBACjB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,iBAAiB,GAAG,GAAG,CAAC;gBAChD,SAAS,EAAE,KAAK,CAAC,iBAAiB,IAAI,GAAG;oBACvC,CAAC,CAAC,qCAAqC;oBACvC,CAAC,CAAC,2CAA2C;aAChD;YACD,sBAAsB,EAAE;gBACtB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,eAAe,GAAG,GAAG,CAAC;gBAC9C,SAAS,EAAE,UAAU,CAAC,WAAW,KAAK,CAAC;oBACrC,CAAC,CAAC,2BAA2B;oBAC7B,CAAC,CAAC,QAAQ,UAAU,CAAC,WAAW,SAAS;aAC5C;YACD,aAAa,EAAE;gBACb,KAAK,EAAE,UAAU,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE;gBACxC,SAAS,EAAE,UAAU,CAAC,YAAY;oBAChC,CAAC,CAAC,6BAA6B;oBAC/B,CAAC,CAAC,oCAAoC;aACzC;YACD,cAAc,EAAE;gBACd,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,gEAAgE;aAC5E;YACD,qBAAqB,EAAE;gBACrB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,eAAe,GAAG,KAAK,CAAC,iBAAiB,CAAC,GAAG,EAAE,CAAC;gBACzE,SAAS,EAAE,oCAAoC;aAChD;SACF;QACD,YAAY,EAAE,KAAK,CAAC,YAAY;QAChC,OAAO,EAAE,UAAU,CAAC,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM;QACzF,UAAU,EAAE,UAAU,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;gBACjD,MAAM,EAAE,UAAU,CAAC,UAAU,CAAC,aAAa;gBAC3C,MAAM,EAAE,UAAU;gBAClB,QAAQ,EAAE,OAAO;aAClB,CAAC,CAAC,CAAC,CAAC,EAAE;QACP,gBAAgB,EAAE,KAAK,CAAC,iBAAiB,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC/C,IAAI,EAAE,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,IAAI,SAAS;gBAChE,KAAK,EAAE,+BAA+B;gBACtC,YAAY,EAAE,8CAA8C;gBAC5D,eAAe,EAAE,sCAAsC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,iBAAiB,GAAG,GAAG,CAAC,GAAG;aACpG,CAAC,CAAC,CAAC,CAAC,EAAE;QACP,OAAO,EAAE,UAAU,CAAC,YAAY;YAC9B,CAAC,CAAC,2BAA2B,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,qBAAqB;YAC/F,CAAC,CAAC,sBAAsB,UAAU,CAAC,UAAU,CAAC,aAAa,IAAI,gBAAgB,EAAE;KACpF,CAAC;AACJ,CAAC;AAED,SAAS,mBAAmB;IAC1B,OAAO;QACL,aAAa,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QAC7D,iBAAiB,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QACjE,sBAAsB,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QACtE,aAAa,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QAC7D,cAAc,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QAC9D,qBAAqB,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;KACtE,CAAC;AA
CJ,CAAC;AAED,eAAe,eAAe,CAAC"}
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Test Case Generator
3
+ *
4
+ * Uses LLM to generate realistic user prompts for testing WebMCP tools.
5
+ * Generates diverse test cases covering happy paths, edge cases, and adversarial inputs.
6
+ */
7
+ import { LLMClient } from '../llm-client.js';
8
+ import type { DetectedTool } from '../../core/types/tool.js';
9
+ export type TestCaseCategory = 'happy_path' | 'partial_info' | 'casual' | 'verbose' | 'ambiguous' | 'adversarial' | 'out_of_scope'; // Kinds of user prompt a test case can represent; the generation prompt sent to the LLM spells out what each category means.
10
+ export interface TestCase { // One generated user prompt together with the behavior expected from the agent.
11
+ id: string; // e.g. "TC-001"; the fallback generator numbers cases sequentially, zero-padded to three digits
12
+ prompt: string; // natural-language text a user would say to the assistant
13
+ category: TestCaseCategory;
14
+ difficulty: 1 | 2 | 3 | 4 | 5; // 1 = trivial, 5 = very hard (scale stated in the generation prompt)
15
+ expectedBehavior: {
16
+ shouldInvokeTool: boolean; // false for prompts no tool should handle (the fallback out_of_scope case sets it to false)
17
+ toolName: string | null; // null when no tool should be invoked
18
+ expectedParams: Record<string, unknown>; // parameter values the agent is expected to extract
19
+ paramsExplanation: Record<string, string>; // per-parameter note on where the expected value comes from
20
+ missingParams: string[]; // parameter names the prompt does not supply
21
+ shouldAskForMissing: boolean; // whether the agent is expected to ask for the missing parameters
22
+ acceptableAlternatives: string[]; // free-text descriptions of other acceptable interpretations
23
+ };
24
+ failureModes: string[]; // free-text descriptions of ways the agent might get this case wrong
25
+ }
26
+ export interface TestCaseGenerationConfig { // Settings for one generateTestCases call.
27
+ count: number; // number of cases requested in the LLM prompt; the fallback generator truncates its output to this many
28
+ categories: TestCaseCategory[]; // categories to generate; the fallback generator only emits happy_path, partial_info, casual and out_of_scope
29
+ model?: string; // forwarded to createLLMClient when no client is passed in
30
+ }
31
+ /**
32
+ * Generate test cases for the given tools.
+ *
+ * Sends a system prompt plus a generated user prompt to the LLM and returns the
+ * `testCases` array parsed from its reply. When the client reports it is unavailable,
+ * or the request or parsing throws, template-based fallback cases are returned instead.
+ *
+ * @param tools - Tools whose name, description and inputSchema are listed in the prompt.
+ * @param siteContext - Short description of the site, interpolated into the prompt.
+ * @param config - Requested count, categories and optional model name.
+ * @param llm - Client to use; when omitted one is created from `config.model`.
+ * @returns The generated (or fallback) test cases.
33
+ */
34
+ export declare function generateTestCases(tools: DetectedTool[], siteContext: string, config: TestCaseGenerationConfig, llm?: LLMClient): Promise<TestCase[]>;
35
+ export default generateTestCases;
@@ -0,0 +1,257 @@
1
+ /**
2
+ * Test Case Generator
3
+ *
4
+ * Uses LLM to generate realistic user prompts for testing WebMCP tools.
5
+ * Generates diverse test cases covering happy paths, edge cases, and adversarial inputs.
6
+ */
7
+ import chalk from 'chalk';
8
+ import { LLMClient, createLLMClient } from '../llm-client.js';
9
+ import { parseJsonObject } from '../../llm/json-response.js';
10
+ // ═══════════════════════════════════════════════════════
11
+ // PROMPTS
12
+ // ═══════════════════════════════════════════════════════
13
+ const SYSTEM_PROMPT = `You are a QA engineer specialized in testing AI agent tool interactions. Your job is to generate realistic test cases — natural language prompts that real users would say to an AI assistant, expecting specific tools to be invoked.
14
+
15
+ IMPORTANT RULES:
16
+ 1. Generate diverse phrasings — formal, casual, abbreviated, verbose
17
+ 2. Include edge cases — partial information, ambiguous requests
18
+ 3. Include realistic mistakes — wrong terminology, vague descriptions
19
+ 4. For each prompt, specify the EXPECTED correct behavior
20
+ 5. Be creative — real users are unpredictable
21
+ 6. Output valid JSON only — no markdown, no commentary outside the JSON`; // System-role message for test-case generation; generateTestCases sends it ahead of the user prompt built by buildGenerationPrompt.
22
+ function buildGenerationPrompt(tools, siteContext, config) { // Builds the user-role prompt: lists every tool (name, description, JSON-serialized inputSchema), the requested categories with their definitions, and the JSON shape the reply must follow.
23
+ const { count, categories } = config; // only count and categories are read from config here
24
+ return `Given the following WebMCP tools registered on a ${siteContext} website, generate exactly ${count} test case prompts.
25
+
26
+ TOOLS AVAILABLE:
27
+ ${tools.map(t => `
28
+ Tool: "${t.name}"
29
+ Description: "${t.description}"
30
+ Parameters: ${JSON.stringify(t.inputSchema, null, 2)}
31
+ `).join('\n---\n')}
32
+
33
+ Generate test cases across these categories: ${categories.join(', ')}
34
+
35
+ Category definitions:
36
+ - "happy_path": Clear, complete requests that should invoke the tool correctly with all required parameters
37
+ - "partial_info": Requests missing some required info — agent should ask for clarification OR use defaults
38
+ - "casual": Very informal, shorthand, or slang phrasings ("yo find me flights to nyc asap")
39
+ - "verbose": Overly detailed requests with extra context the agent must filter
40
+ - "ambiguous": Requests that COULD match multiple tools — tests disambiguation
41
+ - "adversarial": Typos, wrong terminology, misleading phrasing, or requests just outside the tool's scope
42
+ - "out_of_scope": Requests that NONE of the available tools should handle — agent should decline
43
+
44
+ For each test case, provide:
45
+ - The natural language prompt a user would say
46
+ - Which tool(s) should be invoked (or "none" if out of scope)
47
+ - The expected parameters the agent should extract
48
+ - What a correct agent response looks like
49
+ - What category this test case belongs to
50
+ - Difficulty rating for the agent (1=trivial, 5=very hard)
51
+
52
+ OUTPUT FORMAT (strict JSON):
53
+ {
54
+ "testCases": [
55
+ {
56
+ "id": "TC-001",
57
+ "prompt": "the exact user prompt",
58
+ "category": "happy_path",
59
+ "difficulty": 2,
60
+ "expectedBehavior": {
61
+ "shouldInvokeTool": true,
62
+ "toolName": "tool-name",
63
+ "expectedParams": {
64
+ "param1": "value1",
65
+ "param2": "value2"
66
+ },
67
+ "paramsExplanation": {
68
+ "param1": "Explicitly stated as X",
69
+ "param2": "Inferred from Y"
70
+ },
71
+ "missingParams": ["param3"],
72
+ "shouldAskForMissing": true,
73
+ "acceptableAlternatives": [
74
+ "Agent could interpret 'X' as 'Y'"
75
+ ]
76
+ },
77
+ "failureModes": [
78
+ "Agent might not recognize X as Y",
79
+ "Agent might pick wrong tool"
80
+ ]
81
+ }
82
+ ]
83
+ }`;
84
+ }
85
+ // ═══════════════════════════════════════════════════════
86
+ // GENERATOR
87
+ // ═══════════════════════════════════════════════════════
88
/**
 * Generate test cases for the given tools.
 *
 * Sends the system prompt and a generated user prompt to the LLM and returns the
 * `testCases` array parsed from the reply. Template-based fallback cases are returned
 * instead when the client reports it is unavailable, when the request or parsing throws,
 * or when the parsed reply does not contain a `testCases` array.
 *
 * @param {object[]} tools - Tools whose name, description and inputSchema go into the prompt.
 * @param {string} siteContext - Short description of the site, interpolated into the prompt.
 * @param {object} config - Generation settings: `count`, `categories`, optional `model`.
 * @param {object} [llm] - Client to use; when omitted one is created from `config.model`.
 * @returns {Promise<object[]>} The generated (or fallback) test cases.
 */
export async function generateTestCases(tools, siteContext, config, llm) {
    const client = llm || createLLMClient({ model: config.model });
    if (!client.isAvailable()) {
        // No usable LLM client: use the template-based fallback.
        return generateFallbackTestCases(tools, config);
    }
    try {
        const response = await client.chat([
            { role: 'system', content: SYSTEM_PROMPT },
            { role: 'user', content: buildGenerationPrompt(tools, siteContext, config) },
        ]);
        if (!response.text) {
            throw new Error('No response from LLM');
        }
        // Extract JSON from response
        const parsed = parseTestCaseResponse(response.text);
        // Well-formed JSON can still miss the expected shape; returning `undefined`
        // would break callers that iterate the result, so treat it as a failure.
        if (!Array.isArray(parsed?.testCases)) {
            throw new Error('LLM response did not contain a testCases array');
        }
        return parsed.testCases;
    }
    catch (error) {
        console.warn(chalk.yellow(`Test case generation failed: ${error instanceof Error ? error.message : error}`));
        return generateFallbackTestCases(tools, config);
    }
}
114
/**
 * Turn the raw LLM reply text into a parsed object.
 *
 * All parsing is delegated to `parseJsonObject`; this wrapper adds nothing of its own.
 *
 * @param {string} response - Raw text returned by the LLM.
 * @returns {object} Whatever `parseJsonObject` produces for that text.
 */
function parseTestCaseResponse(response) {
    const parsedPayload = parseJsonObject(response);
    return parsedPayload;
}
120
/**
 * Generate basic fallback test cases when LLM is unavailable.
 *
 * For every tool this emits, in order and only for categories listed in
 * `config.categories`: one `happy_path` case (all declared parameters filled with
 * example values), one `partial_info` case (only the first required parameter filled;
 * skipped when the tool has no required parameters) and one `casual` case. A single
 * `out_of_scope` case is appended after all tools. The list is then cut to
 * `config.count` entries.
 *
 * @param {object[]} tools - Tools with `name` and an optional JSON-Schema-like `inputSchema`.
 * @param {object} config - Uses `categories` (which cases to emit) and `count` (maximum returned).
 * @returns {object[]} Test cases with sequential ids `TC-001`, `TC-002`, ...
 */
function generateFallbackTestCases(tools, config) {
    const testCases = [];
    let sequence = 1;
    // Ids are allocated in emission order and zero-padded to three digits.
    const nextId = () => `TC-${String(sequence++).padStart(3, '0')}`;
    for (const tool of tools) {
        // Extract parameters from schema
        const properties = tool.inputSchema?.properties || {};
        // A malformed schema may carry a non-array `required`; treat that as "nothing required"
        // instead of indexing into a string or object.
        const declaredRequired = tool.inputSchema?.required;
        const required = Array.isArray(declaredRequired) ? declaredRequired : [];
        const paramNames = Object.keys(properties);
        // Happy path test
        if (config.categories.includes('happy_path')) {
            const params = {};
            const explanations = {};
            for (const param of paramNames) {
                // A property may be declared with a null schema; fall back to an empty one.
                params[param] = getExampleValue(param, properties[param] ?? {});
                explanations[param] = 'Example value for testing';
            }
            testCases.push({
                id: nextId(),
                prompt: `Use ${tool.name} with ${paramNames.join(', ')}`,
                category: 'happy_path',
                difficulty: 1,
                expectedBehavior: {
                    shouldInvokeTool: true,
                    toolName: tool.name,
                    expectedParams: params,
                    paramsExplanation: explanations,
                    missingParams: [],
                    shouldAskForMissing: false,
                    acceptableAlternatives: [],
                },
                failureModes: ['Agent might not recognize the tool name'],
            });
        }
        // Partial info test
        if (config.categories.includes('partial_info') && required.length > 0) {
            const params = {};
            const explanations = {};
            const firstRequired = required[0];
            if (firstRequired) {
                // `required` can name a parameter that `properties` never declares;
                // pass an empty schema so example generation cannot throw.
                params[firstRequired] = getExampleValue(firstRequired, properties[firstRequired] ?? {});
                explanations[firstRequired] = 'Only provided this parameter';
            }
            testCases.push({
                id: nextId(),
                prompt: `I want to ${tool.name.replace(/[-_]/g, ' ')}`,
                category: 'partial_info',
                difficulty: 3,
                expectedBehavior: {
                    shouldInvokeTool: true,
                    toolName: tool.name,
                    expectedParams: params,
                    paramsExplanation: explanations,
                    missingParams: required.slice(1),
                    shouldAskForMissing: true,
                    acceptableAlternatives: [],
                },
                failureModes: ['Agent might guess missing values'],
            });
        }
        // Casual test
        if (config.categories.includes('casual')) {
            testCases.push({
                id: nextId(),
                prompt: `hey can u ${tool.name.replace(/[-_]/g, ' ')} for me`,
                category: 'casual',
                difficulty: 2,
                expectedBehavior: {
                    shouldInvokeTool: true,
                    toolName: tool.name,
                    expectedParams: {},
                    paramsExplanation: {},
                    // Copy so the test case never shares the schema's own array.
                    missingParams: [...required],
                    shouldAskForMissing: true,
                    acceptableAlternatives: [],
                },
                failureModes: ['Agent might not understand casual phrasing'],
            });
        }
    }
    // Out of scope test
    if (config.categories.includes('out_of_scope')) {
        testCases.push({
            id: nextId(),
            prompt: 'What is the meaning of life?',
            category: 'out_of_scope',
            difficulty: 1,
            expectedBehavior: {
                shouldInvokeTool: false,
                toolName: null,
                expectedParams: {},
                paramsExplanation: {},
                missingParams: [],
                shouldAskForMissing: false,
                acceptableAlternatives: [],
            },
            failureModes: ['Agent might try to use a tool anyway'],
        });
    }
    return testCases.slice(0, config.count);
}
225
/**
 * Get an example value for a parameter.
 *
 * Resolution order:
 *   1. the first entry of a non-empty `enum` array;
 *   2. a value chosen by `type` — for strings the parameter name is checked for
 *      "date", "email" and "url" (case-insensitive) to pick a matching sample;
 *   3. the generic string 'example' when the type is missing or unrecognized.
 *
 * @param {string} name - Parameter name, used only for the string heuristics.
 * @param {object} [prop] - JSON-Schema-like description of the parameter. A missing or
 *   null schema is treated as empty rather than throwing.
 * @returns {*} An example value suitable for the parameter.
 */
function getExampleValue(name, prop) {
    // Callers may look up a schema that was never declared; tolerate undefined/null.
    const schema = prop ?? {};
    const enumValues = schema['enum'];
    // Only a real array is an enum list; anything else with a `length` is malformed.
    if (Array.isArray(enumValues) && enumValues.length > 0) {
        return enumValues[0];
    }
    // JSON Schema allows `type` to be a list (e.g. ['string', 'null']);
    // use the first concrete, non-null entry.
    const declaredType = schema['type'];
    const type = Array.isArray(declaredType)
        ? declaredType.find((candidate) => candidate !== 'null')
        : declaredType;
    switch (type) {
        case 'string': {
            const lowered = String(name).toLowerCase();
            if (lowered.includes('date'))
                return '2026-06-15';
            if (lowered.includes('email'))
                return 'test@example.com';
            if (lowered.includes('url'))
                return 'https://example.com';
            return 'example_value';
        }
        case 'number':
        case 'integer':
            return 1;
        case 'boolean':
            return true;
        case 'array':
            return [];
        case 'object':
            return {};
        default:
            return 'example';
    }
}
256
+ export default generateTestCases;
257
+ //# sourceMappingURL=test-case-generator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"test-case-generator.js","sourceRoot":"","sources":["../../../src/agent/features/test-case-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAE9D,OAAO,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAsC7D,0DAA0D;AAC1D,UAAU;AACV,0DAA0D;AAE1D,MAAM,aAAa,GAAG;;;;;;;;wEAQkD,CAAC;AAEzE,SAAS,qBAAqB,CAC5B,KAAqB,EACrB,WAAmB,EACnB,MAAgC;IAEhC,MAAM,EAAE,KAAK,EAAE,UAAU,EAAE,GAAG,MAAM,CAAC;IAErC,OAAO,oDAAoD,WAAW,8BAA8B,KAAK;;;EAGzG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;SACR,CAAC,CAAC,IAAI;gBACC,CAAC,CAAC,WAAW;cACf,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC;CACnD,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;;+CAE6B,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAkDlE,CAAC;AACH,CAAC;AAED,0DAA0D;AAC1D,YAAY;AACZ,0DAA0D;AAE1D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAqB,EACrB,WAAmB,EACnB,MAAgC,EAChC,GAAe;IAEf,MAAM,MAAM,GAAG,GAAG,IAAI,eAAe,CAAC,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IAE/D,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,EAAE,CAAC;QAC1B,6CAA6C;QAC7C,OAAO,yBAAyB,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAClD,CAAC;IAED,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC;YACjC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,aAAa,EAAE;YAC1C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,qBAAqB,CAAC,KAAK,EAAE,WAAW,EAAE,MAAM,CAAC,EAAE;SAC7E,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,6BAA6B;QAC7B,MAAM,MAAM,GAAG,qBAAqB,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACpD,OAAO,MAAM,CAAC,SAAS,CAAC;IAC1B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,gCAAgC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAC7G,OAAO,yBAAyB,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAClD,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,OAAO,eAAe,CAA4B,QAAQ,CAAC,CAAC;AAC9D,CAAC;AAED;;GAEG;AACH,SAAS,yBAAyB,CAChC,KAAqB,EACrB,MAAgC;IAEhC,MAAM,SAAS,GAAe,EAAE,CAAC;IACjC,IAAI,EAAE,GAAG,CAAC,CAAC;IAEX,KAAK,M
AAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,iCAAiC;QACjC,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,EAAE,UAAU,IAAI,EAAE,CAAC;QACtD,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,QAAQ,IAAI,EAAE,CAAa,CAAC;QAChE,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAE3C,kBAAkB;QAClB,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;YAC7C,MAAM,MAAM,GAA4B,EAAE,CAAC;YAC3C,MAAM,YAAY,GAA2B,EAAE,CAAC;YAEhD,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;gBAC/B,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAA4B,CAAC;gBAC1D,MAAM,CAAC,KAAK,CAAC,GAAG,eAAe,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;gBAC7C,YAAY,CAAC,KAAK,CAAC,GAAG,2BAA2B,CAAC;YACpD,CAAC;YAED,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzC,MAAM,EAAE,OAAO,IAAI,CAAC,IAAI,SAAS,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;gBACxD,QAAQ,EAAE,YAAY;gBACtB,UAAU,EAAE,CAAC;gBACb,gBAAgB,EAAE;oBAChB,gBAAgB,EAAE,IAAI;oBACtB,QAAQ,EAAE,IAAI,CAAC,IAAI;oBACnB,cAAc,EAAE,MAAM;oBACtB,iBAAiB,EAAE,YAAY;oBAC/B,aAAa,EAAE,EAAE;oBACjB,mBAAmB,EAAE,KAAK;oBAC1B,sBAAsB,EAAE,EAAE;iBAC3B;gBACD,YAAY,EAAE,CAAC,yCAAyC,CAAC;aAC1D,CAAC,CAAC;QACL,CAAC;QAED,oBAAoB;QACpB,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtE,MAAM,MAAM,GAA4B,EAAE,CAAC;YAC3C,MAAM,YAAY,GAA2B,EAAE,CAAC;YAChD,MAAM,aAAa,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;YAClC,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,IAAI,GAAG,UAAU,CAAC,aAAa,CAA4B,CAAC;gBAClE,MAAM,CAAC,aAAa,CAAC,GAAG,eAAe,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;gBAC7D,YAAY,CAAC,aAAa,CAAC,GAAG,8BAA8B,CAAC;YAC/D,CAAC;YAED,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzC,MAAM,EAAE,aAAa,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE;gBACtD,QAAQ,EAAE,cAAc;gBACxB,UAAU,EAAE,CAAC;gBACb,gBAAgB,EAAE;oBAChB,gBAAgB,EAAE,IAAI;oBACtB,QAAQ,EAAE,IAAI,CAAC,IAAI;oBACnB,cAAc,EAAE,MAAM;oBACtB,iBAAiB,EAAE,YAAY;oBAC/B,aAAa,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;oBAChC,mBAAmB,EAAE,IAAI;oBACzB,sBAAsB,EAAE,EAAE;iBAC3B;gBACD,YAAY,EAAE,CAAC,kCAAkC,CAAC;aACnD,CAAC,CAAC;QACL,CAAC;QAED,cAAc;Q
ACd,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzC,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzC,MAAM,EAAE,aAAa,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,SAAS;gBAC7D,QAAQ,EAAE,QAAQ;gBAClB,UAAU,EAAE,CAAC;gBACb,gBAAgB,EAAE;oBAChB,gBAAgB,EAAE,IAAI;oBACtB,QAAQ,EAAE,IAAI,CAAC,IAAI;oBACnB,cAAc,EAAE,EAAE;oBAClB,iBAAiB,EAAE,EAAE;oBACrB,aAAa,EAAE,QAAQ;oBACvB,mBAAmB,EAAE,IAAI;oBACzB,sBAAsB,EAAE,EAAE;iBAC3B;gBACD,YAAY,EAAE,CAAC,4CAA4C,CAAC;aAC7D,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;QAC/C,SAAS,CAAC,IAAI,CAAC;YACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;YACzC,MAAM,EAAE,8BAA8B;YACtC,QAAQ,EAAE,cAAc;YACxB,UAAU,EAAE,CAAC;YACb,gBAAgB,EAAE;gBAChB,gBAAgB,EAAE,KAAK;gBACvB,QAAQ,EAAE,IAAI;gBACd,cAAc,EAAE,EAAE;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,aAAa,EAAE,EAAE;gBACjB,mBAAmB,EAAE,KAAK;gBAC1B,sBAAsB,EAAE,EAAE;aAC3B;YACD,YAAY,EAAE,CAAC,sCAAsC,CAAC;SACvD,CAAC,CAAC;IACL,CAAC;IAED,OAAO,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,IAAY,EAAE,IAA6B;IAClE,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAW,CAAC;IACpC,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAc,CAAC;IAE7C,IAAI,UAAU,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,QAAQ;YACX,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAE,OAAO,YAAY,CAAC;YAC7D,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC;gBAAE,OAAO,kBAAkB,CAAC;YACpE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC;gBAAE,OAAO,qBAAqB,CAAC;YACrE,OAAO,eAAe,CAAC;QACzB,KAAK,QAAQ,CAAC;QACd,KAAK,SAAS;YACZ,OAAO,CAAC,CAAC;QACX,KAAK,SAAS;YACZ,OAAO,IAAI,CAAC;QACd,KAAK,OAAO;YACV,OAAO,EAAE,CAAC;QACZ,KAAK,QAAQ;YACX,OAAO,EAAE,CAAC;QACZ;YACE,OAAO,SAAS,CAAC;IACrB,CAAC;AACH,CAAC;AAED,eAAe,iBAAiB,CAAC"}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Agent Module
3
+ *
4
+ * LLM-powered agent simulation for testing WebMCP tools.
5
+ */
6
+ export { LLMClient, createLLMClient, type LLMClientConfig, type LLMMessage, type LLMTool, type LLMToolCall, type LLMResponse, } from './llm-client.js';
7
+ export { generateTestCases, runSimulation, runSimulations, judgeSimulation, judgeSimulations, type TestCase, type TestCaseCategory, type TestCaseGenerationConfig, type SimulationConfig, type SimulationResult, type SimulationRound, type ToolResult, type RoundEvaluation, type SimulationJudgment, type JudgmentResult, type KeyMoment, type ToolImprovement, type AggregateJudgment, type FailurePattern, } from './features/index.js';
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Agent Module
3
+ *
4
+ * LLM-powered agent simulation for testing WebMCP tools.
5
+ */
6
+ // LLM Client
7
+ export { LLMClient, createLLMClient, } from './llm-client.js';
8
+ // Features
9
+ export { generateTestCases, runSimulation, runSimulations, judgeSimulation, judgeSimulations, } from './features/index.js';
10
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/agent/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,aAAa;AACb,OAAO,EACL,SAAS,EACT,eAAe,GAMhB,MAAM,iBAAiB,CAAC;AAEzB,WAAW;AACX,OAAO,EACL,iBAAiB,EACjB,aAAa,EACb,cAAc,EACd,eAAe,EACf,gBAAgB,GAejB,MAAM,qBAAqB,CAAC"}