@tyvm/knowhow 0.0.32 → 0.0.34

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (508)
  1. package/autodoc/plugins/downloader/downloader.mdx +2 -2
  2. package/benchmarks/.dockerignore +7 -0
  3. package/benchmarks/README.md +166 -0
  4. package/benchmarks/docker/Dockerfile +68 -0
  5. package/benchmarks/example-config.yml +27 -0
  6. package/benchmarks/jest.config.js +13 -0
  7. package/benchmarks/package-lock.json +4297 -0
  8. package/benchmarks/package.json +39 -0
  9. package/benchmarks/results/4542435/2025-08-05/lms/lms-openai-gpt-oss-20b.json +2814 -0
  10. package/benchmarks/results/4542435/2025-08-05/lms/lms-qwen-qwen3-30b-a3b-2507.json +2014 -0
  11. package/benchmarks/results/4fb9125/2025-08-07/anthropic/anthropic-claude-sonnet-4-20250514.json +3121 -0
  12. package/benchmarks/results/5766aee/2025-08-02/lms-qwen/qwen3-coder-30b.json +98 -0
  13. package/benchmarks/results/6d73808/2025-08-07/openai/openai-gpt-5.json +3256 -0
  14. package/benchmarks/results/77bf0a6/2025-08-02/lms-qwen/qwen3-30b-a3b-2507.json +4298 -0
  15. package/benchmarks/results/8c0d445/2025-08-03/anthropic/anthropic-claude-sonnet-4-20250514.json +3031 -0
  16. package/benchmarks/results/8c0d445/2025-08-03/openai/openai-gpt-4.1-2025-04-14.json +2990 -0
  17. package/benchmarks/results/ac6b2ab/2025-08-03/anthropic/anthropic-claude-sonnet-4-20250514.json +3256 -0
  18. package/benchmarks/results/ac6b2ab/2025-08-03/lms/lms-qwen-qwen3-coder-30b.json +3007 -0
  19. package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-2025-04-14.json +3256 -0
  20. package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-mini-2025-04-14.json +3036 -0
  21. package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-nano-2025-04-14.json +3280 -0
  22. package/benchmarks/results/adff675/2025-08-04/lms/lms-qwen-qwen3-30b-a3b-2507.json +1920 -0
  23. package/benchmarks/results/adff675/2025-08-04/lms/lms-qwen-qwen3-coder-30b.json +3281 -0
  24. package/benchmarks/results/b502ed9/2025-08-03/lms-qwen/qwen3-coder-30b.json +2896 -0
  25. package/benchmarks/results/d1a8129/2025-08-03/lms/lms-qwen-qwen3-coder-30b.json +3011 -0
  26. package/benchmarks/results/e60471c/2025-08-03/lms/qwen3-30b-a3b-2507.json +3003 -0
  27. package/benchmarks/scripts/build-and-run.sh +47 -0
  28. package/benchmarks/scripts/clone-exercism.sh +92 -0
  29. package/benchmarks/scripts/validate.sh +48 -0
  30. package/benchmarks/src/__tests__/runner.test.ts +27 -0
  31. package/benchmarks/src/cli.ts +90 -0
  32. package/benchmarks/src/evaluators/EvaluatorRegistry.ts +64 -0
  33. package/benchmarks/src/evaluators/JavaScriptEvaluator.ts +183 -0
  34. package/benchmarks/src/evaluators/index.ts +3 -0
  35. package/benchmarks/src/evaluators/types.ts +22 -0
  36. package/benchmarks/src/index.ts +3 -0
  37. package/benchmarks/src/providers.ts +13 -0
  38. package/benchmarks/src/runner.ts +824 -0
  39. package/benchmarks/src/types.ts +63 -0
  40. package/benchmarks/tsconfig.json +19 -0
  41. package/jest.config.js +2 -1
  42. package/leaderboard/README.md +148 -0
  43. package/leaderboard/app/api/benchmark-data/route.ts +131 -0
  44. package/leaderboard/app/api/benchmark-detail/route.ts +172 -0
  45. package/leaderboard/app/details/[model]/[provider]/[language]/page.tsx +501 -0
  46. package/leaderboard/app/exercise/[model]/[provider]/[language]/[exercise]/page.tsx +375 -0
  47. package/leaderboard/app/globals.css +27 -0
  48. package/leaderboard/app/layout.tsx +21 -0
  49. package/leaderboard/app/page.tsx +170 -0
  50. package/leaderboard/components/LeaderboardTable.tsx +168 -0
  51. package/leaderboard/components/PerformanceChart.tsx +109 -0
  52. package/leaderboard/next-env.d.ts +5 -0
  53. package/leaderboard/next.config.js +4 -0
  54. package/leaderboard/package-lock.json +6363 -0
  55. package/leaderboard/package.json +28 -0
  56. package/leaderboard/postcss.config.js +6 -0
  57. package/leaderboard/tailwind.config.js +17 -0
  58. package/leaderboard/tsconfig.json +28 -0
  59. package/leaderboard/types/benchmark.ts +67 -0
  60. package/leaderboard/utils/dataProcessor.ts +33 -0
  61. package/package.json +2 -1
  62. package/src/agents/base/base.ts +147 -21
  63. package/src/agents/base/prompt.ts +28 -0
  64. package/src/agents/index.ts +3 -0
  65. package/src/agents/patcher/patcher.ts +6 -4
  66. package/src/agents/setup/setup.ts +56 -0
  67. package/src/agents/tools/agentCall.ts +6 -2
  68. package/src/agents/tools/aiClient.ts +74 -8
  69. package/src/agents/tools/execCommand.ts +13 -14
  70. package/src/agents/tools/executeScript/README.md +16 -0
  71. package/src/agents/tools/index.ts +2 -0
  72. package/src/agents/tools/list.ts +73 -16
  73. package/src/agents/tools/startAgentTask.ts +109 -0
  74. package/src/agents/tools/textSearch.ts +1 -1
  75. package/src/agents/tools/visionTool.ts +31 -2
  76. package/src/agents/tools/ycmd/client.ts +608 -0
  77. package/src/agents/tools/ycmd/definitions.ts +294 -0
  78. package/src/agents/tools/ycmd/detection.ts +211 -0
  79. package/src/agents/tools/ycmd/index.ts +11 -0
  80. package/src/agents/tools/ycmd/installer.ts +251 -0
  81. package/src/agents/tools/ycmd/server.ts +535 -0
  82. package/src/agents/tools/ycmd/serverManager.ts +316 -0
  83. package/src/agents/tools/ycmd/tools/completion.ts +113 -0
  84. package/src/agents/tools/ycmd/tools/diagnostics.ts +155 -0
  85. package/src/agents/tools/ycmd/tools/getLocations.ts +173 -0
  86. package/src/agents/tools/ycmd/tools/goto.ts +169 -0
  87. package/src/agents/tools/ycmd/tools/refactor.ts +204 -0
  88. package/src/agents/tools/ycmd/tools/signature.ts +174 -0
  89. package/src/agents/tools/ycmd/tools/start.ts +95 -0
  90. package/src/agents/tools/ycmd/utils/pathUtils.ts +59 -0
  91. package/src/ai.ts +15 -0
  92. package/src/chat/CliChatService.ts +277 -0
  93. package/src/chat/modules/AgentModule.ts +980 -0
  94. package/src/chat/modules/AskModule.ts +98 -0
  95. package/src/chat/modules/BaseChatModule.ts +66 -0
  96. package/src/chat/modules/InternalChatModule.ts +174 -0
  97. package/src/chat/modules/SearchModule.ts +166 -0
  98. package/src/chat/modules/SetupModule.ts +185 -0
  99. package/src/chat/modules/SystemModule.ts +120 -0
  100. package/src/chat/modules/VoiceModule.ts +70 -0
  101. package/src/chat/modules/index.js +5 -0
  102. package/src/chat/types.ts +97 -0
  103. package/src/chat.ts +9 -1
  104. package/src/chat2.ts +62 -0
  105. package/src/cli.ts +264 -35
  106. package/src/clients/anthropic.ts +14 -7
  107. package/src/clients/gemini.ts +15 -7
  108. package/src/clients/http.ts +17 -7
  109. package/src/clients/index.ts +117 -4
  110. package/src/clients/knowhow.ts +7 -2
  111. package/src/clients/knowhowMcp.ts +118 -0
  112. package/src/clients/openai.ts +32 -8
  113. package/src/clients/types.ts +1 -0
  114. package/src/clients/xai.ts +17 -5
  115. package/src/config.ts +30 -5
  116. package/src/conversion.ts +4 -1
  117. package/src/embeddings.ts +79 -23
  118. package/src/login.ts +26 -9
  119. package/src/microphone.ts +0 -1
  120. package/src/plugins/downloader/downloader.ts +72 -24
  121. package/src/plugins/downloader/plugin.ts +3 -1
  122. package/src/plugins/plugins.ts +3 -0
  123. package/src/processors/CustomVariables.ts +425 -0
  124. package/src/processors/HarmonyToolProcessor.ts +264 -0
  125. package/src/processors/XmlToolCallProcessor.ts +533 -0
  126. package/src/processors/index.ts +3 -0
  127. package/src/prompts/KnowhowConfigExamples.ts +376 -0
  128. package/src/services/KnowhowClient.ts +49 -3
  129. package/src/services/Mcp.ts +42 -3
  130. package/src/services/McpServer.ts +14 -4
  131. package/src/services/McpWebsocketTransport.ts +21 -7
  132. package/src/services/MessageProcessor.ts +10 -5
  133. package/src/services/index.ts +5 -0
  134. package/src/services/script-execution/ScriptExecutor.ts +34 -1
  135. package/src/services/types.ts +17 -14
  136. package/src/types.ts +17 -0
  137. package/src/utils/index.ts +138 -0
  138. package/tests/XmlToolCallProcessor.test.ts +468 -0
  139. package/tests/manual/ycmd/debug_diagnostics_test.ts +127 -0
  140. package/tests/manual/ycmd/fixtures/debug_diagnostics.ts +26 -0
  141. package/tests/manual/ycmd/fixtures/file_change_test.ts +17 -0
  142. package/tests/manual/ycmd/minimal_advanced_test.ts +108 -0
  143. package/tests/manual/ycmd/simple_diagnostics_test.ts +61 -0
  144. package/tests/manual/ycmd/simple_test.ts +74 -0
  145. package/tests/manual/ycmd/test-typescript-sample.ts +34 -0
  146. package/tests/manual/ycmd/test_advanced_features.ts +407 -0
  147. package/tests/manual/ycmd/test_advanced_with_tools.ts +320 -0
  148. package/tests/manual/ycmd/test_comprehensive_typescript.ts +179 -0
  149. package/tests/manual/ycmd/test_diagnostics_file_changes.ts +249 -0
  150. package/tests/manual/ycmd/test_diagnostics_fix.ts +99 -0
  151. package/tests/manual/ycmd/test_diagnostics_simple.ts +100 -0
  152. package/tests/manual/ycmd/test_diagnostics_timing.ts +120 -0
  153. package/tests/manual/ycmd/test_discover_commands.ts +310 -0
  154. package/tests/manual/ycmd/test_endpoints.ts +115 -0
  155. package/tests/manual/ycmd/test_final_comprehensive.ts +218 -0
  156. package/tests/manual/ycmd/test_final_validation.ts +150 -0
  157. package/tests/manual/ycmd/test_implementation.js +42 -0
  158. package/tests/manual/ycmd/test_individual_ycmd_tool.ts +39 -0
  159. package/tests/manual/ycmd/test_server_manager.ts +52 -0
  160. package/tests/manual/ycmd/test_simple_debug.ts +86 -0
  161. package/tests/manual/ycmd/test_tsserver_workflow.js +83 -0
  162. package/tests/manual/ycmd/test_tsserver_workflow.ts +122 -0
  163. package/tests/manual/ycmd/test_typescript_simple.ts +48 -0
  164. package/tests/manual/ycmd/test_typescript_ycmd.ts +105 -0
  165. package/tests/manual/ycmd/test_workspace_config.ts +90 -0
  166. package/tests/manual/ycmd/test_ycmd_auto_start.ts +137 -0
  167. package/tests/manual/ycmd/test_ycmd_comprehensive.ts +73 -0
  168. package/tests/manual/ycmd/test_ycmd_connection.py +10 -0
  169. package/tests/manual/ycmd/test_ycmd_direct.ts +142 -0
  170. package/tests/manual/ycmd/test_ycmd_experiment.ts +48 -0
  171. package/tests/manual/ycmd/test_ycmd_final.ts +200 -0
  172. package/tests/manual/ycmd/test_ycmd_fixed.py +18 -0
  173. package/tests/manual/ycmd/test_ycmd_integration.ts +112 -0
  174. package/tests/manual/ycmd/test_ycmd_simple.ts +45 -0
  175. package/tests/manual/ycmd/test_ycmd_usage.py +27 -0
  176. package/tests/manual/ycmd/working_simple_test.ts +134 -0
  177. package/ts_build/src/agents/base/base.d.ts +14 -1
  178. package/ts_build/src/agents/base/base.js +91 -17
  179. package/ts_build/src/agents/base/base.js.map +1 -1
  180. package/ts_build/src/agents/base/prompt.d.ts +1 -1
  181. package/ts_build/src/agents/base/prompt.js +28 -0
  182. package/ts_build/src/agents/base/prompt.js.map +1 -1
  183. package/ts_build/src/agents/index.d.ts +2 -0
  184. package/ts_build/src/agents/index.js +2 -0
  185. package/ts_build/src/agents/index.js.map +1 -1
  186. package/ts_build/src/agents/patcher/patcher.js +6 -3
  187. package/ts_build/src/agents/patcher/patcher.js.map +1 -1
  188. package/ts_build/src/agents/setup/setup.d.ts +8 -0
  189. package/ts_build/src/agents/setup/setup.js +59 -0
  190. package/ts_build/src/agents/setup/setup.js.map +1 -0
  191. package/ts_build/src/agents/tools/agentCall.js +5 -2
  192. package/ts_build/src/agents/tools/agentCall.js.map +1 -1
  193. package/ts_build/src/agents/tools/aiClient.d.ts +6 -5
  194. package/ts_build/src/agents/tools/aiClient.js +37 -6
  195. package/ts_build/src/agents/tools/aiClient.js.map +1 -1
  196. package/ts_build/src/agents/tools/execCommand.d.ts +2 -2
  197. package/ts_build/src/agents/tools/execCommand.js +5 -6
  198. package/ts_build/src/agents/tools/execCommand.js.map +1 -1
  199. package/ts_build/src/agents/tools/executeScript/index.d.ts +1 -1
  200. package/ts_build/src/agents/tools/index.d.ts +2 -0
  201. package/ts_build/src/agents/tools/index.js +2 -0
  202. package/ts_build/src/agents/tools/index.js.map +1 -1
  203. package/ts_build/src/agents/tools/list.js +66 -16
  204. package/ts_build/src/agents/tools/list.js.map +1 -1
  205. package/ts_build/src/agents/tools/startAgentTask.d.ts +13 -0
  206. package/ts_build/src/agents/tools/startAgentTask.js +74 -0
  207. package/ts_build/src/agents/tools/startAgentTask.js.map +1 -0
  208. package/ts_build/src/agents/tools/startChatTask.d.ts +13 -0
  209. package/ts_build/src/agents/tools/startChatTask.js +73 -0
  210. package/ts_build/src/agents/tools/startChatTask.js.map +1 -0
  211. package/ts_build/src/agents/tools/textSearch.js +1 -1
  212. package/ts_build/src/agents/tools/textSearch.js.map +1 -1
  213. package/ts_build/src/agents/tools/visionTool.d.ts +1 -1
  214. package/ts_build/src/agents/tools/visionTool.js +23 -3
  215. package/ts_build/src/agents/tools/visionTool.js.map +1 -1
  216. package/ts_build/src/agents/tools/ycmd/client.d.ts +93 -0
  217. package/ts_build/src/agents/tools/ycmd/client.js +355 -0
  218. package/ts_build/src/agents/tools/ycmd/client.js.map +1 -0
  219. package/ts_build/src/agents/tools/ycmd/definitions.d.ts +345 -0
  220. package/ts_build/src/agents/tools/ycmd/definitions.js +298 -0
  221. package/ts_build/src/agents/tools/ycmd/definitions.js.map +1 -0
  222. package/ts_build/src/agents/tools/ycmd/detection.d.ts +11 -0
  223. package/ts_build/src/agents/tools/ycmd/detection.js +175 -0
  224. package/ts_build/src/agents/tools/ycmd/detection.js.map +1 -0
  225. package/ts_build/src/agents/tools/ycmd/index.d.ts +8 -0
  226. package/ts_build/src/agents/tools/ycmd/index.js +20 -0
  227. package/ts_build/src/agents/tools/ycmd/index.js.map +1 -0
  228. package/ts_build/src/agents/tools/ycmd/installer.d.ts +19 -0
  229. package/ts_build/src/agents/tools/ycmd/installer.js +196 -0
  230. package/ts_build/src/agents/tools/ycmd/installer.js.map +1 -0
  231. package/ts_build/src/agents/tools/ycmd/server.d.ts +35 -0
  232. package/ts_build/src/agents/tools/ycmd/server.js +363 -0
  233. package/ts_build/src/agents/tools/ycmd/server.js.map +1 -0
  234. package/ts_build/src/agents/tools/ycmd/serverManager.d.ts +39 -0
  235. package/ts_build/src/agents/tools/ycmd/serverManager.js +210 -0
  236. package/ts_build/src/agents/tools/ycmd/serverManager.js.map +1 -0
  237. package/ts_build/src/agents/tools/ycmd/tools/completion.d.ts +22 -0
  238. package/ts_build/src/agents/tools/ycmd/tools/completion.js +72 -0
  239. package/ts_build/src/agents/tools/ycmd/tools/completion.js.map +1 -0
  240. package/ts_build/src/agents/tools/ycmd/tools/diagnostics.d.ts +42 -0
  241. package/ts_build/src/agents/tools/ycmd/tools/diagnostics.js +88 -0
  242. package/ts_build/src/agents/tools/ycmd/tools/diagnostics.js.map +1 -0
  243. package/ts_build/src/agents/tools/ycmd/tools/getLocations.d.ts +22 -0
  244. package/ts_build/src/agents/tools/ycmd/tools/getLocations.js +142 -0
  245. package/ts_build/src/agents/tools/ycmd/tools/getLocations.js.map +1 -0
  246. package/ts_build/src/agents/tools/ycmd/tools/goto.d.ts +20 -0
  247. package/ts_build/src/agents/tools/ycmd/tools/goto.js +101 -0
  248. package/ts_build/src/agents/tools/ycmd/tools/goto.js.map +1 -0
  249. package/ts_build/src/agents/tools/ycmd/tools/refactor.d.ts +32 -0
  250. package/ts_build/src/agents/tools/ycmd/tools/refactor.js +123 -0
  251. package/ts_build/src/agents/tools/ycmd/tools/refactor.js.map +1 -0
  252. package/ts_build/src/agents/tools/ycmd/tools/signature.d.ts +25 -0
  253. package/ts_build/src/agents/tools/ycmd/tools/signature.js +110 -0
  254. package/ts_build/src/agents/tools/ycmd/tools/signature.js.map +1 -0
  255. package/ts_build/src/agents/tools/ycmd/tools/start.d.ts +17 -0
  256. package/ts_build/src/agents/tools/ycmd/tools/start.js +65 -0
  257. package/ts_build/src/agents/tools/ycmd/tools/start.js.map +1 -0
  258. package/ts_build/src/agents/tools/ycmd/utils/pathUtils.d.ts +4 -0
  259. package/ts_build/src/agents/tools/ycmd/utils/pathUtils.js +67 -0
  260. package/ts_build/src/agents/tools/ycmd/utils/pathUtils.js.map +1 -0
  261. package/ts_build/src/ai.d.ts +1 -0
  262. package/ts_build/src/ai.js +40 -1
  263. package/ts_build/src/ai.js.map +1 -1
  264. package/ts_build/src/chat/ChatCommandHandler.d.ts +36 -0
  265. package/ts_build/src/chat/ChatCommandHandler.js +268 -0
  266. package/ts_build/src/chat/ChatCommandHandler.js.map +1 -0
  267. package/ts_build/src/chat/ChatInputManager.d.ts +22 -0
  268. package/ts_build/src/chat/ChatInputManager.js +85 -0
  269. package/ts_build/src/chat/ChatInputManager.js.map +1 -0
  270. package/ts_build/src/chat/ChatManager.d.ts +49 -0
  271. package/ts_build/src/chat/ChatManager.js +271 -0
  272. package/ts_build/src/chat/ChatManager.js.map +1 -0
  273. package/ts_build/src/chat/ChatSession.d.ts +32 -0
  274. package/ts_build/src/chat/ChatSession.js +3 -0
  275. package/ts_build/src/chat/ChatSession.js.map +1 -0
  276. package/ts_build/src/chat/ChatSessionManager.d.ts +19 -0
  277. package/ts_build/src/chat/ChatSessionManager.js +188 -0
  278. package/ts_build/src/chat/ChatSessionManager.js.map +1 -0
  279. package/ts_build/src/chat/ChatStateManager.d.ts +58 -0
  280. package/ts_build/src/chat/ChatStateManager.js +156 -0
  281. package/ts_build/src/chat/ChatStateManager.js.map +1 -0
  282. package/ts_build/src/chat/CliChatService.d.ts +35 -0
  283. package/ts_build/src/chat/CliChatService.js +201 -0
  284. package/ts_build/src/chat/CliChatService.js.map +1 -0
  285. package/ts_build/src/chat/InterruptibleInput.d.ts +20 -0
  286. package/ts_build/src/chat/InterruptibleInput.js +109 -0
  287. package/ts_build/src/chat/InterruptibleInput.js.map +1 -0
  288. package/ts_build/src/chat/interfaces/ChatModule.d.ts +6 -0
  289. package/ts_build/src/chat/interfaces/ChatModule.js +3 -0
  290. package/ts_build/src/chat/interfaces/ChatModule.js.map +1 -0
  291. package/ts_build/src/chat/modules/AgentModule.d.ts +56 -0
  292. package/ts_build/src/chat/modules/AgentModule.js +705 -0
  293. package/ts_build/src/chat/modules/AgentModule.js.map +1 -0
  294. package/ts_build/src/chat/modules/AskModule.d.ts +10 -0
  295. package/ts_build/src/chat/modules/AskModule.js +63 -0
  296. package/ts_build/src/chat/modules/AskModule.js.map +1 -0
  297. package/ts_build/src/chat/modules/BaseChatModule.d.ts +14 -0
  298. package/ts_build/src/chat/modules/BaseChatModule.js +32 -0
  299. package/ts_build/src/chat/modules/BaseChatModule.js.map +1 -0
  300. package/ts_build/src/chat/modules/InternalChatModule.d.ts +24 -0
  301. package/ts_build/src/chat/modules/InternalChatModule.js +127 -0
  302. package/ts_build/src/chat/modules/InternalChatModule.js.map +1 -0
  303. package/ts_build/src/chat/modules/SearchModule.d.ts +12 -0
  304. package/ts_build/src/chat/modules/SearchModule.js +119 -0
  305. package/ts_build/src/chat/modules/SearchModule.js.map +1 -0
  306. package/ts_build/src/chat/modules/SetupModule.d.ts +15 -0
  307. package/ts_build/src/chat/modules/SetupModule.js +147 -0
  308. package/ts_build/src/chat/modules/SetupModule.js.map +1 -0
  309. package/ts_build/src/chat/modules/SystemModule.d.ts +14 -0
  310. package/ts_build/src/chat/modules/SystemModule.js +90 -0
  311. package/ts_build/src/chat/modules/SystemModule.js.map +1 -0
  312. package/ts_build/src/chat/modules/VoiceModule.d.ts +11 -0
  313. package/ts_build/src/chat/modules/VoiceModule.js +57 -0
  314. package/ts_build/src/chat/modules/VoiceModule.js.map +1 -0
  315. package/ts_build/src/chat/types.d.ts +83 -0
  316. package/ts_build/src/chat/types.js +3 -0
  317. package/ts_build/src/chat/types.js.map +1 -0
  318. package/ts_build/src/chat.js +7 -1
  319. package/ts_build/src/chat.js.map +1 -1
  320. package/ts_build/src/chat2.d.ts +3 -0
  321. package/ts_build/src/chat2.js +47 -0
  322. package/ts_build/src/chat2.js.map +1 -0
  323. package/ts_build/src/cli.js +218 -37
  324. package/ts_build/src/cli.js.map +1 -1
  325. package/ts_build/src/clients/anthropic.d.ts +5 -2
  326. package/ts_build/src/clients/anthropic.js +12 -7
  327. package/ts_build/src/clients/anthropic.js.map +1 -1
  328. package/ts_build/src/clients/gemini.d.ts +6 -3
  329. package/ts_build/src/clients/gemini.js +13 -7
  330. package/ts_build/src/clients/gemini.js.map +1 -1
  331. package/ts_build/src/clients/http.d.ts +1 -0
  332. package/ts_build/src/clients/http.js +12 -5
  333. package/ts_build/src/clients/http.js.map +1 -1
  334. package/ts_build/src/clients/index.d.ts +10 -0
  335. package/ts_build/src/clients/index.js +74 -4
  336. package/ts_build/src/clients/index.js.map +1 -1
  337. package/ts_build/src/clients/knowhow.d.ts +3 -1
  338. package/ts_build/src/clients/knowhow.js +8 -2
  339. package/ts_build/src/clients/knowhow.js.map +1 -1
  340. package/ts_build/src/clients/knowhowMcp.d.ts +20 -0
  341. package/ts_build/src/clients/knowhowMcp.js +86 -0
  342. package/ts_build/src/clients/knowhowMcp.js.map +1 -0
  343. package/ts_build/src/clients/openai.d.ts +5 -2
  344. package/ts_build/src/clients/openai.js +29 -8
  345. package/ts_build/src/clients/openai.js.map +1 -1
  346. package/ts_build/src/clients/types.d.ts +1 -0
  347. package/ts_build/src/clients/xai.d.ts +5 -2
  348. package/ts_build/src/clients/xai.js +15 -5
  349. package/ts_build/src/clients/xai.js.map +1 -1
  350. package/ts_build/src/config.js +24 -3
  351. package/ts_build/src/config.js.map +1 -1
  352. package/ts_build/src/conversion.js +6 -4
  353. package/ts_build/src/conversion.js.map +1 -1
  354. package/ts_build/src/embeddings.d.ts +2 -1
  355. package/ts_build/src/embeddings.js +62 -17
  356. package/ts_build/src/embeddings.js.map +1 -1
  357. package/ts_build/src/login.d.ts +1 -1
  358. package/ts_build/src/login.js +21 -7
  359. package/ts_build/src/login.js.map +1 -1
  360. package/ts_build/src/microphone.js.map +1 -1
  361. package/ts_build/src/plugins/downloader/downloader.d.ts +4 -5
  362. package/ts_build/src/plugins/downloader/downloader.js +55 -26
  363. package/ts_build/src/plugins/downloader/downloader.js.map +1 -1
  364. package/ts_build/src/plugins/downloader/plugin.js +5 -3
  365. package/ts_build/src/plugins/downloader/plugin.js.map +1 -1
  366. package/ts_build/src/plugins/plugins.js +3 -0
  367. package/ts_build/src/plugins/plugins.js.map +1 -1
  368. package/ts_build/src/processors/CustomVariables.d.ts +32 -0
  369. package/ts_build/src/processors/CustomVariables.js +297 -0
  370. package/ts_build/src/processors/CustomVariables.js.map +1 -0
  371. package/ts_build/src/processors/HarmonyToolProcessor.d.ts +15 -0
  372. package/ts_build/src/processors/HarmonyToolProcessor.js +154 -0
  373. package/ts_build/src/processors/HarmonyToolProcessor.js.map +1 -0
  374. package/ts_build/src/processors/XmlToolCallProcessor.d.ts +14 -0
  375. package/ts_build/src/processors/XmlToolCallProcessor.js +357 -0
  376. package/ts_build/src/processors/XmlToolCallProcessor.js.map +1 -0
  377. package/ts_build/src/processors/index.d.ts +3 -0
  378. package/ts_build/src/processors/index.js +7 -1
  379. package/ts_build/src/processors/index.js.map +1 -1
  380. package/ts_build/src/prompts/KnowhowConfigExamples.d.ts +2 -0
  381. package/ts_build/src/prompts/KnowhowConfigExamples.js +379 -0
  382. package/ts_build/src/prompts/KnowhowConfigExamples.js.map +1 -0
  383. package/ts_build/src/services/KnowhowClient.d.ts +22 -0
  384. package/ts_build/src/services/KnowhowClient.js +14 -2
  385. package/ts_build/src/services/KnowhowClient.js.map +1 -1
  386. package/ts_build/src/services/Mcp.d.ts +1 -0
  387. package/ts_build/src/services/Mcp.js +20 -3
  388. package/ts_build/src/services/Mcp.js.map +1 -1
  389. package/ts_build/src/services/McpServer.d.ts +1 -1
  390. package/ts_build/src/services/McpServer.js +8 -4
  391. package/ts_build/src/services/McpServer.js.map +1 -1
  392. package/ts_build/src/services/McpWebsocketTransport.js +17 -7
  393. package/ts_build/src/services/McpWebsocketTransport.js.map +1 -1
  394. package/ts_build/src/services/MessageProcessor.d.ts +1 -1
  395. package/ts_build/src/services/MessageProcessor.js +4 -4
  396. package/ts_build/src/services/MessageProcessor.js.map +1 -1
  397. package/ts_build/src/services/index.d.ts +2 -0
  398. package/ts_build/src/services/index.js +4 -0
  399. package/ts_build/src/services/index.js.map +1 -1
  400. package/ts_build/src/services/script-execution/ScriptExecutor.d.ts +1 -0
  401. package/ts_build/src/services/script-execution/ScriptExecutor.js +23 -0
  402. package/ts_build/src/services/script-execution/ScriptExecutor.js.map +1 -1
  403. package/ts_build/src/services/types.d.ts +2 -6
  404. package/ts_build/src/services/types.js +4 -4
  405. package/ts_build/src/services/types.js.map +1 -1
  406. package/ts_build/src/types.d.ts +11 -0
  407. package/ts_build/src/types.js +8 -0
  408. package/ts_build/src/types.js.map +1 -1
  409. package/ts_build/src/utils/index.d.ts +2 -0
  410. package/ts_build/src/utils/index.js +102 -1
  411. package/ts_build/src/utils/index.js.map +1 -1
  412. package/ts_build/tests/XmlToolCallProcessor.test.d.ts +1 -0
  413. package/ts_build/tests/XmlToolCallProcessor.test.js +376 -0
  414. package/ts_build/tests/XmlToolCallProcessor.test.js.map +1 -0
  415. package/ts_build/tests/manual/ycmd/debug_diagnostics_test.d.ts +1 -0
  416. package/ts_build/tests/manual/ycmd/debug_diagnostics_test.js +114 -0
  417. package/ts_build/tests/manual/ycmd/debug_diagnostics_test.js.map +1 -0
  418. package/ts_build/tests/manual/ycmd/minimal_advanced_test.d.ts +2 -0
  419. package/ts_build/tests/manual/ycmd/minimal_advanced_test.js +104 -0
  420. package/ts_build/tests/manual/ycmd/minimal_advanced_test.js.map +1 -0
  421. package/ts_build/tests/manual/ycmd/simple_diagnostics_test.d.ts +1 -0
  422. package/ts_build/tests/manual/ycmd/simple_diagnostics_test.js +74 -0
  423. package/ts_build/tests/manual/ycmd/simple_diagnostics_test.js.map +1 -0
  424. package/ts_build/tests/manual/ycmd/simple_test.d.ts +2 -0
  425. package/ts_build/tests/manual/ycmd/simple_test.js +82 -0
  426. package/ts_build/tests/manual/ycmd/simple_test.js.map +1 -0
  427. package/ts_build/tests/manual/ycmd/test-typescript-sample.d.ts +14 -0
  428. package/ts_build/tests/manual/ycmd/test-typescript-sample.js +20 -0
  429. package/ts_build/tests/manual/ycmd/test-typescript-sample.js.map +1 -0
  430. package/ts_build/tests/manual/ycmd/test_advanced_features.d.ts +2 -0
  431. package/ts_build/tests/manual/ycmd/test_advanced_features.js +297 -0
  432. package/ts_build/tests/manual/ycmd/test_advanced_features.js.map +1 -0
  433. package/ts_build/tests/manual/ycmd/test_advanced_with_tools.d.ts +3 -0
  434. package/ts_build/tests/manual/ycmd/test_advanced_with_tools.js +262 -0
  435. package/ts_build/tests/manual/ycmd/test_advanced_with_tools.js.map +1 -0
  436. package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.d.ts +2 -0
  437. package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.js +186 -0
  438. package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.js.map +1 -0
  439. package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.d.ts +1 -0
  440. package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.js +174 -0
  441. package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.js.map +1 -0
  442. package/ts_build/tests/manual/ycmd/test_diagnostics_fix.d.ts +2 -0
  443. package/ts_build/tests/manual/ycmd/test_diagnostics_fix.js +106 -0
  444. package/ts_build/tests/manual/ycmd/test_diagnostics_fix.js.map +1 -0
  445. package/ts_build/tests/manual/ycmd/test_diagnostics_simple.d.ts +1 -0
  446. package/ts_build/tests/manual/ycmd/test_diagnostics_simple.js +104 -0
  447. package/ts_build/tests/manual/ycmd/test_diagnostics_simple.js.map +1 -0
  448. package/ts_build/tests/manual/ycmd/test_diagnostics_timing.d.ts +1 -0
  449. package/ts_build/tests/manual/ycmd/test_diagnostics_timing.js +119 -0
  450. package/ts_build/tests/manual/ycmd/test_diagnostics_timing.js.map +1 -0
  451. package/ts_build/tests/manual/ycmd/test_discover_commands.d.ts +2 -0
  452. package/ts_build/tests/manual/ycmd/test_discover_commands.js +243 -0
  453. package/ts_build/tests/manual/ycmd/test_discover_commands.js.map +1 -0
  454. package/ts_build/tests/manual/ycmd/test_endpoints.d.ts +2 -0
  455. package/ts_build/tests/manual/ycmd/test_endpoints.js +120 -0
  456. package/ts_build/tests/manual/ycmd/test_endpoints.js.map +1 -0
  457. package/ts_build/tests/manual/ycmd/test_final_comprehensive.d.ts +2 -0
  458. package/ts_build/tests/manual/ycmd/test_final_comprehensive.js +221 -0
  459. package/ts_build/tests/manual/ycmd/test_final_comprehensive.js.map +1 -0
  460. package/ts_build/tests/manual/ycmd/test_final_validation.d.ts +2 -0
  461. package/ts_build/tests/manual/ycmd/test_final_validation.js +160 -0
  462. package/ts_build/tests/manual/ycmd/test_final_validation.js.map +1 -0
  463. package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.d.ts +2 -0
  464. package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.js +37 -0
  465. package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.js.map +1 -0
  466. package/ts_build/tests/manual/ycmd/test_server_manager.d.ts +1 -0
  467. package/ts_build/tests/manual/ycmd/test_server_manager.js +38 -0
  468. package/ts_build/tests/manual/ycmd/test_server_manager.js.map +1 -0
  469. package/ts_build/tests/manual/ycmd/test_simple_debug.d.ts +2 -0
  470. package/ts_build/tests/manual/ycmd/test_simple_debug.js +99 -0
  471. package/ts_build/tests/manual/ycmd/test_simple_debug.js.map +1 -0
  472. package/ts_build/tests/manual/ycmd/test_tsserver_workflow.d.ts +1 -0
  473. package/ts_build/tests/manual/ycmd/test_tsserver_workflow.js +128 -0
  474. package/ts_build/tests/manual/ycmd/test_tsserver_workflow.js.map +1 -0
  475. package/ts_build/tests/manual/ycmd/test_typescript_simple.d.ts +1 -0
  476. package/ts_build/tests/manual/ycmd/test_typescript_simple.js +66 -0
  477. package/ts_build/tests/manual/ycmd/test_typescript_simple.js.map +1 -0
  478. package/ts_build/tests/manual/ycmd/test_typescript_ycmd.d.ts +1 -0
  479. package/ts_build/tests/manual/ycmd/test_typescript_ycmd.js +105 -0
  480. package/ts_build/tests/manual/ycmd/test_typescript_ycmd.js.map +1 -0
  481. package/ts_build/tests/manual/ycmd/test_workspace_config.d.ts +1 -0
  482. package/ts_build/tests/manual/ycmd/test_workspace_config.js +89 -0
  483. package/ts_build/tests/manual/ycmd/test_workspace_config.js.map +1 -0
  484. package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.d.ts +2 -0
  485. package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.js +130 -0
  486. package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.js.map +1 -0
  487. package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.d.ts +1 -0
  488. package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.js +83 -0
  489. package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.js.map +1 -0
  490. package/ts_build/tests/manual/ycmd/test_ycmd_direct.d.ts +2 -0
  491. package/ts_build/tests/manual/ycmd/test_ycmd_direct.js +149 -0
  492. package/ts_build/tests/manual/ycmd/test_ycmd_direct.js.map +1 -0
  493. package/ts_build/tests/manual/ycmd/test_ycmd_experiment.d.ts +15 -0
  494. package/ts_build/tests/manual/ycmd/test_ycmd_experiment.js +58 -0
  495. package/ts_build/tests/manual/ycmd/test_ycmd_experiment.js.map +1 -0
  496. package/ts_build/tests/manual/ycmd/test_ycmd_final.d.ts +2 -0
  497. package/ts_build/tests/manual/ycmd/test_ycmd_final.js +195 -0
  498. package/ts_build/tests/manual/ycmd/test_ycmd_final.js.map +1 -0
  499. package/ts_build/tests/manual/ycmd/test_ycmd_integration.d.ts +3 -0
  500. package/ts_build/tests/manual/ycmd/test_ycmd_integration.js +110 -0
  501. package/ts_build/tests/manual/ycmd/test_ycmd_integration.js.map +1 -0
  502. package/ts_build/tests/manual/ycmd/test_ycmd_simple.d.ts +2 -0
  503. package/ts_build/tests/manual/ycmd/test_ycmd_simple.js +36 -0
  504. package/ts_build/tests/manual/ycmd/test_ycmd_simple.js.map +1 -0
  505. package/ts_build/tests/manual/ycmd/working_simple_test.d.ts +2 -0
  506. package/ts_build/tests/manual/ycmd/working_simple_test.js +134 -0
  507. package/ts_build/tests/manual/ycmd/working_simple_test.js.map +1 -0
  508. package/tsconfig.json +3 -1
@@ -0,0 +1,824 @@
1
+ import { spawn } from "child_process";
2
+ import { promises as fsasync } from "fs";
3
+ import { execSync } from "child_process";
4
+ import * as fs from "fs";
5
+ import * as path from "path";
6
+ import chalk from "chalk";
7
+ import ora from "ora";
8
+ import { services, agents } from "../../ts_build/src/index";
9
+ import {
10
+ BenchmarkConfig,
11
+ BenchmarkResults,
12
+ ExerciseResult,
13
+ Exercise,
14
+ } from "./types";
15
+ import { registerProvider } from "./providers";
16
+ import {
17
+ XmlToolCallProcessor,
18
+ HarmonyToolProcessor,
19
+ } from "../../ts_build/src/processors";
20
+ import { EvaluatorRegistry } from "./evaluators";
21
+
22
+ export class BenchmarkRunner {
23
// Benchmark settings supplied by the caller.
private config: BenchmarkConfig;
// Directory that holds the cloned exercises (container vs. local layout).
private exercisesDir: string;
private knowhowPath: string;
// Knowhow services and agents are built exactly once, here; the constructor
// reuses these instances instead of constructing a second, discarded set.
private defaultServices = services.services();
private defaultAgents = agents.agents(this.defaultServices);
// Agent that actually solves the exercises (chosen from config.agent).
private selectedAgent: agents.BaseAgent;
// Resolved by loadModels().
private model: string = "";
private provider: string = "";
// Shutdown bookkeeping used by the signal handlers.
private isShuttingDown: boolean = false;
private cleanup: (() => Promise<void>)[] = [];
private activeSpinners: Set<any> = new Set();
private childProcesses: Set<any> = new Set();
private evaluatorRegistry: EvaluatorRegistry;

/**
 * Creates a runner for the given benchmark configuration.
 *
 * Registers the Researcher, Patcher and Developer agents, selects the agent
 * named by `config.agent` (falling back to "Patcher"), creates the evaluator
 * registry and installs the process signal handlers.
 *
 * @param config Benchmark configuration.
 * @throws Error when `config.agent` does not name a known agent.
 */
constructor(config: BenchmarkConfig) {
  this.config = config;

  // Use different paths for local vs container
  this.exercisesDir = process.env.CONTAINER
    ? "/app/exercises"
    : path.join(__dirname, "..", "exercises");
  this.knowhowPath = "/app/knowhow";

  // Register agents (services/agents were already created by the field
  // initializers above, so they are not re-created here).
  const { Researcher, Patcher, Developer } = this.defaultAgents;
  this.defaultServices.Agents.registerAgent(Researcher);
  this.defaultServices.Agents.registerAgent(Patcher);
  this.defaultServices.Agents.registerAgent(Developer);

  // Select the agent to use (default to Patcher)
  const agentName = config.agent || "Patcher";
  this.selectedAgent =
    this.defaultAgents[agentName as keyof typeof this.defaultAgents];

  if (!this.selectedAgent) {
    throw new Error(`Unknown agent: ${agentName}`);
  }

  // Initialize test evaluator registry
  this.evaluatorRegistry = new EvaluatorRegistry();

  this.setupSignalHandlers();
}
70
+
71
/**
 * Installs SIGINT and SIGTERM listeners that shut the runner down.
 *
 * The first signal runs the registered cleanup callbacks, sends SIGTERM to
 * tracked child processes, stops tracked spinners, closes MCP connections
 * and exits with code 0 (or 1 when cleanup throws). A further signal while
 * shutdown is in progress exits immediately with code 1.
 */
private setupSignalHandlers(): void {
  const shutDown = async (signal: string) => {
    // A repeated signal means the user does not want to wait for cleanup.
    if (this.isShuttingDown) {
      const forcedMessage = `\n💥 Force killing process (${signal} received again)`;
      console.log(chalk.red(forcedMessage));
      process.exit(1);
    }

    this.isShuttingDown = true;
    const startedMessage = `\n🛑 Graceful shutdown initiated (${signal} received)`;
    console.log(chalk.yellow(startedMessage));
    console.log(chalk.gray("Press Ctrl+C again to force quit"));

    try {
      // Cleanup callbacks run concurrently; a rejected one is only logged.
      const pendingCleanups = this.cleanup.map((task) =>
        task().catch(console.error)
      );
      await Promise.all(pendingCleanups);

      this.childProcesses.forEach((proc) => {
        proc.kill("SIGTERM");
      });

      this.activeSpinners.forEach((activeSpinner) => {
        activeSpinner.stop();
      });

      const mcp = this.defaultServices?.Mcp;
      if (mcp) {
        await mcp.closeAll();
      }

      console.log(chalk.green("✅ Cleanup completed"));
      process.exit(0);
    } catch (error) {
      console.error(chalk.red("❌ Error during cleanup:"), error);
      process.exit(1);
    }
  };

  for (const signal of ["SIGINT", "SIGTERM"] as const) {
    process.on(signal, () => shutDown(signal));
  }
}
116
+
117
/**
 * Loads the optional `custom_providers.json` that sits next to this module.
 *
 * @returns The parsed file contents, or an empty array when the file does
 *          not exist.
 */
customProviders() {
  const providersFile = path.join(__dirname, "custom_providers.json");

  if (!fs.existsSync(providersFile)) {
    return [];
  }

  return require(providersFile);
}
127
+
128
/**
 * Registers the configured and custom model providers, then resolves the
 * provider/model pair requested in the benchmark config and stores it on
 * the runner.
 *
 * @throws Error listing all known models when the requested combination
 *         cannot be resolved.
 */
async loadModels() {
  const clients = this.defaultServices.Clients;

  // Register configured models
  await clients.registerConfiguredModels();

  // Custom providers are registered one at a time, in file order.
  for (const { provider, url, headers } of this.customProviders()) {
    await registerProvider(provider, url, headers, clients);
  }

  const detected = clients.detectProviderModel(
    this.config.provider,
    this.config.model
  );

  if (!detected.model || !detected.provider) {
    const available = JSON.stringify(clients.listAllModels(), null, 2);
    throw new Error(
      `Invalid model/provider combination: options are: ${available}`
    );
  }

  console.log(chalk.blue(`Using provider: ${detected.provider}`));
  console.log(chalk.blue(`Using model: ${detected.model}`));

  this.model = detected.model;
  this.provider = detected.provider;
}
163
+
164
/**
 * Defines the agent tools, connects to the configured MCP servers and pins
 * the selected agent to the resolved model/provider, reporting progress
 * through a spinner.
 *
 * @throws Whatever the underlying service calls throw; the spinner is
 *         marked as failed before the error is re-thrown.
 */
async initializeServices(): Promise<void> {
  const progress = ora("Initializing Knowhow services...").start();
  const { Tools, Mcp } = this.defaultServices;

  // Track spinner for cleanup
  this.activeSpinners.add(progress);

  try {
    // Define tools
    Tools.defineTools(agents.includedTools, agents.tools);

    // Connect to MCP servers
    await Mcp.connectToConfigured(Tools);

    // Set agent model preferences
    const preference = { model: this.model, provider: this.provider as any };
    this.selectedAgent.setModelPreferences([preference]);

    progress.succeed("Services initialized successfully");
  } catch (error) {
    progress.fail("Failed to initialize services");
    throw error;
  } finally {
    // The spinner has finished either way, so stop tracking it.
    this.activeSpinners.delete(progress);
  }
}
198
+
199
/**
 * Runs `scripts/clone-exercism.sh` with the configured language and
 * exercise limit, reporting progress through a spinner.
 *
 * @throws The error from the clone script; the spinner is marked as failed
 *         before the error is re-thrown.
 */
async setupExercises(): Promise<void> {
  const progress = ora("Setting up exercises...").start();

  // Track spinner for cleanup
  this.activeSpinners.add(progress);

  const cloneScript = path.join(
    __dirname,
    "..",
    "scripts",
    "clone-exercism.sh"
  );

  try {
    // Run the clone script
    const scriptArgs = [
      cloneScript,
      this.config.language,
      this.config.maxExercises.toString(),
    ];
    await this.runCommand("bash", scriptArgs);

    progress.succeed("Exercises setup completed");
  } catch (error) {
    progress.fail("Failed to setup exercises");
    throw error;
  } finally {
    // The spinner has finished either way, so stop tracking it.
    this.activeSpinners.delete(progress);
  }
}
224
+
225
/**
 * Runs the whole benchmark: resolves the model, initializes services,
 * prepares and discovers exercises, runs each one in turn (saving
 * incremental results after every exercise), then saves and prints the
 * final summary.
 *
 * The loop stops early once a shutdown signal has been received.
 *
 * @returns The aggregated benchmark results.
 */
async run(): Promise<BenchmarkResults> {
  const logGray = (line: string) => console.log(chalk.gray(line));

  console.log(chalk.blue(`Running benchmarks with config:`));
  logGray(` Language: ${this.config.language}`);

  await this.loadModels();
  await this.initializeServices();

  logGray(` Max exercises: ${this.config.maxExercises}`);
  logGray(` Model: ${this.model}`);
  logGray(` Provider: ${this.provider}`);

  const startTime = new Date();
  await this.setupExercises();
  const exercises = await this.discoverExercises();
  const completed: ExerciseResult[] = [];

  console.log(chalk.blue(`\nFound ${exercises.length} exercises to run\n`));

  for (const exercise of exercises) {
    // Check if we should stop due to shutdown signal
    if (this.isShuttingDown) {
      const stopNotice =
        "⏹️ Stopping exercise execution due to shutdown signal";
      console.log(chalk.yellow(stopNotice));
      break;
    }

    console.log(chalk.yellow(`Running exercise: ${exercise.name}`));

    const outcome = await this.runExercise(exercise);
    completed.push(outcome);

    // Log individual result with progress
    const progressLine = `✓ Exercise ${completed.length}/${exercises.length} completed: ${exercise.name}`;
    console.log(chalk.green(progressLine));

    const paintStatus = outcome.status === "success" ? chalk.green : chalk.red;
    console.log(paintStatus(` Status: ${outcome.status}`));
    logGray(` Turns: ${outcome.turns}`);
    logGray(` Time: ${outcome.timeElapsed.toFixed(2)}s`);
    logGray(` Cost: $${outcome.cost.toFixed(4)}\n`);

    // Save incremental results after each exercise
    await this.saveIncrementalResults(
      this.generateResults(completed, startTime, new Date())
    );
  }

  const endTime = new Date();
  const benchmarkResults = this.generateResults(completed, startTime, endTime);

  // Save results
  await this.saveResults(benchmarkResults);

  // Print summary
  this.printSummary(benchmarkResults);

  return benchmarkResults;
}
289
+
290
/**
 * Lists the exercise directories under `<exercisesDir>/filtered`.
 *
 * Every sub-directory becomes one exercise; `hasTests` is true when any of
 * its file names contains "test" or "spec". At most `config.maxExercises`
 * entries are returned.
 *
 * @throws Error wrapping any file-system failure.
 */
private async discoverExercises(): Promise<Exercise[]> {
  const root = path.join(this.exercisesDir, "filtered");
  const looksLikeTest = (fileName: string) =>
    fileName.includes("test") || fileName.includes("spec");

  try {
    const found: Exercise[] = [];
    const entries = await fsasync.readdir(root);

    for (const name of entries) {
      const exercisePath = path.join(root, name);
      const info = await fsasync.stat(exercisePath);

      // Plain files next to the exercise directories are ignored.
      if (!info.isDirectory()) {
        continue;
      }

      const files = await fsasync.readdir(exercisePath);
      found.push({
        name,
        path: exercisePath,
        hasTests: files.some((fileName) => looksLikeTest(fileName)),
        files,
      });
    }

    return found.slice(0, this.config.maxExercises);
  } catch (error) {
    throw new Error(`Failed to discover exercises: ${error}`);
  }
}
321
+
322
/**
 * Runs a single exercise: builds the prompt, lets the agent work on it and,
 * when an evaluator accepts the exercise, runs the tests afterwards.
 *
 * Failures inside the run are converted into a "failure" result rather
 * than thrown.
 *
 * @param exercise Exercise to run.
 * @returns The recorded result for this exercise.
 * @throws Error when a shutdown is already in progress at entry.
 */
private async runExercise(exercise: Exercise): Promise<ExerciseResult> {
  const startTime = new Date();
  const secondsSinceStart = (end: Date) =>
    (end.getTime() - startTime.getTime()) / 1000;

  // Check for shutdown before starting exercise
  if (this.isShuttingDown) {
    throw new Error("Exercise cancelled due to shutdown");
  }

  try {
    // Create the benchmark prompt for the exercise
    const prompt = await this.createExercisePrompt(exercise);

    // Run knowhow agent on the exercise
    const agentRun = await this.runKnowhowAgent(exercise, prompt);

    // Run test evaluation after agent execution
    let testResult;
    const registry = this.evaluatorRegistry;
    if (registry.canEvaluateExercise(exercise.path)) {
      const evaluation = await registry.evaluateExercise(
        exercise.path,
        exercise.name
      );
      if (evaluation) {
        testResult = evaluation.testResult;
        const testLine = ` Tests: ${testResult.passed}/${testResult.total} passed`;
        console.log(chalk.gray(testLine));
      }
    }

    const endTime = new Date();

    return {
      exerciseName: exercise.name,
      status: agentRun.success ? "success" : "failure",
      turns: agentRun.turns,
      testResult,
      timeElapsed: secondsSinceStart(endTime),
      cost: agentRun.cost,
      startTime,
      endTime,
      errorMessage: agentRun.error,
      finalOutput: agentRun.output,
    };
  } catch (error: any) {
    const endTime = new Date();
    const errorMessage =
      error instanceof Error ? error.message : String(error);

    return {
      exerciseName: exercise.name,
      status: "failure",
      testResult: undefined,
      turns: error?.turns || 0,
      timeElapsed: secondsSinceStart(endTime),
      cost: error?.cost || 0,
      startTime,
      endTime,
      errorMessage,
    };
  }
}
386
+
387
/**
 * Builds the prompt handed to the agent for one exercise.
 *
 * The prompt contains the contents of `description.md` when that file can
 * be read (otherwise just the exercise name), the list of files in the
 * exercise, and the working instructions.
 *
 * @param exercise Exercise to describe.
 * @returns The complete prompt text.
 */
private async createExercisePrompt(exercise: Exercise): Promise<string> {
  let prompt = `I need you to solve this coding exercise:\n\n`;

  // Add description if available; an unreadable description is not fatal.
  const descriptionPath = path.join(exercise.path, "description.md");
  try {
    const description = await fsasync.readFile(descriptionPath, "utf-8");
    prompt += `## Exercise Description\n${description}\n\n`;
  } catch {
    prompt += `## Exercise: ${exercise.name}\n\n`;
  }

  // List the files in the exercise
  prompt += `## Files in this exercise:\n`;
  prompt += exercise.files.map((file) => `- ${file}\n`).join("");

  // Keep the numbered steps contiguous so they read as a single list.
  const steps = [
    `1. Reading and understanding the problem`,
    `2. Implementing the required functionality`,
    `3. Running tests to ensure correctness`,
    `4. Fixing any issues that arise`,
    `5. If tests are skipped you should unskip them after the initial test passes`,
  ];
  prompt += `\nPlease implement the solution and make sure all tests pass. Focus on:\n`;
  prompt += `${steps.join("\n")}\n\n`;

  // Each closing note is a separate sentence on its own line; previously
  // they were concatenated without any separator and ran together.
  const notes = [
    `You should expect to have to do typical project setup tasks like npm install as a part of this eval.`,
    `Work in the current directory where all the exercise files are located.`,
    `Your score will be based on whether the tests run, and how many total passed from the file.`,
    `You are allowed to run the tests as many times as you want while you work.`,
  ];
  prompt += notes.join("\n");

  return prompt;
}
418
+
419
/**
 * Runs the selected agent on one exercise from inside the exercise
 * directory and collects turn count, cost, output and error information.
 *
 * Event listeners and the working directory are always restored, including
 * when setup (limits, processors, chdir) fails. An error reported through
 * the agent's `done` event marks the run as unsuccessful and is returned.
 *
 * @param exercise Exercise whose directory becomes the working directory.
 * @param prompt Prompt passed to the agent.
 * @returns Success flag, turns, cost and, when available, error and output.
 * @throws Error when a shutdown is already in progress at entry.
 */
private async runKnowhowAgent(
  exercise: Exercise,
  prompt: string
): Promise<{
  success: boolean;
  turns: number;
  cost: number;
  error?: string;
  output?: string;
}> {
  let turns = 0;
  let totalCost = 0;
  let success = false;
  let error: string | undefined;
  let output = "";
  // Per-tool call counts; collected here but not part of the return value.
  const toolUsage = {} as Record<string, number>;

  // Check for shutdown before starting agent
  if (this.isShuttingDown) {
    throw new Error("Agent execution cancelled due to shutdown");
  }

  try {
    // Set up event tracking for metrics
    const eventHandlers = {
      threadUpdate: (messages: any) => {
        // Turn count is tracked internally by the agent
        totalCost = this.selectedAgent.getTotalCostUsd();
        turns = this.selectedAgent.getTurnCount();
      },
      [this.selectedAgent.eventTypes.toolUsed]: (call: any) => {
        const name = call.toolCall.function.name;
        toolUsage[name] = (toolUsage[name] || 0) + 1;
      },
      costUpdate: (cost: any) => {
        if (typeof cost === "number") {
          totalCost = cost;
        }
      },
      done: (data: any) => {
        success = !data.error;
        totalCost = this.selectedAgent.getTotalCostUsd();
        turns = this.selectedAgent.getTurnCount();
        if (data.error) {
          error = data.error;
        }
        if (data.output) {
          output = data.output;
        }
      },
    };

    // Remembered so the finally block only restores a directory we left.
    let originalCwd: string | undefined;

    try {
      // Add event listeners inside the guarded region so they are removed
      // even when one of the setup steps below throws.
      Object.entries(eventHandlers).forEach(([event, handler]) => {
        this.selectedAgent.agentEvents.on(event, handler);
      });

      // Set limits on the agent before calling
      if (this.selectedAgent.setMaxTurns) {
        this.selectedAgent.setMaxTurns(this.config.limits.maxTurns);
      }
      if (this.selectedAgent.setMaxSpend) {
        this.selectedAgent.setMaxSpend(this.config.limits.maxCost);
      }
      if (this.selectedAgent.setMaxRunTime) {
        // Convert seconds to milliseconds
        this.selectedAgent.setMaxRunTime(this.config.limits.maxTime * 1000);
      }

      this.selectedAgent.messageProcessor.setProcessors("post_call", [
        new XmlToolCallProcessor().createProcessor(),
        new HarmonyToolProcessor().createProcessor(),
      ]);

      // Change to exercise directory
      originalCwd = process.cwd();
      process.chdir(exercise.path);

      // Call the agent directly with the prompt
      this.selectedAgent.newTask();
      const result = await this.selectedAgent.call(prompt);

      // Extract final output from result
      if (result && typeof result === "string") {
        output = result;
      } else if (result && typeof result === "object" && "content" in result) {
        output = String(result.content);
      }

      // The call returned normally; it still counts as a failure when the
      // agent reported an error through its `done` event.
      success = !error;

      // Get turn count from the agent
      if (this.selectedAgent.getTurnCount) {
        turns = this.selectedAgent.getTurnCount();
      }
    } finally {
      // Remove event listeners first so a failing chdir cannot leak them.
      Object.entries(eventHandlers).forEach(([event, handler]) => {
        this.selectedAgent.agentEvents.off(event, handler);
      });

      // Restore original directory
      if (originalCwd !== undefined) {
        process.chdir(originalCwd);
      }
    }

    return {
      success,
      turns,
      cost: totalCost,
      error,
      output,
    };
  } catch (err) {
    const errorMessage = err instanceof Error ? err.message : String(err);
    return {
      success: false,
      turns,
      cost: totalCost,
      error: errorMessage,
    };
  }
}
545
+
546
/**
 * Spawns a command and resolves with its captured stdout.
 *
 * The child is tracked in `childProcesses` while it runs so the shutdown
 * handler can terminate it.
 *
 * @param command Executable to run.
 * @param args Arguments passed to the executable.
 * @param options Optional working directory and timeout in milliseconds.
 * @returns Promise resolving to the command's stdout when it exits with 0.
 *          It rejects when a shutdown is in progress, when the process
 *          cannot be started, when it exits with a non-zero code (message
 *          includes stderr) or when the timeout elapses (the child is then
 *          killed with SIGKILL).
 */
private runCommand(
  command: string,
  args: string[],
  options?: {
    cwd?: string;
    timeout?: number;
  }
): Promise<string> {
  return new Promise((resolve, reject) => {
    // Refuse to start new work during shutdown, before anything is spawned.
    if (this.isShuttingDown) {
      reject(new Error("Command cancelled due to shutdown"));
      return;
    }

    const child = spawn(command, args, {
      cwd: options?.cwd || process.cwd(),
      stdio: ["pipe", "pipe", "pipe"],
    });

    // Track child process for cleanup
    this.childProcesses.add(child);

    let stdout = "";
    let stderr = "";

    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });

    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });

    const timeout = options?.timeout;
    let timeoutId: NodeJS.Timeout | undefined;

    if (timeout) {
      timeoutId = setTimeout(() => {
        child.kill("SIGKILL");
        reject(new Error(`Command timed out after ${timeout}ms`));
      }, timeout);
    }

    // Shared bookkeeping for both ways the child can finish.
    const release = () => {
      if (timeoutId) clearTimeout(timeoutId);
      this.childProcesses.delete(child);
    };

    child.on("close", (code) => {
      release();

      if (code === 0) {
        resolve(stdout);
      } else {
        reject(new Error(`Command failed with code ${code}: ${stderr}`));
      }
    });

    child.on("error", (error) => {
      release();
      reject(error);
    });
  });
}
614
+
615
/**
 * Aggregates per-exercise results into the benchmark report.
 *
 * `summary.totalTime` is the sum of the individual exercise durations (not
 * the wall-clock span between `startTime` and `endTime`). `successRate` is
 * the test pass rate when at least one exercise has a test result, and the
 * agent success rate otherwise. All rates and averages are 0 for an empty
 * result list.
 *
 * @param results Results recorded so far.
 * @param startTime Start of the benchmark run, copied into the report.
 * @param endTime End of the run (or "now" for incremental reports).
 * @returns The report with config, exercises, summary and timestamps.
 */
private generateResults(
  results: ExerciseResult[],
  startTime: Date,
  endTime: Date
): BenchmarkResults {
  const countWhere = (matches: (r: ExerciseResult) => boolean) =>
    results.filter(matches).length;
  const sumOf = (pick: (r: ExerciseResult) => number) =>
    results.reduce((sum, r) => sum + pick(r), 0);

  const successCount = countWhere((r) => r.status === "success");
  const failureCount = countWhere((r) => r.status === "failure");
  const timeoutCount = countWhere((r) => r.status === "timeout");
  const costLimitCount = countWhere((r) => r.status === "cost_limit");
  const turnLimitCount = countWhere((r) => r.status === "turn_limit");

  // Calculate test-based metrics
  const testableExercises = countWhere((r) => r.testResult !== undefined);
  const testsPassedCount = countWhere((r) => r.testResult?.success === true);
  const testsFailedCount = countWhere(
    (r) => Boolean(r.testResult && !r.testResult.success)
  );
  const testPassRate =
    testableExercises > 0 ? testsPassedCount / testableExercises : 0;
  // `|| 0` turns the NaN produced by 0 / 0 into 0 for an empty result list.
  const agentSuccessRate = successCount / results.length || 0;
  const actualSuccessRate =
    testableExercises > 0 ? testPassRate : agentSuccessRate;

  const totalCost = sumOf((r) => r.cost);
  const totalTurns = sumOf((r) => r.turns);
  const totalExerciseTime = sumOf((r) => r.timeElapsed);

  return {
    config: this.config,
    exercises: results,
    summary: {
      totalExercises: results.length,
      successCount,
      testableExercises,
      testsPassedCount,
      testsFailedCount,
      testPassRate,
      agentSuccessRate,
      failureCount,
      timeoutCount,
      costLimitCount,
      turnLimitCount,
      totalTime: totalExerciseTime,
      totalCost,
      averageTurns: totalTurns / results.length || 0,
      averageTime: totalExerciseTime / results.length || 0,
      successRate: actualSuccessRate,
    },
    startTime,
    endTime,
  };
}
679
+
680
// Identifier computed by getCommitHash() on first use and reused afterwards.
private cachedCommitHash: string | undefined;

/**
 * Returns the identifier used as the top-level results directory: the short
 * git commit hash of the current working directory, or a timestamp-based
 * fallback when git cannot be run.
 *
 * The value is computed once per runner. Without this, the fallback changed
 * on every call, so each incremental save, the final save and the printed
 * path all pointed at different directories.
 *
 * @returns The short commit hash or `fallback-<epoch millis>`.
 */
private getCommitHash(): string {
  let identifier = this.cachedCommitHash;

  if (identifier === undefined) {
    try {
      // Get the current git commit hash (short format)
      identifier = execSync("git rev-parse --short HEAD", {
        encoding: "utf8",
        cwd: process.cwd(),
      }).trim();
    } catch (error) {
      // Fallback to a timestamp-based identifier if git is not available
      identifier = `fallback-${Date.now()}`;
    }
    this.cachedCommitHash = identifier;
  }

  return identifier;
}
693
+
694
/**
 * Formats today's local date as `YYYY-MM-DD`.
 *
 * @returns The dash-separated date with zero-padded month and day.
 */
private formatDateDash(): string {
  const today = new Date();
  const twoDigits = (value: number) => String(value).padStart(2, "0");

  return [
    today.getFullYear(),
    twoDigits(today.getMonth() + 1), // getMonth() is zero-based
    twoDigits(today.getDate()),
  ].join("-");
}
701
+
702
/**
 * Builds the results file path:
 * `<base>/<commit>/<date>/<provider>/<provider>-<model>.json`, where any
 * "/" in the model name is replaced by "-".
 *
 * @returns The path of the results file for this run.
 */
private generateResultsPath(): string {
  const commitHash = this.getCommitHash();
  const dateStr = this.formatDateDash();
  const safeModel = this.model.split("/").join("-");
  const fileName = `${this.provider}-${safeModel}.json`;

  // Use different base paths for local vs container
  let baseDir: string;
  if (process.env.CONTAINER) {
    baseDir = "/app/knowhow/benchmarks/results";
  } else {
    baseDir = path.join(__dirname, "..", "results");
  }

  const segments = [baseDir, commitHash, dateStr, this.provider, fileName];
  return path.join(...segments);
}
723
+
724
/**
 * Writes the results as pretty-printed JSON to the structured results
 * path, creating the parent directories when needed.
 *
 * @param results Report to persist.
 */
private async saveResults(results: BenchmarkResults): Promise<void> {
  const destination = this.generateResultsPath();
  const parentDir = path.dirname(destination);

  // Ensure the directory exists
  await fsasync.mkdir(parentDir, { recursive: true });

  const serialized = JSON.stringify(results, null, 2);
  await fsasync.writeFile(destination, serialized);
}
732
+
733
/**
 * Best-effort save of an in-progress report to the same path the final
 * report uses. A failure is logged as a warning and never thrown.
 *
 * @param results Report covering the exercises finished so far.
 */
private async saveIncrementalResults(
  results: BenchmarkResults
): Promise<void> {
  try {
    // Same destination and format as the final save.
    await this.saveResults(results);
    console.log(chalk.gray(` → Incremental results saved`));
  } catch (error) {
    // Don't crash the benchmark if incremental save fails
    const warning = ` ⚠ Warning: Failed to save incremental results: ${error}`;
    console.log(chalk.yellow(warning));
  }
}
753
+
754
/**
 * Prints the benchmark summary to the console.
 *
 * Shows the test-evaluation section when at least one exercise has a test
 * result and the agent-evaluation section otherwise, followed by averages,
 * total cost and the results path.
 *
 * @param results Report to summarise.
 */
private printSummary(results: BenchmarkResults): void {
  const { summary } = results;
  const asPercent = (ratio: number) => (ratio * 100).toFixed(1);
  const say = (paint: (text: string) => string, text: string) =>
    console.log(paint(text));

  say(chalk.blue, "\n📊 Benchmark Summary");
  say(chalk.gray, "━".repeat(50));
  say(chalk.white, `Total Exercises: ${summary.totalExercises}`);

  if (summary.testableExercises > 0) {
    say(chalk.blue, "\n🧪 Test Evaluation Results:");
    say(chalk.white, ` Testable exercises: ${summary.testableExercises}`);
    say(chalk.green, ` Tests passed: ${summary.testsPassedCount}`);
    say(chalk.red, ` Tests failed: ${summary.testsFailedCount}`);
    say(chalk.white, ` Test pass rate: ${asPercent(summary.testPassRate)}%`);
    say(
      chalk.white,
      ` Agent success rate: ${asPercent(summary.agentSuccessRate)}%`
    );
    say(
      chalk.white,
      ` Overall success rate: ${asPercent(summary.successRate)}%`
    );
  } else {
    say(chalk.blue, "\n🤖 Agent Evaluation Results:");
    say(chalk.green, ` Successful: ${summary.successCount}`);
    say(chalk.red, ` Failed: ${summary.failureCount}`);
    say(chalk.yellow, ` Timeouts: ${summary.timeoutCount}`);
    say(chalk.yellow, ` Turn limits: ${summary.turnLimitCount}`);
    say(chalk.yellow, ` Cost limits: ${summary.costLimitCount}`);
    say(chalk.white, ` Success Rate: ${asPercent(summary.successRate)}%`);
  }

  say(chalk.white, `Average Turns: ${summary.averageTurns.toFixed(1)}`);
  say(chalk.white, `Average Time: ${summary.averageTime.toFixed(1)}s`);
  say(chalk.blue, "\n📈 Performance Metrics:");
  say(chalk.white, `Total Cost: $${summary.totalCost.toFixed(4)}`);
  say(chalk.gray, `Results saved to: ${this.generateResultsPath()}`);
}
824
+ }