@tyvm/knowhow 0.0.33 → 0.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/autodoc/plugins/downloader/downloader.mdx +2 -2
- package/benchmarks/.dockerignore +7 -0
- package/benchmarks/README.md +166 -0
- package/benchmarks/docker/Dockerfile +68 -0
- package/benchmarks/example-config.yml +27 -0
- package/benchmarks/jest.config.js +13 -0
- package/benchmarks/package-lock.json +4297 -0
- package/benchmarks/package.json +39 -0
- package/benchmarks/results/4542435/2025-08-05/lms/lms-openai-gpt-oss-20b.json +2814 -0
- package/benchmarks/results/4542435/2025-08-05/lms/lms-qwen-qwen3-30b-a3b-2507.json +2014 -0
- package/benchmarks/results/4fb9125/2025-08-07/anthropic/anthropic-claude-sonnet-4-20250514.json +3121 -0
- package/benchmarks/results/5766aee/2025-08-02/lms-qwen/qwen3-coder-30b.json +98 -0
- package/benchmarks/results/6d73808/2025-08-07/openai/openai-gpt-5.json +3256 -0
- package/benchmarks/results/77bf0a6/2025-08-02/lms-qwen/qwen3-30b-a3b-2507.json +4298 -0
- package/benchmarks/results/8c0d445/2025-08-03/anthropic/anthropic-claude-sonnet-4-20250514.json +3031 -0
- package/benchmarks/results/8c0d445/2025-08-03/openai/openai-gpt-4.1-2025-04-14.json +2990 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/anthropic/anthropic-claude-sonnet-4-20250514.json +3256 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/lms/lms-qwen-qwen3-coder-30b.json +3007 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-2025-04-14.json +3256 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-mini-2025-04-14.json +3036 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-nano-2025-04-14.json +3280 -0
- package/benchmarks/results/adff675/2025-08-04/lms/lms-qwen-qwen3-30b-a3b-2507.json +1920 -0
- package/benchmarks/results/adff675/2025-08-04/lms/lms-qwen-qwen3-coder-30b.json +3281 -0
- package/benchmarks/results/b502ed9/2025-08-03/lms-qwen/qwen3-coder-30b.json +2896 -0
- package/benchmarks/results/d1a8129/2025-08-03/lms/lms-qwen-qwen3-coder-30b.json +3011 -0
- package/benchmarks/results/e60471c/2025-08-03/lms/qwen3-30b-a3b-2507.json +3003 -0
- package/benchmarks/scripts/build-and-run.sh +47 -0
- package/benchmarks/scripts/clone-exercism.sh +92 -0
- package/benchmarks/scripts/validate.sh +48 -0
- package/benchmarks/src/__tests__/runner.test.ts +27 -0
- package/benchmarks/src/cli.ts +90 -0
- package/benchmarks/src/evaluators/EvaluatorRegistry.ts +64 -0
- package/benchmarks/src/evaluators/JavaScriptEvaluator.ts +183 -0
- package/benchmarks/src/evaluators/index.ts +3 -0
- package/benchmarks/src/evaluators/types.ts +22 -0
- package/benchmarks/src/index.ts +3 -0
- package/benchmarks/src/providers.ts +13 -0
- package/benchmarks/src/runner.ts +824 -0
- package/benchmarks/src/types.ts +63 -0
- package/benchmarks/tsconfig.json +19 -0
- package/jest.config.js +2 -1
- package/leaderboard/README.md +148 -0
- package/leaderboard/app/api/benchmark-data/route.ts +131 -0
- package/leaderboard/app/api/benchmark-detail/route.ts +172 -0
- package/leaderboard/app/details/[model]/[provider]/[language]/page.tsx +501 -0
- package/leaderboard/app/exercise/[model]/[provider]/[language]/[exercise]/page.tsx +375 -0
- package/leaderboard/app/globals.css +27 -0
- package/leaderboard/app/layout.tsx +21 -0
- package/leaderboard/app/page.tsx +170 -0
- package/leaderboard/components/LeaderboardTable.tsx +168 -0
- package/leaderboard/components/PerformanceChart.tsx +109 -0
- package/leaderboard/next-env.d.ts +5 -0
- package/leaderboard/next.config.js +4 -0
- package/leaderboard/package-lock.json +6363 -0
- package/leaderboard/package.json +28 -0
- package/leaderboard/postcss.config.js +6 -0
- package/leaderboard/tailwind.config.js +17 -0
- package/leaderboard/tsconfig.json +28 -0
- package/leaderboard/types/benchmark.ts +67 -0
- package/leaderboard/utils/dataProcessor.ts +33 -0
- package/package.json +2 -1
- package/src/agents/base/base.ts +182 -24
- package/src/agents/base/prompt.ts +28 -0
- package/src/agents/index.ts +3 -0
- package/src/agents/patcher/patcher.ts +6 -4
- package/src/agents/setup/setup.ts +56 -0
- package/src/agents/tools/agentCall.ts +6 -2
- package/src/agents/tools/aiClient.ts +74 -8
- package/src/agents/tools/execCommand.ts +13 -14
- package/src/agents/tools/executeScript/README.md +16 -0
- package/src/agents/tools/index.ts +2 -0
- package/src/agents/tools/list.ts +73 -16
- package/src/agents/tools/startAgentTask.ts +109 -0
- package/src/agents/tools/textSearch.ts +1 -1
- package/src/agents/tools/visionTool.ts +31 -2
- package/src/agents/tools/ycmd/client.ts +608 -0
- package/src/agents/tools/ycmd/definitions.ts +294 -0
- package/src/agents/tools/ycmd/detection.ts +211 -0
- package/src/agents/tools/ycmd/index.ts +11 -0
- package/src/agents/tools/ycmd/installer.ts +251 -0
- package/src/agents/tools/ycmd/server.ts +535 -0
- package/src/agents/tools/ycmd/serverManager.ts +316 -0
- package/src/agents/tools/ycmd/tools/completion.ts +113 -0
- package/src/agents/tools/ycmd/tools/diagnostics.ts +155 -0
- package/src/agents/tools/ycmd/tools/getLocations.ts +173 -0
- package/src/agents/tools/ycmd/tools/goto.ts +169 -0
- package/src/agents/tools/ycmd/tools/refactor.ts +204 -0
- package/src/agents/tools/ycmd/tools/signature.ts +174 -0
- package/src/agents/tools/ycmd/tools/start.ts +95 -0
- package/src/agents/tools/ycmd/utils/pathUtils.ts +59 -0
- package/src/ai.ts +15 -0
- package/src/chat/CliChatService.ts +277 -0
- package/src/chat/modules/AgentModule.ts +985 -0
- package/src/chat/modules/AskModule.ts +98 -0
- package/src/chat/modules/BaseChatModule.ts +66 -0
- package/src/chat/modules/InternalChatModule.ts +174 -0
- package/src/chat/modules/SearchModule.ts +166 -0
- package/src/chat/modules/SetupModule.ts +185 -0
- package/src/chat/modules/SystemModule.ts +120 -0
- package/src/chat/modules/VoiceModule.ts +70 -0
- package/src/chat/modules/index.js +5 -0
- package/src/chat/types.ts +97 -0
- package/src/chat.ts +9 -1
- package/src/chat2.ts +62 -0
- package/src/cli.ts +264 -35
- package/src/clients/anthropic.ts +14 -7
- package/src/clients/gemini.ts +15 -7
- package/src/clients/http.ts +17 -7
- package/src/clients/index.ts +117 -4
- package/src/clients/knowhow.ts +7 -2
- package/src/clients/knowhowMcp.ts +118 -0
- package/src/clients/openai.ts +32 -8
- package/src/clients/types.ts +1 -0
- package/src/clients/xai.ts +17 -5
- package/src/config.ts +30 -5
- package/src/conversion.ts +4 -1
- package/src/login.ts +26 -9
- package/src/microphone.ts +0 -1
- package/src/plugins/downloader/downloader.ts +191 -49
- package/src/plugins/downloader/plugin.ts +3 -1
- package/src/plugins/plugins.ts +3 -0
- package/src/processors/CustomVariables.ts +425 -0
- package/src/processors/HarmonyToolProcessor.ts +264 -0
- package/src/processors/XmlToolCallProcessor.ts +533 -0
- package/src/processors/index.ts +3 -0
- package/src/prompts/KnowhowConfigExamples.ts +376 -0
- package/src/services/KnowhowClient.ts +49 -3
- package/src/services/Mcp.ts +42 -3
- package/src/services/McpServer.ts +14 -4
- package/src/services/McpWebsocketTransport.ts +21 -7
- package/src/services/MessageProcessor.ts +10 -5
- package/src/services/index.ts +5 -0
- package/src/services/script-execution/ScriptExecutor.ts +34 -1
- package/src/services/types.ts +17 -14
- package/src/types.ts +17 -0
- package/src/utils/index.ts +138 -0
- package/tests/XmlToolCallProcessor.test.ts +468 -0
- package/tests/manual/ycmd/debug_diagnostics_test.ts +127 -0
- package/tests/manual/ycmd/fixtures/debug_diagnostics.ts +26 -0
- package/tests/manual/ycmd/fixtures/file_change_test.ts +17 -0
- package/tests/manual/ycmd/minimal_advanced_test.ts +108 -0
- package/tests/manual/ycmd/simple_diagnostics_test.ts +61 -0
- package/tests/manual/ycmd/simple_test.ts +74 -0
- package/tests/manual/ycmd/test-typescript-sample.ts +34 -0
- package/tests/manual/ycmd/test_advanced_features.ts +407 -0
- package/tests/manual/ycmd/test_advanced_with_tools.ts +320 -0
- package/tests/manual/ycmd/test_comprehensive_typescript.ts +179 -0
- package/tests/manual/ycmd/test_diagnostics_file_changes.ts +249 -0
- package/tests/manual/ycmd/test_diagnostics_fix.ts +99 -0
- package/tests/manual/ycmd/test_diagnostics_simple.ts +100 -0
- package/tests/manual/ycmd/test_diagnostics_timing.ts +120 -0
- package/tests/manual/ycmd/test_discover_commands.ts +310 -0
- package/tests/manual/ycmd/test_endpoints.ts +115 -0
- package/tests/manual/ycmd/test_final_comprehensive.ts +218 -0
- package/tests/manual/ycmd/test_final_validation.ts +150 -0
- package/tests/manual/ycmd/test_implementation.js +42 -0
- package/tests/manual/ycmd/test_individual_ycmd_tool.ts +39 -0
- package/tests/manual/ycmd/test_server_manager.ts +52 -0
- package/tests/manual/ycmd/test_simple_debug.ts +86 -0
- package/tests/manual/ycmd/test_tsserver_workflow.js +83 -0
- package/tests/manual/ycmd/test_tsserver_workflow.ts +122 -0
- package/tests/manual/ycmd/test_typescript_simple.ts +48 -0
- package/tests/manual/ycmd/test_typescript_ycmd.ts +105 -0
- package/tests/manual/ycmd/test_workspace_config.ts +90 -0
- package/tests/manual/ycmd/test_ycmd_auto_start.ts +137 -0
- package/tests/manual/ycmd/test_ycmd_comprehensive.ts +73 -0
- package/tests/manual/ycmd/test_ycmd_connection.py +10 -0
- package/tests/manual/ycmd/test_ycmd_direct.ts +142 -0
- package/tests/manual/ycmd/test_ycmd_experiment.ts +48 -0
- package/tests/manual/ycmd/test_ycmd_final.ts +200 -0
- package/tests/manual/ycmd/test_ycmd_fixed.py +18 -0
- package/tests/manual/ycmd/test_ycmd_integration.ts +112 -0
- package/tests/manual/ycmd/test_ycmd_simple.ts +45 -0
- package/tests/manual/ycmd/test_ycmd_usage.py +27 -0
- package/tests/manual/ycmd/working_simple_test.ts +134 -0
- package/ts_build/src/agents/base/base.d.ts +15 -1
- package/ts_build/src/agents/base/base.js +121 -20
- package/ts_build/src/agents/base/base.js.map +1 -1
- package/ts_build/src/agents/base/prompt.d.ts +1 -1
- package/ts_build/src/agents/base/prompt.js +28 -0
- package/ts_build/src/agents/base/prompt.js.map +1 -1
- package/ts_build/src/agents/index.d.ts +2 -0
- package/ts_build/src/agents/index.js +2 -0
- package/ts_build/src/agents/index.js.map +1 -1
- package/ts_build/src/agents/patcher/patcher.js +6 -3
- package/ts_build/src/agents/patcher/patcher.js.map +1 -1
- package/ts_build/src/agents/setup/setup.d.ts +8 -0
- package/ts_build/src/agents/setup/setup.js +59 -0
- package/ts_build/src/agents/setup/setup.js.map +1 -0
- package/ts_build/src/agents/tools/agentCall.js +5 -2
- package/ts_build/src/agents/tools/agentCall.js.map +1 -1
- package/ts_build/src/agents/tools/aiClient.d.ts +6 -5
- package/ts_build/src/agents/tools/aiClient.js +37 -6
- package/ts_build/src/agents/tools/aiClient.js.map +1 -1
- package/ts_build/src/agents/tools/execCommand.d.ts +2 -2
- package/ts_build/src/agents/tools/execCommand.js +5 -6
- package/ts_build/src/agents/tools/execCommand.js.map +1 -1
- package/ts_build/src/agents/tools/executeScript/index.d.ts +1 -1
- package/ts_build/src/agents/tools/index.d.ts +2 -0
- package/ts_build/src/agents/tools/index.js +2 -0
- package/ts_build/src/agents/tools/index.js.map +1 -1
- package/ts_build/src/agents/tools/list.js +66 -16
- package/ts_build/src/agents/tools/list.js.map +1 -1
- package/ts_build/src/agents/tools/startAgentTask.d.ts +13 -0
- package/ts_build/src/agents/tools/startAgentTask.js +74 -0
- package/ts_build/src/agents/tools/startAgentTask.js.map +1 -0
- package/ts_build/src/agents/tools/startChatTask.d.ts +13 -0
- package/ts_build/src/agents/tools/startChatTask.js +73 -0
- package/ts_build/src/agents/tools/startChatTask.js.map +1 -0
- package/ts_build/src/agents/tools/textSearch.js +1 -1
- package/ts_build/src/agents/tools/textSearch.js.map +1 -1
- package/ts_build/src/agents/tools/visionTool.d.ts +1 -1
- package/ts_build/src/agents/tools/visionTool.js +23 -3
- package/ts_build/src/agents/tools/visionTool.js.map +1 -1
- package/ts_build/src/agents/tools/ycmd/client.d.ts +93 -0
- package/ts_build/src/agents/tools/ycmd/client.js +355 -0
- package/ts_build/src/agents/tools/ycmd/client.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/definitions.d.ts +345 -0
- package/ts_build/src/agents/tools/ycmd/definitions.js +298 -0
- package/ts_build/src/agents/tools/ycmd/definitions.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/detection.d.ts +11 -0
- package/ts_build/src/agents/tools/ycmd/detection.js +175 -0
- package/ts_build/src/agents/tools/ycmd/detection.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/index.d.ts +8 -0
- package/ts_build/src/agents/tools/ycmd/index.js +20 -0
- package/ts_build/src/agents/tools/ycmd/index.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/installer.d.ts +19 -0
- package/ts_build/src/agents/tools/ycmd/installer.js +196 -0
- package/ts_build/src/agents/tools/ycmd/installer.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/server.d.ts +35 -0
- package/ts_build/src/agents/tools/ycmd/server.js +363 -0
- package/ts_build/src/agents/tools/ycmd/server.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/serverManager.d.ts +39 -0
- package/ts_build/src/agents/tools/ycmd/serverManager.js +210 -0
- package/ts_build/src/agents/tools/ycmd/serverManager.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/completion.d.ts +22 -0
- package/ts_build/src/agents/tools/ycmd/tools/completion.js +72 -0
- package/ts_build/src/agents/tools/ycmd/tools/completion.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/diagnostics.d.ts +42 -0
- package/ts_build/src/agents/tools/ycmd/tools/diagnostics.js +88 -0
- package/ts_build/src/agents/tools/ycmd/tools/diagnostics.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/getLocations.d.ts +22 -0
- package/ts_build/src/agents/tools/ycmd/tools/getLocations.js +142 -0
- package/ts_build/src/agents/tools/ycmd/tools/getLocations.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/goto.d.ts +20 -0
- package/ts_build/src/agents/tools/ycmd/tools/goto.js +101 -0
- package/ts_build/src/agents/tools/ycmd/tools/goto.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/refactor.d.ts +32 -0
- package/ts_build/src/agents/tools/ycmd/tools/refactor.js +123 -0
- package/ts_build/src/agents/tools/ycmd/tools/refactor.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/signature.d.ts +25 -0
- package/ts_build/src/agents/tools/ycmd/tools/signature.js +110 -0
- package/ts_build/src/agents/tools/ycmd/tools/signature.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/start.d.ts +17 -0
- package/ts_build/src/agents/tools/ycmd/tools/start.js +65 -0
- package/ts_build/src/agents/tools/ycmd/tools/start.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/utils/pathUtils.d.ts +4 -0
- package/ts_build/src/agents/tools/ycmd/utils/pathUtils.js +67 -0
- package/ts_build/src/agents/tools/ycmd/utils/pathUtils.js.map +1 -0
- package/ts_build/src/ai.d.ts +1 -0
- package/ts_build/src/ai.js +40 -1
- package/ts_build/src/ai.js.map +1 -1
- package/ts_build/src/chat/ChatCommandHandler.d.ts +36 -0
- package/ts_build/src/chat/ChatCommandHandler.js +268 -0
- package/ts_build/src/chat/ChatCommandHandler.js.map +1 -0
- package/ts_build/src/chat/ChatInputManager.d.ts +22 -0
- package/ts_build/src/chat/ChatInputManager.js +85 -0
- package/ts_build/src/chat/ChatInputManager.js.map +1 -0
- package/ts_build/src/chat/ChatManager.d.ts +49 -0
- package/ts_build/src/chat/ChatManager.js +271 -0
- package/ts_build/src/chat/ChatManager.js.map +1 -0
- package/ts_build/src/chat/ChatSession.d.ts +32 -0
- package/ts_build/src/chat/ChatSession.js +3 -0
- package/ts_build/src/chat/ChatSession.js.map +1 -0
- package/ts_build/src/chat/ChatSessionManager.d.ts +19 -0
- package/ts_build/src/chat/ChatSessionManager.js +188 -0
- package/ts_build/src/chat/ChatSessionManager.js.map +1 -0
- package/ts_build/src/chat/ChatStateManager.d.ts +58 -0
- package/ts_build/src/chat/ChatStateManager.js +156 -0
- package/ts_build/src/chat/ChatStateManager.js.map +1 -0
- package/ts_build/src/chat/CliChatService.d.ts +35 -0
- package/ts_build/src/chat/CliChatService.js +201 -0
- package/ts_build/src/chat/CliChatService.js.map +1 -0
- package/ts_build/src/chat/InterruptibleInput.d.ts +20 -0
- package/ts_build/src/chat/InterruptibleInput.js +109 -0
- package/ts_build/src/chat/InterruptibleInput.js.map +1 -0
- package/ts_build/src/chat/interfaces/ChatModule.d.ts +6 -0
- package/ts_build/src/chat/interfaces/ChatModule.js +3 -0
- package/ts_build/src/chat/interfaces/ChatModule.js.map +1 -0
- package/ts_build/src/chat/modules/AgentModule.d.ts +57 -0
- package/ts_build/src/chat/modules/AgentModule.js +709 -0
- package/ts_build/src/chat/modules/AgentModule.js.map +1 -0
- package/ts_build/src/chat/modules/AskModule.d.ts +10 -0
- package/ts_build/src/chat/modules/AskModule.js +63 -0
- package/ts_build/src/chat/modules/AskModule.js.map +1 -0
- package/ts_build/src/chat/modules/BaseChatModule.d.ts +14 -0
- package/ts_build/src/chat/modules/BaseChatModule.js +32 -0
- package/ts_build/src/chat/modules/BaseChatModule.js.map +1 -0
- package/ts_build/src/chat/modules/InternalChatModule.d.ts +24 -0
- package/ts_build/src/chat/modules/InternalChatModule.js +127 -0
- package/ts_build/src/chat/modules/InternalChatModule.js.map +1 -0
- package/ts_build/src/chat/modules/SearchModule.d.ts +12 -0
- package/ts_build/src/chat/modules/SearchModule.js +119 -0
- package/ts_build/src/chat/modules/SearchModule.js.map +1 -0
- package/ts_build/src/chat/modules/SetupModule.d.ts +15 -0
- package/ts_build/src/chat/modules/SetupModule.js +147 -0
- package/ts_build/src/chat/modules/SetupModule.js.map +1 -0
- package/ts_build/src/chat/modules/SystemModule.d.ts +14 -0
- package/ts_build/src/chat/modules/SystemModule.js +90 -0
- package/ts_build/src/chat/modules/SystemModule.js.map +1 -0
- package/ts_build/src/chat/modules/VoiceModule.d.ts +11 -0
- package/ts_build/src/chat/modules/VoiceModule.js +57 -0
- package/ts_build/src/chat/modules/VoiceModule.js.map +1 -0
- package/ts_build/src/chat/types.d.ts +83 -0
- package/ts_build/src/chat/types.js +3 -0
- package/ts_build/src/chat/types.js.map +1 -0
- package/ts_build/src/chat.js +7 -1
- package/ts_build/src/chat.js.map +1 -1
- package/ts_build/src/chat2.d.ts +3 -0
- package/ts_build/src/chat2.js +47 -0
- package/ts_build/src/chat2.js.map +1 -0
- package/ts_build/src/cli.js +218 -37
- package/ts_build/src/cli.js.map +1 -1
- package/ts_build/src/clients/anthropic.d.ts +5 -2
- package/ts_build/src/clients/anthropic.js +12 -7
- package/ts_build/src/clients/anthropic.js.map +1 -1
- package/ts_build/src/clients/gemini.d.ts +6 -3
- package/ts_build/src/clients/gemini.js +13 -7
- package/ts_build/src/clients/gemini.js.map +1 -1
- package/ts_build/src/clients/http.d.ts +1 -0
- package/ts_build/src/clients/http.js +12 -5
- package/ts_build/src/clients/http.js.map +1 -1
- package/ts_build/src/clients/index.d.ts +10 -0
- package/ts_build/src/clients/index.js +74 -4
- package/ts_build/src/clients/index.js.map +1 -1
- package/ts_build/src/clients/knowhow.d.ts +3 -1
- package/ts_build/src/clients/knowhow.js +8 -2
- package/ts_build/src/clients/knowhow.js.map +1 -1
- package/ts_build/src/clients/knowhowMcp.d.ts +20 -0
- package/ts_build/src/clients/knowhowMcp.js +86 -0
- package/ts_build/src/clients/knowhowMcp.js.map +1 -0
- package/ts_build/src/clients/openai.d.ts +5 -2
- package/ts_build/src/clients/openai.js +29 -8
- package/ts_build/src/clients/openai.js.map +1 -1
- package/ts_build/src/clients/types.d.ts +1 -0
- package/ts_build/src/clients/xai.d.ts +5 -2
- package/ts_build/src/clients/xai.js +15 -5
- package/ts_build/src/clients/xai.js.map +1 -1
- package/ts_build/src/config.js +24 -3
- package/ts_build/src/config.js.map +1 -1
- package/ts_build/src/conversion.js +6 -4
- package/ts_build/src/conversion.js.map +1 -1
- package/ts_build/src/login.d.ts +1 -1
- package/ts_build/src/login.js +21 -7
- package/ts_build/src/login.js.map +1 -1
- package/ts_build/src/microphone.js.map +1 -1
- package/ts_build/src/plugins/downloader/downloader.d.ts +7 -5
- package/ts_build/src/plugins/downloader/downloader.js +147 -44
- package/ts_build/src/plugins/downloader/downloader.js.map +1 -1
- package/ts_build/src/plugins/downloader/plugin.js +5 -3
- package/ts_build/src/plugins/downloader/plugin.js.map +1 -1
- package/ts_build/src/plugins/plugins.js +3 -0
- package/ts_build/src/plugins/plugins.js.map +1 -1
- package/ts_build/src/processors/CustomVariables.d.ts +32 -0
- package/ts_build/src/processors/CustomVariables.js +297 -0
- package/ts_build/src/processors/CustomVariables.js.map +1 -0
- package/ts_build/src/processors/HarmonyToolProcessor.d.ts +15 -0
- package/ts_build/src/processors/HarmonyToolProcessor.js +154 -0
- package/ts_build/src/processors/HarmonyToolProcessor.js.map +1 -0
- package/ts_build/src/processors/XmlToolCallProcessor.d.ts +14 -0
- package/ts_build/src/processors/XmlToolCallProcessor.js +357 -0
- package/ts_build/src/processors/XmlToolCallProcessor.js.map +1 -0
- package/ts_build/src/processors/index.d.ts +3 -0
- package/ts_build/src/processors/index.js +7 -1
- package/ts_build/src/processors/index.js.map +1 -1
- package/ts_build/src/prompts/KnowhowConfigExamples.d.ts +2 -0
- package/ts_build/src/prompts/KnowhowConfigExamples.js +379 -0
- package/ts_build/src/prompts/KnowhowConfigExamples.js.map +1 -0
- package/ts_build/src/services/KnowhowClient.d.ts +22 -0
- package/ts_build/src/services/KnowhowClient.js +14 -2
- package/ts_build/src/services/KnowhowClient.js.map +1 -1
- package/ts_build/src/services/Mcp.d.ts +1 -0
- package/ts_build/src/services/Mcp.js +20 -3
- package/ts_build/src/services/Mcp.js.map +1 -1
- package/ts_build/src/services/McpServer.d.ts +1 -1
- package/ts_build/src/services/McpServer.js +8 -4
- package/ts_build/src/services/McpServer.js.map +1 -1
- package/ts_build/src/services/McpWebsocketTransport.js +17 -7
- package/ts_build/src/services/McpWebsocketTransport.js.map +1 -1
- package/ts_build/src/services/MessageProcessor.d.ts +1 -1
- package/ts_build/src/services/MessageProcessor.js +4 -4
- package/ts_build/src/services/MessageProcessor.js.map +1 -1
- package/ts_build/src/services/index.d.ts +2 -0
- package/ts_build/src/services/index.js +4 -0
- package/ts_build/src/services/index.js.map +1 -1
- package/ts_build/src/services/script-execution/ScriptExecutor.d.ts +1 -0
- package/ts_build/src/services/script-execution/ScriptExecutor.js +23 -0
- package/ts_build/src/services/script-execution/ScriptExecutor.js.map +1 -1
- package/ts_build/src/services/types.d.ts +2 -6
- package/ts_build/src/services/types.js +4 -4
- package/ts_build/src/services/types.js.map +1 -1
- package/ts_build/src/types.d.ts +11 -0
- package/ts_build/src/types.js +8 -0
- package/ts_build/src/types.js.map +1 -1
- package/ts_build/src/utils/index.d.ts +2 -0
- package/ts_build/src/utils/index.js +102 -1
- package/ts_build/src/utils/index.js.map +1 -1
- package/ts_build/tests/XmlToolCallProcessor.test.d.ts +1 -0
- package/ts_build/tests/XmlToolCallProcessor.test.js +376 -0
- package/ts_build/tests/XmlToolCallProcessor.test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/debug_diagnostics_test.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/debug_diagnostics_test.js +114 -0
- package/ts_build/tests/manual/ycmd/debug_diagnostics_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/minimal_advanced_test.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/minimal_advanced_test.js +104 -0
- package/ts_build/tests/manual/ycmd/minimal_advanced_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/simple_diagnostics_test.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/simple_diagnostics_test.js +74 -0
- package/ts_build/tests/manual/ycmd/simple_diagnostics_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/simple_test.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/simple_test.js +82 -0
- package/ts_build/tests/manual/ycmd/simple_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test-typescript-sample.d.ts +14 -0
- package/ts_build/tests/manual/ycmd/test-typescript-sample.js +20 -0
- package/ts_build/tests/manual/ycmd/test-typescript-sample.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_advanced_features.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_advanced_features.js +297 -0
- package/ts_build/tests/manual/ycmd/test_advanced_features.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_advanced_with_tools.d.ts +3 -0
- package/ts_build/tests/manual/ycmd/test_advanced_with_tools.js +262 -0
- package/ts_build/tests/manual/ycmd/test_advanced_with_tools.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.js +186 -0
- package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.js +174 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_fix.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_fix.js +106 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_fix.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_simple.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_simple.js +104 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_simple.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_timing.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_timing.js +119 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_timing.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_discover_commands.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_discover_commands.js +243 -0
- package/ts_build/tests/manual/ycmd/test_discover_commands.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_endpoints.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_endpoints.js +120 -0
- package/ts_build/tests/manual/ycmd/test_endpoints.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_final_comprehensive.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_final_comprehensive.js +221 -0
- package/ts_build/tests/manual/ycmd/test_final_comprehensive.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_final_validation.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_final_validation.js +160 -0
- package/ts_build/tests/manual/ycmd/test_final_validation.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.js +37 -0
- package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_server_manager.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_server_manager.js +38 -0
- package/ts_build/tests/manual/ycmd/test_server_manager.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_simple_debug.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_simple_debug.js +99 -0
- package/ts_build/tests/manual/ycmd/test_simple_debug.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_tsserver_workflow.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_tsserver_workflow.js +128 -0
- package/ts_build/tests/manual/ycmd/test_tsserver_workflow.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_simple.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_simple.js +66 -0
- package/ts_build/tests/manual/ycmd/test_typescript_simple.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_ycmd.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_ycmd.js +105 -0
- package/ts_build/tests/manual/ycmd/test_typescript_ycmd.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_workspace_config.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_workspace_config.js +89 -0
- package/ts_build/tests/manual/ycmd/test_workspace_config.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.js +130 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.js +83 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_direct.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_direct.js +149 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_direct.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_experiment.d.ts +15 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_experiment.js +58 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_experiment.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_final.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_final.js +195 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_final.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_integration.d.ts +3 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_integration.js +110 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_integration.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_simple.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_simple.js +36 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_simple.js.map +1 -0
- package/ts_build/tests/manual/ycmd/working_simple_test.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/working_simple_test.js +134 -0
- package/ts_build/tests/manual/ycmd/working_simple_test.js.map +1 -0
- package/tsconfig.json +3 -1
|
@@ -0,0 +1,824 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import { promises as fsasync } from "fs";
|
|
3
|
+
import { execSync } from "child_process";
|
|
4
|
+
import * as fs from "fs";
|
|
5
|
+
import * as path from "path";
|
|
6
|
+
import chalk from "chalk";
|
|
7
|
+
import ora from "ora";
|
|
8
|
+
import { services, agents } from "../../ts_build/src/index";
|
|
9
|
+
import {
|
|
10
|
+
BenchmarkConfig,
|
|
11
|
+
BenchmarkResults,
|
|
12
|
+
ExerciseResult,
|
|
13
|
+
Exercise,
|
|
14
|
+
} from "./types";
|
|
15
|
+
import { registerProvider } from "./providers";
|
|
16
|
+
import {
|
|
17
|
+
XmlToolCallProcessor,
|
|
18
|
+
HarmonyToolProcessor,
|
|
19
|
+
} from "../../ts_build/src/processors";
|
|
20
|
+
import { EvaluatorRegistry } from "./evaluators";
|
|
21
|
+
|
|
22
|
+
export class BenchmarkRunner {
|
|
23
|
+
private config: BenchmarkConfig;
|
|
24
|
+
private exercisesDir: string;
|
|
25
|
+
private knowhowPath: string;
|
|
26
|
+
private defaultServices = services.services();
|
|
27
|
+
private defaultAgents = agents.agents(this.defaultServices);
|
|
28
|
+
private selectedAgent: agents.BaseAgent;
|
|
29
|
+
private model: string = "";
|
|
30
|
+
private provider: string = "";
|
|
31
|
+
private isShuttingDown: boolean = false;
|
|
32
|
+
private cleanup: (() => Promise<void>)[] = [];
|
|
33
|
+
private activeSpinners: Set<any> = new Set();
|
|
34
|
+
private childProcesses: Set<any> = new Set();
|
|
35
|
+
private evaluatorRegistry: EvaluatorRegistry;
|
|
36
|
+
|
|
37
|
+
/**
 * Builds a benchmark runner from the supplied configuration.
 *
 * Resolves the exercises directory (container path when the CONTAINER
 * environment variable is set, otherwise a path relative to this file),
 * registers the Researcher, Patcher and Developer agents with the agent
 * service, selects the agent named by `config.agent` (falling back to
 * "Patcher"), creates the evaluator registry and finally calls
 * `setupSignalHandlers()`.
 *
 * @param config Benchmark configuration; `config.agent` optionally names
 *   the agent to run.
 * @throws Error when `config.agent` does not resolve to an entry of the
 *   default agents.
 */
constructor(config: BenchmarkConfig) {
  this.config = config;

  // Use different paths for local vs container
  this.exercisesDir = process.env.CONTAINER
    ? "/app/exercises"
    : path.join(__dirname, "..", "exercises");
  // NOTE(review): this path is the same for local and container runs —
  // confirm that is intended.
  this.knowhowPath = "/app/knowhow";

  // `defaultServices` and `defaultAgents` are created by their field
  // initializers, which run before this constructor body. They are
  // deliberately not rebuilt here: doing so constructed every service and
  // agent twice per runner and discarded the first set.

  // Register agents
  const { Researcher, Patcher, Developer } = this.defaultAgents;
  for (const agent of [Researcher, Patcher, Developer]) {
    this.defaultServices.Agents.registerAgent(agent);
  }

  // Select the agent to use (default to Patcher)
  const agentName = config.agent || "Patcher";
  this.selectedAgent =
    this.defaultAgents[agentName as keyof typeof this.defaultAgents];

  if (!this.selectedAgent) {
    throw new Error(`Unknown agent: ${agentName}`);
  }

  // Initialize test evaluator registry
  this.evaluatorRegistry = new EvaluatorRegistry();

  this.setupSignalHandlers();
}
|
|
70
|
+
|
|
71
|
+
/**
 * Installs SIGINT and SIGTERM handlers.
 *
 * The first signal starts a graceful shutdown: cleanup callbacks are awaited,
 * tracked child processes receive SIGTERM, tracked spinners are stopped and
 * MCP connections are closed, then the process exits with 0 (1 if any of
 * that throws). A signal received while shutdown is already in progress
 * exits immediately with 1.
 */
private setupSignalHandlers(): void {
  // Releases everything the runner is tracking, in a fixed order.
  const releaseResources = async (): Promise<void> => {
    await Promise.all(
      this.cleanup.map((task) => task().catch(console.error))
    );

    this.childProcesses.forEach((proc) => {
      proc.kill("SIGTERM");
    });

    this.activeSpinners.forEach((activeSpinner) => {
      activeSpinner.stop();
    });

    if (this.defaultServices?.Mcp) {
      await this.defaultServices.Mcp.closeAll();
    }
  };

  const handleSignal = async (signal: string) => {
    // A repeated signal means the user does not want to wait for cleanup.
    if (this.isShuttingDown) {
      console.log(
        chalk.red(`\n💥 Force killing process (${signal} received again)`)
      );
      process.exit(1);
    }

    this.isShuttingDown = true;
    console.log(
      chalk.yellow(`\n🛑 Graceful shutdown initiated (${signal} received)`)
    );
    console.log(chalk.gray("Press Ctrl+C again to force quit"));

    try {
      await releaseResources();
      console.log(chalk.green("✅ Cleanup completed"));
      process.exit(0);
    } catch (error) {
      console.error(chalk.red("❌ Error during cleanup:"), error);
      process.exit(1);
    }
  };

  process.on("SIGINT", () => handleSignal("SIGINT"));
  process.on("SIGTERM", () => handleSignal("SIGTERM"));
}
|
|
116
|
+
|
|
117
|
+
/**
 * Loads the optional `custom_providers.json` that sits next to this module.
 *
 * @returns The parsed file contents, or an empty array when the file does
 *   not exist.
 */
customProviders() {
  const providersFile = path.join(__dirname, "custom_providers.json");
  return fs.existsSync(providersFile) ? require(providersFile) : [];
}
|
|
127
|
+
|
|
128
|
+
/**
 * Registers the configured models plus any custom providers, then resolves
 * the provider/model pair requested by the configuration and stores it on
 * the runner.
 *
 * @throws Error listing all known models when the requested combination
 *   cannot be resolved.
 */
async loadModels() {
  const clients = this.defaultServices.Clients;

  await clients.registerConfiguredModels();

  // Custom providers are registered one at a time, in file order.
  const extraProviders = this.customProviders();
  for (const entry of extraProviders) {
    await registerProvider(entry.provider, entry.url, entry.headers, clients);
  }

  const { model: resolvedModel, provider: resolvedProvider } =
    clients.detectProviderModel(this.config.provider, this.config.model);

  if (!resolvedModel || !resolvedProvider) {
    const knownModels = JSON.stringify(clients.listAllModels(), null, 2);
    throw new Error(
      `Invalid model/provider combination: options are: ${knownModels}`
    );
  }

  console.log(chalk.blue(`Using provider: ${resolvedProvider}`));
  console.log(chalk.blue(`Using model: ${resolvedModel}`));

  this.model = resolvedModel;
  this.provider = resolvedProvider;
}
|
|
163
|
+
|
|
164
|
+
/**
 * Defines the agent tools, connects to the configured MCP servers and pins
 * the selected agent to the resolved model/provider, reporting progress
 * through a spinner.
 *
 * @throws Rethrows whatever error the initialisation raised.
 */
async initializeServices(): Promise<void> {
  const progress = ora("Initializing Knowhow services...").start();

  // Registered so the shutdown handler can stop it while it is running.
  this.activeSpinners.add(progress);

  try {
    const toolService = this.defaultServices.Tools;
    toolService.defineTools(agents.includedTools, agents.tools);

    await this.defaultServices.Mcp.connectToConfigured(toolService);

    const preference = { model: this.model, provider: this.provider as any };
    this.selectedAgent.setModelPreferences([preference]);

    progress.succeed("Services initialized successfully");
    this.activeSpinners.delete(progress);
  } catch (error) {
    progress.fail("Failed to initialize services");
    this.activeSpinners.delete(progress);
    throw error;
  }
}
|
|
198
|
+
|
|
199
|
+
/**
 * Runs `scripts/clone-exercism.sh` with the configured language and exercise
 * limit, reporting progress through a spinner.
 *
 * @throws Rethrows the error when the script fails.
 */
async setupExercises(): Promise<void> {
  const progress = ora("Setting up exercises...").start();

  // Registered so the shutdown handler can stop it while it is running.
  this.activeSpinners.add(progress);

  try {
    const cloneScript = path.join(
      __dirname,
      "..",
      "scripts",
      "clone-exercism.sh"
    );
    const scriptArgs = [
      cloneScript,
      this.config.language,
      this.config.maxExercises.toString(),
    ];
    await this.runCommand("bash", scriptArgs);

    progress.succeed("Exercises setup completed");
    this.activeSpinners.delete(progress);
  } catch (error) {
    progress.fail("Failed to setup exercises");
    this.activeSpinners.delete(progress);
    throw error;
  }
}
|
|
224
|
+
|
|
225
|
+
/**
 * Executes the whole benchmark: loads models, initialises services, sets up
 * and discovers the exercises, runs each one in turn (saving results after
 * every exercise), then saves and prints the final results.
 *
 * The loop stops early once a shutdown signal has been received.
 *
 * @returns The aggregated benchmark results.
 */
async run(): Promise<BenchmarkResults> {
  const info = (text: string) => console.log(chalk.gray(text));

  console.log(chalk.blue(`Running benchmarks with config:`));
  info(` Language: ${this.config.language}`);

  await this.loadModels();
  await this.initializeServices();

  // Model and provider are only known after loadModels() has resolved them.
  info(` Max exercises: ${this.config.maxExercises}`);
  info(` Model: ${this.model}`);
  info(` Provider: ${this.provider}`);

  const startTime = new Date();
  await this.setupExercises();
  const exercises = await this.discoverExercises();
  const results: ExerciseResult[] = [];

  console.log(chalk.blue(`\nFound ${exercises.length} exercises to run\n`));

  for (let index = 0; index < exercises.length; index += 1) {
    const exercise = exercises[index];

    if (this.isShuttingDown) {
      console.log(
        chalk.yellow("⏹️ Stopping exercise execution due to shutdown signal")
      );
      break;
    }

    console.log(chalk.yellow(`Running exercise: ${exercise.name}`));

    const outcome = await this.runExercise(exercise);
    results.push(outcome);

    console.log(
      chalk.green(
        `✓ Exercise ${results.length}/${exercises.length} completed: ${exercise.name}`
      )
    );
    const paintStatus = outcome.status === "success" ? chalk.green : chalk.red;
    console.log(paintStatus(` Status: ${outcome.status}`));
    info(` Turns: ${outcome.turns}`);
    info(` Time: ${outcome.timeElapsed.toFixed(2)}s`);
    info(` Cost: $${outcome.cost.toFixed(4)}\n`);

    // Persist after every exercise so an interrupted run still leaves data.
    await this.saveIncrementalResults(
      this.generateResults(results, startTime, new Date())
    );
  }

  const benchmarkResults = this.generateResults(
    results,
    startTime,
    new Date()
  );

  await this.saveResults(benchmarkResults);
  this.printSummary(benchmarkResults);

  return benchmarkResults;
}
|
|
289
|
+
|
|
290
|
+
/**
 * Lists the sub-directories of `<exercisesDir>/filtered` as exercises.
 *
 * Every directory becomes an exercise carrying its file listing and a flag
 * telling whether any file name contains "test" or "spec". At most
 * `config.maxExercises` entries are returned.
 *
 * @throws Error wrapping any file-system failure.
 */
private async discoverExercises(): Promise<Exercise[]> {
  const root = path.join(this.exercisesDir, "filtered");

  try {
    const entries = await fsasync.readdir(root);
    const found: Exercise[] = [];

    for (const entry of entries) {
      const entryPath = path.join(root, entry);
      const info = await fsasync.stat(entryPath);

      // Plain files at the top level are not exercises.
      if (!info.isDirectory()) {
        continue;
      }

      const files = await fsasync.readdir(entryPath);
      const looksLikeTest = (fileName: string) =>
        fileName.includes("test") || fileName.includes("spec");

      found.push({
        name: entry,
        path: entryPath,
        hasTests: files.some(looksLikeTest),
        files,
      });
    }

    return found.slice(0, this.config.maxExercises);
  } catch (error) {
    throw new Error(`Failed to discover exercises: ${error}`);
  }
}
|
|
321
|
+
|
|
322
|
+
/**
 * Runs the agent on one exercise, evaluates its tests when an evaluator is
 * available, and packages everything into an `ExerciseResult`.
 *
 * Failures while prompting, running or evaluating are converted into a
 * "failure" result rather than propagated.
 *
 * @param exercise Exercise to run.
 * @throws Error if a shutdown is already in progress when called.
 */
private async runExercise(exercise: Exercise): Promise<ExerciseResult> {
  const startTime = new Date();

  if (this.isShuttingDown) {
    throw new Error("Exercise cancelled due to shutdown");
  }

  // Stamps the end of the exercise and derives the elapsed seconds.
  const stopClock = () => {
    const endTime = new Date();
    const timeElapsed = (endTime.getTime() - startTime.getTime()) / 1000;
    return { endTime, timeElapsed };
  };

  try {
    const prompt = await this.createExercisePrompt(exercise);
    const agentRun = await this.runKnowhowAgent(exercise, prompt);

    // Tests are evaluated after the agent has finished, when supported.
    let testResult;
    if (this.evaluatorRegistry.canEvaluateExercise(exercise.path)) {
      const evaluation = await this.evaluatorRegistry.evaluateExercise(
        exercise.path,
        exercise.name
      );
      if (evaluation) {
        testResult = evaluation.testResult;
        console.log(
          chalk.gray(` Tests: ${testResult.passed}/${testResult.total} passed`)
        );
      }
    }

    const { endTime, timeElapsed } = stopClock();

    return {
      exerciseName: exercise.name,
      status: agentRun.success ? "success" : "failure",
      turns: agentRun.turns,
      testResult,
      timeElapsed,
      cost: agentRun.cost,
      startTime,
      endTime,
      errorMessage: agentRun.error,
      finalOutput: agentRun.output,
    };
  } catch (error: any) {
    const { endTime, timeElapsed } = stopClock();
    const errorMessage =
      error instanceof Error ? error.message : String(error);

    return {
      exerciseName: exercise.name,
      status: "failure",
      testResult: undefined,
      turns: error?.turns || 0,
      timeElapsed,
      cost: error?.cost || 0,
      startTime,
      endTime,
      errorMessage,
    };
  }
}
|
|
386
|
+
|
|
387
|
+
/**
 * Builds the prompt given to the agent for one exercise.
 *
 * The prompt contains the exercise description (`description.md` when it can
 * be read, otherwise just the exercise name), the list of files in the
 * exercise, and the working instructions.
 *
 * Fixes over the previous version: the closing instruction sentences were
 * appended without any separator and ran into each other, item 5 was split
 * from the numbered list by a blank line, and "as your want" was a typo.
 *
 * @param exercise Exercise to describe.
 * @returns The complete prompt text.
 */
private async createExercisePrompt(exercise: Exercise): Promise<string> {
  let prompt = `I need you to solve this coding exercise:\n\n`;

  // The description is optional: fall back to the bare name if it cannot be read.
  const descriptionPath = path.join(exercise.path, "description.md");
  try {
    const description = await fsasync.readFile(descriptionPath, "utf-8");
    prompt += `## Exercise Description\n${description}\n\n`;
  } catch {
    prompt += `## Exercise: ${exercise.name}\n\n`;
  }

  prompt += `## Files in this exercise:\n`;
  prompt += exercise.files.map((file) => `- ${file}\n`).join("");

  prompt += `\nPlease implement the solution and make sure all tests pass. Focus on:\n`;
  prompt += [
    `1. Reading and understanding the problem`,
    `2. Implementing the required functionality`,
    `3. Running tests to ensure correctness`,
    `4. Fixing any issues that arise`,
    `5. If tests are skipped you should unskip them after the initial test passes`,
  ].join("\n");
  prompt += `\n\n`;

  // One sentence per line so the instructions no longer run together.
  prompt += [
    `You should expect to have to do typical project setup tasks like npm install as a part of this eval.`,
    `Work in the current directory where all the exercise files are located.`,
    `Your score will be based on whether the tests run, and how many total passed from the file.`,
    `You are allowed to run the tests as many times as you want while you work.`,
  ].join("\n");

  return prompt;
}
|
|
418
|
+
|
|
419
|
+
/**
 * Runs the selected agent on `prompt` from inside the exercise directory and
 * collects turn count, cost, final output and any reported error.
 *
 * Fixes over the previous version:
 * - An error reported through the `done` event was stored but never
 *   returned, and `success` was unconditionally overwritten with `true`
 *   afterwards. The error is now returned and makes the run unsuccessful.
 * - Event listeners were left attached to the agent when changing directory
 *   or configuring the agent threw; setup now happens inside the
 *   `try`/`finally` that detaches them.
 * - The `done` handler no longer dereferences a missing payload.
 *
 * @param exercise Exercise whose directory becomes the working directory.
 * @param prompt Prompt passed to the agent.
 * @returns Outcome of the run; failures are reported through
 *   `success: false` and `error` instead of being thrown.
 * @throws Error if a shutdown is already in progress when called.
 */
private async runKnowhowAgent(
  exercise: Exercise,
  prompt: string
): Promise<{
  success: boolean;
  turns: number;
  cost: number;
  error?: string;
  output?: string;
}> {
  let turns = 0;
  let totalCost = 0;
  let success = false;
  let error: string | undefined;
  let output = "";
  // NOTE(review): tool usage is counted but is not part of the return value.
  const toolUsage = {} as Record<string, number>;

  if (this.isShuttingDown) {
    throw new Error("Agent execution cancelled due to shutdown");
  }

  try {
    const agent = this.selectedAgent;

    // Handlers that mirror the agent's metrics into the locals above.
    const eventHandlers = {
      threadUpdate: (messages: any) => {
        totalCost = agent.getTotalCostUsd();
        turns = agent.getTurnCount();
      },
      [agent.eventTypes.toolUsed]: (call: any) => {
        const name = call.toolCall.function.name;
        toolUsage[name] = (toolUsage[name] || 0) + 1;
      },
      costUpdate: (cost: any) => {
        if (typeof cost === "number") {
          totalCost = cost;
        }
      },
      done: (data: any) => {
        totalCost = agent.getTotalCostUsd();
        turns = agent.getTurnCount();
        if (data?.error) {
          error =
            data.error instanceof Error
              ? data.error.message
              : String(data.error);
        }
        if (data?.output) {
          output = data.output;
        }
      },
    };

    const originalCwd = process.cwd();

    try {
      Object.entries(eventHandlers).forEach(([event, handler]) => {
        agent.agentEvents.on(event, handler);
      });

      // Apply the configured limits where the agent supports them.
      if (agent.setMaxTurns) {
        agent.setMaxTurns(this.config.limits.maxTurns);
      }
      if (agent.setMaxSpend) {
        agent.setMaxSpend(this.config.limits.maxCost);
      }
      if (agent.setMaxRunTime) {
        // The limit is configured in seconds; the agent takes milliseconds.
        agent.setMaxRunTime(this.config.limits.maxTime * 1000);
      }

      agent.messageProcessor.setProcessors("post_call", [
        new XmlToolCallProcessor().createProcessor(),
        new HarmonyToolProcessor().createProcessor(),
      ]);

      // The agent works relative to the current directory.
      process.chdir(exercise.path);

      agent.newTask();
      const result = await agent.call(prompt);

      if (result && typeof result === "string") {
        output = result;
      } else if (result && typeof result === "object" && "content" in result) {
        output = String(result.content);
      }

      // The call returned normally; it only counts as a success when the
      // agent did not report an error through the `done` event.
      success = error === undefined;

      if (agent.getTurnCount) {
        turns = agent.getTurnCount();
      }
    } finally {
      // Detach first so the listeners never outlive this call, even if
      // restoring the working directory fails.
      Object.entries(eventHandlers).forEach(([event, handler]) => {
        agent.agentEvents.off(event, handler);
      });
      process.chdir(originalCwd);
    }

    return {
      success,
      turns,
      cost: totalCost,
      error,
      output,
    };
  } catch (err) {
    const errorMessage = err instanceof Error ? err.message : String(err);
    return {
      success: false,
      turns,
      cost: totalCost,
      error: errorMessage,
    };
  }
}
|
|
545
|
+
|
|
546
|
+
/**
 * Spawns `command` with `args` and resolves with its captured stdout.
 *
 * The child is tracked in `childProcesses` while it runs so the shutdown
 * handler can terminate it.
 *
 * Fix over the previous version: the shutdown check ran after the process
 * had been spawned, so a process was started only to be killed. The check
 * now happens before anything is spawned, and rejects with the same error.
 *
 * @param command Executable to run.
 * @param args Arguments passed to the executable.
 * @param options.cwd Working directory; defaults to the current one.
 * @param options.timeout Milliseconds after which the child is sent SIGKILL
 *   and the promise rejects; no timeout when omitted or 0.
 * @returns Promise resolving to stdout when the exit code is 0, rejecting
 *   otherwise (the rejection message includes stderr).
 */
private runCommand(
  command: string,
  args: string[],
  options?: {
    cwd?: string;
    timeout?: number;
  }
): Promise<string> {
  return new Promise((resolve, reject) => {
    // Do not start new work once shutdown has begun.
    if (this.isShuttingDown) {
      reject(new Error("Command cancelled due to shutdown"));
      return;
    }

    const child = spawn(command, args, {
      cwd: options?.cwd || process.cwd(),
      stdio: ["pipe", "pipe", "pipe"],
    });

    this.childProcesses.add(child);

    let stdout = "";
    let stderr = "";

    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });

    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });

    const timeout = options?.timeout;
    let timeoutId: NodeJS.Timeout | undefined;

    if (timeout) {
      timeoutId = setTimeout(() => {
        child.kill("SIGKILL");
        reject(new Error(`Command timed out after ${timeout}ms`));
      }, timeout);
    }

    // Shared teardown for both ways the child can finish.
    const release = () => {
      if (timeoutId) clearTimeout(timeoutId);
      this.childProcesses.delete(child);
    };

    child.on("close", (code) => {
      release();

      if (code === 0) {
        resolve(stdout);
      } else {
        // After a timeout the promise is already rejected; this is a no-op.
        reject(new Error(`Command failed with code ${code}: ${stderr}`));
      }
    });

    child.on("error", (error) => {
      release();
      reject(error);
    });
  });
}
|
|
614
|
+
|
|
615
|
+
/**
 * Aggregates per-exercise results into a `BenchmarkResults` object.
 *
 * `summary.successRate` is the test pass rate when at least one exercise has
 * a test result, otherwise the agent success rate. `summary.totalTime` is
 * the sum of the per-exercise times, not `endTime - startTime`. Averages and
 * rates are 0 for an empty result list.
 *
 * @param results Results gathered so far.
 * @param startTime Start of the benchmark run.
 * @param endTime End (or current time, for incremental saves) of the run.
 */
private generateResults(
  results: ExerciseResult[],
  startTime: Date,
  endTime: Date
): BenchmarkResults {
  const countWithStatus = (status: ExerciseResult["status"]) =>
    results.filter((entry) => entry.status === status).length;
  // Division by an empty list yields NaN, which is coalesced to 0.
  const perExercise = (total: number) => total / results.length || 0;

  const successCount = countWithStatus("success");
  const failureCount = countWithStatus("failure");
  const timeoutCount = countWithStatus("timeout");
  const costLimitCount = countWithStatus("cost_limit");
  const turnLimitCount = countWithStatus("turn_limit");

  // Test-based metrics only consider exercises that produced a test result.
  const testableExercises = results.filter(
    (entry) => entry.testResult !== undefined
  ).length;
  const testsPassedCount = results.filter(
    (entry) => entry.testResult?.success === true
  ).length;
  const testsFailedCount = results.filter(
    (entry) => entry.testResult && !entry.testResult.success
  ).length;

  const testPassRate =
    testableExercises > 0 ? testsPassedCount / testableExercises : 0;
  const agentSuccessRate = perExercise(successCount);
  const successRate = testableExercises > 0 ? testPassRate : agentSuccessRate;

  let totalCost = 0;
  let totalTurns = 0;
  let totalExerciseTime = 0;
  for (const entry of results) {
    totalCost += entry.cost;
    totalTurns += entry.turns;
    totalExerciseTime += entry.timeElapsed;
  }

  return {
    config: this.config,
    exercises: results,
    summary: {
      totalExercises: results.length,
      successCount,
      testableExercises,
      testsPassedCount,
      testsFailedCount,
      testPassRate,
      agentSuccessRate,
      failureCount,
      timeoutCount,
      costLimitCount,
      turnLimitCount,
      totalTime: totalExerciseTime,
      totalCost,
      averageTurns: perExercise(totalTurns),
      averageTime: perExercise(totalExerciseTime),
      successRate,
    },
    startTime,
    endTime,
  };
}
|
|
679
|
+
|
|
680
|
+
/** Commit identifier resolved on first use; see `getCommitHash()`. */
private cachedCommitHash?: string;

/**
 * Returns the short git commit hash of the current working directory, or a
 * `fallback-<timestamp>` identifier when git cannot provide one.
 *
 * The value is resolved once and reused. Previously every call re-ran git,
 * and in the fallback case every call produced a different timestamp, so
 * incremental saves, the final save and the printed results path all pointed
 * at different directories. Git's stderr is also no longer echoed to the
 * console when the lookup fails.
 */
private getCommitHash(): string {
  if (this.cachedCommitHash === undefined) {
    let hash = "";
    try {
      hash = execSync("git rev-parse --short HEAD", {
        encoding: "utf8",
        cwd: process.cwd(),
        // Capture stdout only; without this git's error text reaches the terminal.
        stdio: ["ignore", "pipe", "ignore"],
      }).trim();
    } catch {
      // Git is unavailable or this is not a repository: use the fallback below.
    }
    this.cachedCommitHash = hash || `fallback-${Date.now()}`;
  }
  return this.cachedCommitHash;
}
|
|
693
|
+
|
|
694
|
+
/**
 * Formats today's date, in local time, as `YYYY-MM-DD`.
 */
private formatDateDash(): string {
  const today = new Date();
  const twoDigits = (value: number) => String(value).padStart(2, "0");

  // getMonth() is zero-based.
  return [
    today.getFullYear(),
    twoDigits(today.getMonth() + 1),
    twoDigits(today.getDate()),
  ].join("-");
}
|
|
701
|
+
|
|
702
|
+
/**
 * Builds the results file path:
 * `<base>/<commit>/<date>/<provider>/<provider>-<model>.json`, where slashes
 * in the model name are replaced by dashes and `<base>` depends on whether
 * the `CONTAINER` environment variable is set.
 */
private generateResultsPath(): string {
  const commit = this.getCommitHash();
  const day = this.formatDateDash();

  // Model names may contain "/", which must not create extra directories.
  const flatModelName = this.model.split("/").join("-");
  const fileName = `${this.provider}-${flatModelName}.json`;

  let resultsRoot: string;
  if (process.env.CONTAINER) {
    resultsRoot = "/app/knowhow/benchmarks/results";
  } else {
    resultsRoot = path.join(__dirname, "..", "results");
  }

  return path.join(resultsRoot, commit, day, this.provider, fileName);
}
|
|
723
|
+
|
|
724
|
+
/**
 * Writes the results as pretty-printed JSON to the generated results path,
 * creating the parent directories when needed.
 *
 * @param results Results to persist.
 */
private async saveResults(results: BenchmarkResults): Promise<void> {
  const targetFile = this.generateResultsPath();
  const targetDir = path.dirname(targetFile);

  await fsasync.mkdir(targetDir, { recursive: true });

  const serialized = JSON.stringify(results, null, 2);
  await fsasync.writeFile(targetFile, serialized);
}
|
|
732
|
+
|
|
733
|
+
/**
 * Saves the results gathered so far to the regular results file.
 *
 * A failure is reported as a warning and otherwise ignored, so a failed
 * intermediate save never aborts the benchmark.
 *
 * @param results Results to persist.
 */
private async saveIncrementalResults(
  results: BenchmarkResults
): Promise<void> {
  try {
    // Same location and format as the final save.
    await this.saveResults(results);
    console.log(chalk.gray(` → Incremental results saved`));
  } catch (error) {
    const warning = ` ⚠ Warning: Failed to save incremental results: ${error}`;
    console.log(chalk.yellow(warning));
  }
}
|
|
753
|
+
|
|
754
|
+
/**
 * Prints the benchmark summary to the console.
 *
 * Test-evaluation figures are shown when at least one exercise produced a
 * test result; otherwise the agent-level status counts are shown. Averages,
 * total cost and the results path follow in both cases.
 *
 * @param results Aggregated results to report.
 */
private printSummary(results: BenchmarkResults): void {
  const { summary } = results;
  const percent = (rate: number) => `${(rate * 100).toFixed(1)}%`;

  console.log(chalk.blue("\n📊 Benchmark Summary"));
  console.log(chalk.gray("━".repeat(50)));
  console.log(chalk.white(`Total Exercises: ${summary.totalExercises}`));

  if (summary.testableExercises > 0) {
    console.log(chalk.blue("\n🧪 Test Evaluation Results:"));
    console.log(
      chalk.white(` Testable exercises: ${summary.testableExercises}`)
    );
    console.log(chalk.green(` Tests passed: ${summary.testsPassedCount}`));
    console.log(chalk.red(` Tests failed: ${summary.testsFailedCount}`));
    console.log(
      chalk.white(` Test pass rate: ${percent(summary.testPassRate)}`)
    );
    console.log(
      chalk.white(` Agent success rate: ${percent(summary.agentSuccessRate)}`)
    );
    console.log(
      chalk.white(` Overall success rate: ${percent(summary.successRate)}`)
    );
  } else {
    console.log(chalk.blue("\n🤖 Agent Evaluation Results:"));
    console.log(chalk.green(` Successful: ${summary.successCount}`));
    console.log(chalk.red(` Failed: ${summary.failureCount}`));
    console.log(chalk.yellow(` Timeouts: ${summary.timeoutCount}`));
    console.log(chalk.yellow(` Turn limits: ${summary.turnLimitCount}`));
    console.log(chalk.yellow(` Cost limits: ${summary.costLimitCount}`));
    console.log(chalk.white(` Success Rate: ${percent(summary.successRate)}`));
  }

  console.log(
    chalk.white(`Average Turns: ${summary.averageTurns.toFixed(1)}`)
  );
  console.log(chalk.white(`Average Time: ${summary.averageTime.toFixed(1)}s`));
  console.log(chalk.blue("\n📈 Performance Metrics:"));
  console.log(chalk.white(`Total Cost: $${summary.totalCost.toFixed(4)}`));

  const savedTo = this.generateResultsPath();
  console.log(chalk.gray(`Results saved to: ${savedTo}`));
}
|
|
824
|
+
}
|