@tyvm/knowhow 0.0.33 → 0.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/autodoc/plugins/downloader/downloader.mdx +2 -2
- package/benchmarks/.dockerignore +7 -0
- package/benchmarks/README.md +166 -0
- package/benchmarks/docker/Dockerfile +68 -0
- package/benchmarks/example-config.yml +27 -0
- package/benchmarks/jest.config.js +13 -0
- package/benchmarks/package-lock.json +4297 -0
- package/benchmarks/package.json +39 -0
- package/benchmarks/results/4542435/2025-08-05/lms/lms-openai-gpt-oss-20b.json +2814 -0
- package/benchmarks/results/4542435/2025-08-05/lms/lms-qwen-qwen3-30b-a3b-2507.json +2014 -0
- package/benchmarks/results/4fb9125/2025-08-07/anthropic/anthropic-claude-sonnet-4-20250514.json +3121 -0
- package/benchmarks/results/5766aee/2025-08-02/lms-qwen/qwen3-coder-30b.json +98 -0
- package/benchmarks/results/6d73808/2025-08-07/openai/openai-gpt-5.json +3256 -0
- package/benchmarks/results/77bf0a6/2025-08-02/lms-qwen/qwen3-30b-a3b-2507.json +4298 -0
- package/benchmarks/results/8c0d445/2025-08-03/anthropic/anthropic-claude-sonnet-4-20250514.json +3031 -0
- package/benchmarks/results/8c0d445/2025-08-03/openai/openai-gpt-4.1-2025-04-14.json +2990 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/anthropic/anthropic-claude-sonnet-4-20250514.json +3256 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/lms/lms-qwen-qwen3-coder-30b.json +3007 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-2025-04-14.json +3256 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-mini-2025-04-14.json +3036 -0
- package/benchmarks/results/ac6b2ab/2025-08-03/openai/openai-gpt-4.1-nano-2025-04-14.json +3280 -0
- package/benchmarks/results/adff675/2025-08-04/lms/lms-qwen-qwen3-30b-a3b-2507.json +1920 -0
- package/benchmarks/results/adff675/2025-08-04/lms/lms-qwen-qwen3-coder-30b.json +3281 -0
- package/benchmarks/results/b502ed9/2025-08-03/lms-qwen/qwen3-coder-30b.json +2896 -0
- package/benchmarks/results/d1a8129/2025-08-03/lms/lms-qwen-qwen3-coder-30b.json +3011 -0
- package/benchmarks/results/e60471c/2025-08-03/lms/qwen3-30b-a3b-2507.json +3003 -0
- package/benchmarks/scripts/build-and-run.sh +47 -0
- package/benchmarks/scripts/clone-exercism.sh +92 -0
- package/benchmarks/scripts/validate.sh +48 -0
- package/benchmarks/src/__tests__/runner.test.ts +27 -0
- package/benchmarks/src/cli.ts +90 -0
- package/benchmarks/src/evaluators/EvaluatorRegistry.ts +64 -0
- package/benchmarks/src/evaluators/JavaScriptEvaluator.ts +183 -0
- package/benchmarks/src/evaluators/index.ts +3 -0
- package/benchmarks/src/evaluators/types.ts +22 -0
- package/benchmarks/src/index.ts +3 -0
- package/benchmarks/src/providers.ts +13 -0
- package/benchmarks/src/runner.ts +824 -0
- package/benchmarks/src/types.ts +63 -0
- package/benchmarks/tsconfig.json +19 -0
- package/jest.config.js +2 -1
- package/leaderboard/README.md +148 -0
- package/leaderboard/app/api/benchmark-data/route.ts +131 -0
- package/leaderboard/app/api/benchmark-detail/route.ts +172 -0
- package/leaderboard/app/details/[model]/[provider]/[language]/page.tsx +501 -0
- package/leaderboard/app/exercise/[model]/[provider]/[language]/[exercise]/page.tsx +375 -0
- package/leaderboard/app/globals.css +27 -0
- package/leaderboard/app/layout.tsx +21 -0
- package/leaderboard/app/page.tsx +170 -0
- package/leaderboard/components/LeaderboardTable.tsx +168 -0
- package/leaderboard/components/PerformanceChart.tsx +109 -0
- package/leaderboard/next-env.d.ts +5 -0
- package/leaderboard/next.config.js +4 -0
- package/leaderboard/package-lock.json +6363 -0
- package/leaderboard/package.json +28 -0
- package/leaderboard/postcss.config.js +6 -0
- package/leaderboard/tailwind.config.js +17 -0
- package/leaderboard/tsconfig.json +28 -0
- package/leaderboard/types/benchmark.ts +67 -0
- package/leaderboard/utils/dataProcessor.ts +33 -0
- package/package.json +2 -1
- package/src/agents/base/base.ts +182 -24
- package/src/agents/base/prompt.ts +28 -0
- package/src/agents/index.ts +3 -0
- package/src/agents/patcher/patcher.ts +6 -4
- package/src/agents/setup/setup.ts +56 -0
- package/src/agents/tools/agentCall.ts +6 -2
- package/src/agents/tools/aiClient.ts +74 -8
- package/src/agents/tools/execCommand.ts +13 -14
- package/src/agents/tools/executeScript/README.md +16 -0
- package/src/agents/tools/index.ts +2 -0
- package/src/agents/tools/list.ts +73 -16
- package/src/agents/tools/startAgentTask.ts +109 -0
- package/src/agents/tools/textSearch.ts +1 -1
- package/src/agents/tools/visionTool.ts +31 -2
- package/src/agents/tools/ycmd/client.ts +608 -0
- package/src/agents/tools/ycmd/definitions.ts +294 -0
- package/src/agents/tools/ycmd/detection.ts +211 -0
- package/src/agents/tools/ycmd/index.ts +11 -0
- package/src/agents/tools/ycmd/installer.ts +251 -0
- package/src/agents/tools/ycmd/server.ts +535 -0
- package/src/agents/tools/ycmd/serverManager.ts +316 -0
- package/src/agents/tools/ycmd/tools/completion.ts +113 -0
- package/src/agents/tools/ycmd/tools/diagnostics.ts +155 -0
- package/src/agents/tools/ycmd/tools/getLocations.ts +173 -0
- package/src/agents/tools/ycmd/tools/goto.ts +169 -0
- package/src/agents/tools/ycmd/tools/refactor.ts +204 -0
- package/src/agents/tools/ycmd/tools/signature.ts +174 -0
- package/src/agents/tools/ycmd/tools/start.ts +95 -0
- package/src/agents/tools/ycmd/utils/pathUtils.ts +59 -0
- package/src/ai.ts +15 -0
- package/src/chat/CliChatService.ts +277 -0
- package/src/chat/modules/AgentModule.ts +985 -0
- package/src/chat/modules/AskModule.ts +98 -0
- package/src/chat/modules/BaseChatModule.ts +66 -0
- package/src/chat/modules/InternalChatModule.ts +174 -0
- package/src/chat/modules/SearchModule.ts +166 -0
- package/src/chat/modules/SetupModule.ts +185 -0
- package/src/chat/modules/SystemModule.ts +120 -0
- package/src/chat/modules/VoiceModule.ts +70 -0
- package/src/chat/modules/index.js +5 -0
- package/src/chat/types.ts +97 -0
- package/src/chat.ts +9 -1
- package/src/chat2.ts +62 -0
- package/src/cli.ts +264 -35
- package/src/clients/anthropic.ts +14 -7
- package/src/clients/gemini.ts +15 -7
- package/src/clients/http.ts +17 -7
- package/src/clients/index.ts +117 -4
- package/src/clients/knowhow.ts +7 -2
- package/src/clients/knowhowMcp.ts +118 -0
- package/src/clients/openai.ts +32 -8
- package/src/clients/types.ts +1 -0
- package/src/clients/xai.ts +17 -5
- package/src/config.ts +30 -5
- package/src/conversion.ts +4 -1
- package/src/login.ts +26 -9
- package/src/microphone.ts +0 -1
- package/src/plugins/downloader/downloader.ts +191 -49
- package/src/plugins/downloader/plugin.ts +3 -1
- package/src/plugins/plugins.ts +3 -0
- package/src/processors/CustomVariables.ts +425 -0
- package/src/processors/HarmonyToolProcessor.ts +264 -0
- package/src/processors/XmlToolCallProcessor.ts +533 -0
- package/src/processors/index.ts +3 -0
- package/src/prompts/KnowhowConfigExamples.ts +376 -0
- package/src/services/KnowhowClient.ts +49 -3
- package/src/services/Mcp.ts +42 -3
- package/src/services/McpServer.ts +14 -4
- package/src/services/McpWebsocketTransport.ts +21 -7
- package/src/services/MessageProcessor.ts +10 -5
- package/src/services/index.ts +5 -0
- package/src/services/script-execution/ScriptExecutor.ts +34 -1
- package/src/services/types.ts +17 -14
- package/src/types.ts +17 -0
- package/src/utils/index.ts +138 -0
- package/tests/XmlToolCallProcessor.test.ts +468 -0
- package/tests/manual/ycmd/debug_diagnostics_test.ts +127 -0
- package/tests/manual/ycmd/fixtures/debug_diagnostics.ts +26 -0
- package/tests/manual/ycmd/fixtures/file_change_test.ts +17 -0
- package/tests/manual/ycmd/minimal_advanced_test.ts +108 -0
- package/tests/manual/ycmd/simple_diagnostics_test.ts +61 -0
- package/tests/manual/ycmd/simple_test.ts +74 -0
- package/tests/manual/ycmd/test-typescript-sample.ts +34 -0
- package/tests/manual/ycmd/test_advanced_features.ts +407 -0
- package/tests/manual/ycmd/test_advanced_with_tools.ts +320 -0
- package/tests/manual/ycmd/test_comprehensive_typescript.ts +179 -0
- package/tests/manual/ycmd/test_diagnostics_file_changes.ts +249 -0
- package/tests/manual/ycmd/test_diagnostics_fix.ts +99 -0
- package/tests/manual/ycmd/test_diagnostics_simple.ts +100 -0
- package/tests/manual/ycmd/test_diagnostics_timing.ts +120 -0
- package/tests/manual/ycmd/test_discover_commands.ts +310 -0
- package/tests/manual/ycmd/test_endpoints.ts +115 -0
- package/tests/manual/ycmd/test_final_comprehensive.ts +218 -0
- package/tests/manual/ycmd/test_final_validation.ts +150 -0
- package/tests/manual/ycmd/test_implementation.js +42 -0
- package/tests/manual/ycmd/test_individual_ycmd_tool.ts +39 -0
- package/tests/manual/ycmd/test_server_manager.ts +52 -0
- package/tests/manual/ycmd/test_simple_debug.ts +86 -0
- package/tests/manual/ycmd/test_tsserver_workflow.js +83 -0
- package/tests/manual/ycmd/test_tsserver_workflow.ts +122 -0
- package/tests/manual/ycmd/test_typescript_simple.ts +48 -0
- package/tests/manual/ycmd/test_typescript_ycmd.ts +105 -0
- package/tests/manual/ycmd/test_workspace_config.ts +90 -0
- package/tests/manual/ycmd/test_ycmd_auto_start.ts +137 -0
- package/tests/manual/ycmd/test_ycmd_comprehensive.ts +73 -0
- package/tests/manual/ycmd/test_ycmd_connection.py +10 -0
- package/tests/manual/ycmd/test_ycmd_direct.ts +142 -0
- package/tests/manual/ycmd/test_ycmd_experiment.ts +48 -0
- package/tests/manual/ycmd/test_ycmd_final.ts +200 -0
- package/tests/manual/ycmd/test_ycmd_fixed.py +18 -0
- package/tests/manual/ycmd/test_ycmd_integration.ts +112 -0
- package/tests/manual/ycmd/test_ycmd_simple.ts +45 -0
- package/tests/manual/ycmd/test_ycmd_usage.py +27 -0
- package/tests/manual/ycmd/working_simple_test.ts +134 -0
- package/ts_build/src/agents/base/base.d.ts +15 -1
- package/ts_build/src/agents/base/base.js +121 -20
- package/ts_build/src/agents/base/base.js.map +1 -1
- package/ts_build/src/agents/base/prompt.d.ts +1 -1
- package/ts_build/src/agents/base/prompt.js +28 -0
- package/ts_build/src/agents/base/prompt.js.map +1 -1
- package/ts_build/src/agents/index.d.ts +2 -0
- package/ts_build/src/agents/index.js +2 -0
- package/ts_build/src/agents/index.js.map +1 -1
- package/ts_build/src/agents/patcher/patcher.js +6 -3
- package/ts_build/src/agents/patcher/patcher.js.map +1 -1
- package/ts_build/src/agents/setup/setup.d.ts +8 -0
- package/ts_build/src/agents/setup/setup.js +59 -0
- package/ts_build/src/agents/setup/setup.js.map +1 -0
- package/ts_build/src/agents/tools/agentCall.js +5 -2
- package/ts_build/src/agents/tools/agentCall.js.map +1 -1
- package/ts_build/src/agents/tools/aiClient.d.ts +6 -5
- package/ts_build/src/agents/tools/aiClient.js +37 -6
- package/ts_build/src/agents/tools/aiClient.js.map +1 -1
- package/ts_build/src/agents/tools/execCommand.d.ts +2 -2
- package/ts_build/src/agents/tools/execCommand.js +5 -6
- package/ts_build/src/agents/tools/execCommand.js.map +1 -1
- package/ts_build/src/agents/tools/executeScript/index.d.ts +1 -1
- package/ts_build/src/agents/tools/index.d.ts +2 -0
- package/ts_build/src/agents/tools/index.js +2 -0
- package/ts_build/src/agents/tools/index.js.map +1 -1
- package/ts_build/src/agents/tools/list.js +66 -16
- package/ts_build/src/agents/tools/list.js.map +1 -1
- package/ts_build/src/agents/tools/startAgentTask.d.ts +13 -0
- package/ts_build/src/agents/tools/startAgentTask.js +74 -0
- package/ts_build/src/agents/tools/startAgentTask.js.map +1 -0
- package/ts_build/src/agents/tools/startChatTask.d.ts +13 -0
- package/ts_build/src/agents/tools/startChatTask.js +73 -0
- package/ts_build/src/agents/tools/startChatTask.js.map +1 -0
- package/ts_build/src/agents/tools/textSearch.js +1 -1
- package/ts_build/src/agents/tools/textSearch.js.map +1 -1
- package/ts_build/src/agents/tools/visionTool.d.ts +1 -1
- package/ts_build/src/agents/tools/visionTool.js +23 -3
- package/ts_build/src/agents/tools/visionTool.js.map +1 -1
- package/ts_build/src/agents/tools/ycmd/client.d.ts +93 -0
- package/ts_build/src/agents/tools/ycmd/client.js +355 -0
- package/ts_build/src/agents/tools/ycmd/client.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/definitions.d.ts +345 -0
- package/ts_build/src/agents/tools/ycmd/definitions.js +298 -0
- package/ts_build/src/agents/tools/ycmd/definitions.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/detection.d.ts +11 -0
- package/ts_build/src/agents/tools/ycmd/detection.js +175 -0
- package/ts_build/src/agents/tools/ycmd/detection.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/index.d.ts +8 -0
- package/ts_build/src/agents/tools/ycmd/index.js +20 -0
- package/ts_build/src/agents/tools/ycmd/index.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/installer.d.ts +19 -0
- package/ts_build/src/agents/tools/ycmd/installer.js +196 -0
- package/ts_build/src/agents/tools/ycmd/installer.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/server.d.ts +35 -0
- package/ts_build/src/agents/tools/ycmd/server.js +363 -0
- package/ts_build/src/agents/tools/ycmd/server.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/serverManager.d.ts +39 -0
- package/ts_build/src/agents/tools/ycmd/serverManager.js +210 -0
- package/ts_build/src/agents/tools/ycmd/serverManager.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/completion.d.ts +22 -0
- package/ts_build/src/agents/tools/ycmd/tools/completion.js +72 -0
- package/ts_build/src/agents/tools/ycmd/tools/completion.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/diagnostics.d.ts +42 -0
- package/ts_build/src/agents/tools/ycmd/tools/diagnostics.js +88 -0
- package/ts_build/src/agents/tools/ycmd/tools/diagnostics.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/getLocations.d.ts +22 -0
- package/ts_build/src/agents/tools/ycmd/tools/getLocations.js +142 -0
- package/ts_build/src/agents/tools/ycmd/tools/getLocations.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/goto.d.ts +20 -0
- package/ts_build/src/agents/tools/ycmd/tools/goto.js +101 -0
- package/ts_build/src/agents/tools/ycmd/tools/goto.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/refactor.d.ts +32 -0
- package/ts_build/src/agents/tools/ycmd/tools/refactor.js +123 -0
- package/ts_build/src/agents/tools/ycmd/tools/refactor.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/signature.d.ts +25 -0
- package/ts_build/src/agents/tools/ycmd/tools/signature.js +110 -0
- package/ts_build/src/agents/tools/ycmd/tools/signature.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/tools/start.d.ts +17 -0
- package/ts_build/src/agents/tools/ycmd/tools/start.js +65 -0
- package/ts_build/src/agents/tools/ycmd/tools/start.js.map +1 -0
- package/ts_build/src/agents/tools/ycmd/utils/pathUtils.d.ts +4 -0
- package/ts_build/src/agents/tools/ycmd/utils/pathUtils.js +67 -0
- package/ts_build/src/agents/tools/ycmd/utils/pathUtils.js.map +1 -0
- package/ts_build/src/ai.d.ts +1 -0
- package/ts_build/src/ai.js +40 -1
- package/ts_build/src/ai.js.map +1 -1
- package/ts_build/src/chat/ChatCommandHandler.d.ts +36 -0
- package/ts_build/src/chat/ChatCommandHandler.js +268 -0
- package/ts_build/src/chat/ChatCommandHandler.js.map +1 -0
- package/ts_build/src/chat/ChatInputManager.d.ts +22 -0
- package/ts_build/src/chat/ChatInputManager.js +85 -0
- package/ts_build/src/chat/ChatInputManager.js.map +1 -0
- package/ts_build/src/chat/ChatManager.d.ts +49 -0
- package/ts_build/src/chat/ChatManager.js +271 -0
- package/ts_build/src/chat/ChatManager.js.map +1 -0
- package/ts_build/src/chat/ChatSession.d.ts +32 -0
- package/ts_build/src/chat/ChatSession.js +3 -0
- package/ts_build/src/chat/ChatSession.js.map +1 -0
- package/ts_build/src/chat/ChatSessionManager.d.ts +19 -0
- package/ts_build/src/chat/ChatSessionManager.js +188 -0
- package/ts_build/src/chat/ChatSessionManager.js.map +1 -0
- package/ts_build/src/chat/ChatStateManager.d.ts +58 -0
- package/ts_build/src/chat/ChatStateManager.js +156 -0
- package/ts_build/src/chat/ChatStateManager.js.map +1 -0
- package/ts_build/src/chat/CliChatService.d.ts +35 -0
- package/ts_build/src/chat/CliChatService.js +201 -0
- package/ts_build/src/chat/CliChatService.js.map +1 -0
- package/ts_build/src/chat/InterruptibleInput.d.ts +20 -0
- package/ts_build/src/chat/InterruptibleInput.js +109 -0
- package/ts_build/src/chat/InterruptibleInput.js.map +1 -0
- package/ts_build/src/chat/interfaces/ChatModule.d.ts +6 -0
- package/ts_build/src/chat/interfaces/ChatModule.js +3 -0
- package/ts_build/src/chat/interfaces/ChatModule.js.map +1 -0
- package/ts_build/src/chat/modules/AgentModule.d.ts +57 -0
- package/ts_build/src/chat/modules/AgentModule.js +709 -0
- package/ts_build/src/chat/modules/AgentModule.js.map +1 -0
- package/ts_build/src/chat/modules/AskModule.d.ts +10 -0
- package/ts_build/src/chat/modules/AskModule.js +63 -0
- package/ts_build/src/chat/modules/AskModule.js.map +1 -0
- package/ts_build/src/chat/modules/BaseChatModule.d.ts +14 -0
- package/ts_build/src/chat/modules/BaseChatModule.js +32 -0
- package/ts_build/src/chat/modules/BaseChatModule.js.map +1 -0
- package/ts_build/src/chat/modules/InternalChatModule.d.ts +24 -0
- package/ts_build/src/chat/modules/InternalChatModule.js +127 -0
- package/ts_build/src/chat/modules/InternalChatModule.js.map +1 -0
- package/ts_build/src/chat/modules/SearchModule.d.ts +12 -0
- package/ts_build/src/chat/modules/SearchModule.js +119 -0
- package/ts_build/src/chat/modules/SearchModule.js.map +1 -0
- package/ts_build/src/chat/modules/SetupModule.d.ts +15 -0
- package/ts_build/src/chat/modules/SetupModule.js +147 -0
- package/ts_build/src/chat/modules/SetupModule.js.map +1 -0
- package/ts_build/src/chat/modules/SystemModule.d.ts +14 -0
- package/ts_build/src/chat/modules/SystemModule.js +90 -0
- package/ts_build/src/chat/modules/SystemModule.js.map +1 -0
- package/ts_build/src/chat/modules/VoiceModule.d.ts +11 -0
- package/ts_build/src/chat/modules/VoiceModule.js +57 -0
- package/ts_build/src/chat/modules/VoiceModule.js.map +1 -0
- package/ts_build/src/chat/types.d.ts +83 -0
- package/ts_build/src/chat/types.js +3 -0
- package/ts_build/src/chat/types.js.map +1 -0
- package/ts_build/src/chat.js +7 -1
- package/ts_build/src/chat.js.map +1 -1
- package/ts_build/src/chat2.d.ts +3 -0
- package/ts_build/src/chat2.js +47 -0
- package/ts_build/src/chat2.js.map +1 -0
- package/ts_build/src/cli.js +218 -37
- package/ts_build/src/cli.js.map +1 -1
- package/ts_build/src/clients/anthropic.d.ts +5 -2
- package/ts_build/src/clients/anthropic.js +12 -7
- package/ts_build/src/clients/anthropic.js.map +1 -1
- package/ts_build/src/clients/gemini.d.ts +6 -3
- package/ts_build/src/clients/gemini.js +13 -7
- package/ts_build/src/clients/gemini.js.map +1 -1
- package/ts_build/src/clients/http.d.ts +1 -0
- package/ts_build/src/clients/http.js +12 -5
- package/ts_build/src/clients/http.js.map +1 -1
- package/ts_build/src/clients/index.d.ts +10 -0
- package/ts_build/src/clients/index.js +74 -4
- package/ts_build/src/clients/index.js.map +1 -1
- package/ts_build/src/clients/knowhow.d.ts +3 -1
- package/ts_build/src/clients/knowhow.js +8 -2
- package/ts_build/src/clients/knowhow.js.map +1 -1
- package/ts_build/src/clients/knowhowMcp.d.ts +20 -0
- package/ts_build/src/clients/knowhowMcp.js +86 -0
- package/ts_build/src/clients/knowhowMcp.js.map +1 -0
- package/ts_build/src/clients/openai.d.ts +5 -2
- package/ts_build/src/clients/openai.js +29 -8
- package/ts_build/src/clients/openai.js.map +1 -1
- package/ts_build/src/clients/types.d.ts +1 -0
- package/ts_build/src/clients/xai.d.ts +5 -2
- package/ts_build/src/clients/xai.js +15 -5
- package/ts_build/src/clients/xai.js.map +1 -1
- package/ts_build/src/config.js +24 -3
- package/ts_build/src/config.js.map +1 -1
- package/ts_build/src/conversion.js +6 -4
- package/ts_build/src/conversion.js.map +1 -1
- package/ts_build/src/login.d.ts +1 -1
- package/ts_build/src/login.js +21 -7
- package/ts_build/src/login.js.map +1 -1
- package/ts_build/src/microphone.js.map +1 -1
- package/ts_build/src/plugins/downloader/downloader.d.ts +7 -5
- package/ts_build/src/plugins/downloader/downloader.js +147 -44
- package/ts_build/src/plugins/downloader/downloader.js.map +1 -1
- package/ts_build/src/plugins/downloader/plugin.js +5 -3
- package/ts_build/src/plugins/downloader/plugin.js.map +1 -1
- package/ts_build/src/plugins/plugins.js +3 -0
- package/ts_build/src/plugins/plugins.js.map +1 -1
- package/ts_build/src/processors/CustomVariables.d.ts +32 -0
- package/ts_build/src/processors/CustomVariables.js +297 -0
- package/ts_build/src/processors/CustomVariables.js.map +1 -0
- package/ts_build/src/processors/HarmonyToolProcessor.d.ts +15 -0
- package/ts_build/src/processors/HarmonyToolProcessor.js +154 -0
- package/ts_build/src/processors/HarmonyToolProcessor.js.map +1 -0
- package/ts_build/src/processors/XmlToolCallProcessor.d.ts +14 -0
- package/ts_build/src/processors/XmlToolCallProcessor.js +357 -0
- package/ts_build/src/processors/XmlToolCallProcessor.js.map +1 -0
- package/ts_build/src/processors/index.d.ts +3 -0
- package/ts_build/src/processors/index.js +7 -1
- package/ts_build/src/processors/index.js.map +1 -1
- package/ts_build/src/prompts/KnowhowConfigExamples.d.ts +2 -0
- package/ts_build/src/prompts/KnowhowConfigExamples.js +379 -0
- package/ts_build/src/prompts/KnowhowConfigExamples.js.map +1 -0
- package/ts_build/src/services/KnowhowClient.d.ts +22 -0
- package/ts_build/src/services/KnowhowClient.js +14 -2
- package/ts_build/src/services/KnowhowClient.js.map +1 -1
- package/ts_build/src/services/Mcp.d.ts +1 -0
- package/ts_build/src/services/Mcp.js +20 -3
- package/ts_build/src/services/Mcp.js.map +1 -1
- package/ts_build/src/services/McpServer.d.ts +1 -1
- package/ts_build/src/services/McpServer.js +8 -4
- package/ts_build/src/services/McpServer.js.map +1 -1
- package/ts_build/src/services/McpWebsocketTransport.js +17 -7
- package/ts_build/src/services/McpWebsocketTransport.js.map +1 -1
- package/ts_build/src/services/MessageProcessor.d.ts +1 -1
- package/ts_build/src/services/MessageProcessor.js +4 -4
- package/ts_build/src/services/MessageProcessor.js.map +1 -1
- package/ts_build/src/services/index.d.ts +2 -0
- package/ts_build/src/services/index.js +4 -0
- package/ts_build/src/services/index.js.map +1 -1
- package/ts_build/src/services/script-execution/ScriptExecutor.d.ts +1 -0
- package/ts_build/src/services/script-execution/ScriptExecutor.js +23 -0
- package/ts_build/src/services/script-execution/ScriptExecutor.js.map +1 -1
- package/ts_build/src/services/types.d.ts +2 -6
- package/ts_build/src/services/types.js +4 -4
- package/ts_build/src/services/types.js.map +1 -1
- package/ts_build/src/types.d.ts +11 -0
- package/ts_build/src/types.js +8 -0
- package/ts_build/src/types.js.map +1 -1
- package/ts_build/src/utils/index.d.ts +2 -0
- package/ts_build/src/utils/index.js +102 -1
- package/ts_build/src/utils/index.js.map +1 -1
- package/ts_build/tests/XmlToolCallProcessor.test.d.ts +1 -0
- package/ts_build/tests/XmlToolCallProcessor.test.js +376 -0
- package/ts_build/tests/XmlToolCallProcessor.test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/debug_diagnostics_test.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/debug_diagnostics_test.js +114 -0
- package/ts_build/tests/manual/ycmd/debug_diagnostics_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/minimal_advanced_test.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/minimal_advanced_test.js +104 -0
- package/ts_build/tests/manual/ycmd/minimal_advanced_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/simple_diagnostics_test.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/simple_diagnostics_test.js +74 -0
- package/ts_build/tests/manual/ycmd/simple_diagnostics_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/simple_test.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/simple_test.js +82 -0
- package/ts_build/tests/manual/ycmd/simple_test.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test-typescript-sample.d.ts +14 -0
- package/ts_build/tests/manual/ycmd/test-typescript-sample.js +20 -0
- package/ts_build/tests/manual/ycmd/test-typescript-sample.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_advanced_features.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_advanced_features.js +297 -0
- package/ts_build/tests/manual/ycmd/test_advanced_features.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_advanced_with_tools.d.ts +3 -0
- package/ts_build/tests/manual/ycmd/test_advanced_with_tools.js +262 -0
- package/ts_build/tests/manual/ycmd/test_advanced_with_tools.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.js +186 -0
- package/ts_build/tests/manual/ycmd/test_comprehensive_typescript.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.js +174 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_file_changes.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_fix.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_fix.js +106 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_fix.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_simple.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_simple.js +104 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_simple.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_timing.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_timing.js +119 -0
- package/ts_build/tests/manual/ycmd/test_diagnostics_timing.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_discover_commands.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_discover_commands.js +243 -0
- package/ts_build/tests/manual/ycmd/test_discover_commands.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_endpoints.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_endpoints.js +120 -0
- package/ts_build/tests/manual/ycmd/test_endpoints.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_final_comprehensive.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_final_comprehensive.js +221 -0
- package/ts_build/tests/manual/ycmd/test_final_comprehensive.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_final_validation.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_final_validation.js +160 -0
- package/ts_build/tests/manual/ycmd/test_final_validation.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.js +37 -0
- package/ts_build/tests/manual/ycmd/test_individual_ycmd_tool.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_server_manager.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_server_manager.js +38 -0
- package/ts_build/tests/manual/ycmd/test_server_manager.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_simple_debug.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_simple_debug.js +99 -0
- package/ts_build/tests/manual/ycmd/test_simple_debug.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_tsserver_workflow.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_tsserver_workflow.js +128 -0
- package/ts_build/tests/manual/ycmd/test_tsserver_workflow.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_simple.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_simple.js +66 -0
- package/ts_build/tests/manual/ycmd/test_typescript_simple.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_ycmd.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_typescript_ycmd.js +105 -0
- package/ts_build/tests/manual/ycmd/test_typescript_ycmd.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_workspace_config.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_workspace_config.js +89 -0
- package/ts_build/tests/manual/ycmd/test_workspace_config.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.js +130 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_auto_start.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.d.ts +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.js +83 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_comprehensive.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_direct.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_direct.js +149 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_direct.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_experiment.d.ts +15 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_experiment.js +58 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_experiment.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_final.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_final.js +195 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_final.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_integration.d.ts +3 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_integration.js +110 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_integration.js.map +1 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_simple.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_simple.js +36 -0
- package/ts_build/tests/manual/ycmd/test_ycmd_simple.js.map +1 -0
- package/ts_build/tests/manual/ycmd/working_simple_test.d.ts +2 -0
- package/ts_build/tests/manual/ycmd/working_simple_test.js +134 -0
- package/ts_build/tests/manual/ycmd/working_simple_test.js.map +1 -0
- package/tsconfig.json +3 -1
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { TestResult } from './evaluators/types';
|
|
2
|
+
|
|
3
|
+
export interface BenchmarkConfig {
|
|
4
|
+
language: string;
|
|
5
|
+
maxExercises: number;
|
|
6
|
+
model: string;
|
|
7
|
+
provider: string;
|
|
8
|
+
agent?: string; // Agent type to use (default: 'Patcher')
|
|
9
|
+
limits: BenchmarkLimits;
|
|
10
|
+
outputFile: string;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export interface BenchmarkLimits {
|
|
14
|
+
maxTurns: number;
|
|
15
|
+
maxTime: number; // in seconds
|
|
16
|
+
maxCost: number; // in dollars
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export interface ExerciseResult {
|
|
20
|
+
exerciseName: string;
|
|
21
|
+
status: 'success' | 'failure' | 'timeout' | 'cost_limit' | 'turn_limit';
|
|
22
|
+
testResult?: TestResult; // Actual test execution results
|
|
23
|
+
turns: number;
|
|
24
|
+
timeElapsed: number; // in seconds
|
|
25
|
+
cost: number; // in dollars
|
|
26
|
+
startTime: Date;
|
|
27
|
+
endTime: Date;
|
|
28
|
+
errorMessage?: string;
|
|
29
|
+
finalOutput?: string;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export interface BenchmarkResults {
|
|
33
|
+
config: BenchmarkConfig;
|
|
34
|
+
exercises: ExerciseResult[];
|
|
35
|
+
summary: {
|
|
36
|
+
totalExercises: number;
|
|
37
|
+
testableExercises: number; // Exercises that had evaluatable tests
|
|
38
|
+
testsPassedCount: number; // Exercises where all tests passed
|
|
39
|
+
testsFailedCount: number; // Exercises where some tests failed
|
|
40
|
+
testPassRate: number; // Percentage of testable exercises where tests passed
|
|
41
|
+
agentSuccessRate: number; // Original success rate (agent thinks it succeeded)
|
|
42
|
+
successCount: number;
|
|
43
|
+
failureCount: number;
|
|
44
|
+
timeoutCount: number;
|
|
45
|
+
costLimitCount: number;
|
|
46
|
+
turnLimitCount: number;
|
|
47
|
+
totalTime: number;
|
|
48
|
+
totalCost: number;
|
|
49
|
+
averageTurns: number;
|
|
50
|
+
averageTime: number;
|
|
51
|
+
successRate: number;
|
|
52
|
+
};
|
|
53
|
+
startTime: Date;
|
|
54
|
+
endTime: Date;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/** Description of an exercise that can be benchmarked. */
export interface Exercise {
|
|
58
|
+
name: string;
|
|
59
|
+
path: string; // presumably the exercise's location on disk — confirm against the loader
|
|
60
|
+
description?: string;
|
|
61
|
+
hasTests: boolean; // presumably whether the exercise ships tests that can be evaluated — confirm
|
|
62
|
+
files: string[]; // presumably file names/paths belonging to the exercise — confirm whether relative or absolute
|
|
63
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ES2020",
|
|
4
|
+
"module": "commonjs",
|
|
5
|
+
"lib": ["ES2020"],
|
|
6
|
+
"outDir": "./dist",
|
|
7
|
+
"rootDir": "./src",
|
|
8
|
+
"strict": true,
|
|
9
|
+
"esModuleInterop": true,
|
|
10
|
+
"skipLibCheck": true,
|
|
11
|
+
"forceConsistentCasingInFileNames": true,
|
|
12
|
+
"resolveJsonModule": true,
|
|
13
|
+
"declaration": true,
|
|
14
|
+
"declarationMap": true,
|
|
15
|
+
"sourceMap": true,
|
|
16
|
+
},
|
|
17
|
+
"include": ["src/**/*"],
|
|
18
|
+
"exclude": ["node_modules", "dist", "**/*.test.ts"]
|
|
19
|
+
}
|
package/jest.config.js
CHANGED
|
@@ -14,5 +14,6 @@ module.exports = {
|
|
|
14
14
|
testEnvironment: 'node',
|
|
15
15
|
testRegex: '/tests/.*\.(test|spec)?\.(ts|tsx|js)$',
|
|
16
16
|
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
|
|
17
|
-
modulePathIgnorePatterns: ["ts_build"]
|
|
17
|
+
modulePathIgnorePatterns: ["ts_build", "benchmarks"],
|
|
18
|
+
testPathIgnorePatterns: ["<rootDir>/benchmarks/"]
|
|
18
19
|
};
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Benchmark Results Leaderboard
|
|
2
|
+
|
|
3
|
+
A Next.js application to display and analyze benchmark results from coding exercise evaluations.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Interactive Leaderboard**: Sortable table showing model performance metrics
|
|
8
|
+
- **Data Visualization**: Charts comparing success rates and cost vs performance
|
|
9
|
+
- **Model Comparison**: Detailed statistics for each model/provider/language combination
|
|
10
|
+
- **Responsive Design**: Works on desktop and mobile devices
|
|
11
|
+
- **Real-time Data**: Automatically loads latest benchmark results
|
|
12
|
+
|
|
13
|
+
## Getting Started
|
|
14
|
+
|
|
15
|
+
### Prerequisites
|
|
16
|
+
|
|
17
|
+
- Node.js 18+
|
|
18
|
+
- npm or yarn
|
|
19
|
+
|
|
20
|
+
### Installation
|
|
21
|
+
|
|
22
|
+
1. Install dependencies:
|
|
23
|
+
```bash
|
|
24
|
+
npm install
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
2. Run the development server:
|
|
28
|
+
```bash
|
|
29
|
+
npm run dev
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
3. Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
|
|
33
|
+
|
|
34
|
+
### Data Source
|
|
35
|
+
|
|
36
|
+
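The application reads benchmark results from the JSON files under `../benchmarks/results/`; the directory is searched recursively, so result files in nested subdirectories are picked up as well. Make sure to run benchmarks first to generate data.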
|
|
37
|
+
|
|
38
|
+
Expected file structure:
|
|
39
|
+
```
|
|
40
|
+
benchmarks/
|
|
41
|
+
results/
|
|
42
|
+
results.json # Main results file
|
|
43
|
+
# Additional result files can be added here
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Project Structure
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
leaderboard/
|
|
50
|
+
├── app/
|
|
51
|
+
│ ├── globals.css # Global styles
|
|
52
|
+
│ ├── layout.tsx # Root layout
|
|
53
|
+
│ └── page.tsx # Main page
|
|
54
|
+
├── components/
|
|
55
|
+
│ ├── LeaderboardTable.tsx # Sortable results table
|
|
56
|
+
│ └── PerformanceChart.tsx # Data visualization
|
|
57
|
+
├── types/
|
|
58
|
+
│ └── benchmark.ts # TypeScript interfaces
|
|
59
|
+
├── utils/
|
|
60
|
+
│ └── dataProcessor.ts # Data loading and aggregation
|
|
61
|
+
└── package.json
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Available Scripts
|
|
65
|
+
|
|
66
|
+
- `npm run dev` - Start development server
|
|
67
|
+
- `npm run build` - Build for production
|
|
68
|
+
- `npm run start` - Start production server
|
|
69
|
+
- `npm run lint` - Run ESLint
|
|
70
|
+
|
|
71
|
+
## Metrics Displayed
|
|
72
|
+
|
|
73
|
+
- **Success Rate**: Percentage of exercises completed successfully
|
|
74
|
+
- **Total Exercises**: Number of exercises attempted
|
|
75
|
+
- **Average Cost**: Mean cost per exercise in USD
|
|
76
|
+
- **Average Time**: Mean time per exercise in seconds
|
|
77
|
+
- **Average Turns**: Mean number of agent turns per exercise
|
|
78
|
+
- **Total Runs**: Number of benchmark runs for this model/language
|
|
79
|
+
|
|
80
|
+
## Charts
|
|
81
|
+
|
|
82
|
+
1. **Success Rate Comparison**: Bar chart showing top 10 models by success rate
|
|
83
|
+
2. **Cost vs Performance**: Scatter plot comparing cost efficiency vs success rate
|
|
84
|
+
|
|
85
|
+
## Customization
|
|
86
|
+
|
|
87
|
+
### Adding New Data Sources
|
|
88
|
+
|
|
89
|
+
Modify `utils/dataProcessor.ts` to load additional result files or change the aggregation logic.
|
|
90
|
+
|
|
91
|
+
### Styling
|
|
92
|
+
|
|
93
|
+
The application uses Tailwind CSS. Modify component styles directly in the `.tsx` component files or update `globals.css` for global changes.
|
|
94
|
+
|
|
95
|
+
### Adding New Charts
|
|
96
|
+
|
|
97
|
+
Use the Recharts library to create additional visualizations in `components/PerformanceChart.tsx`.
|
|
98
|
+
|
|
99
|
+
## Deployment
|
|
100
|
+
|
|
101
|
+
### Build for Production
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
npm run build
|
|
105
|
+
npm run start
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Deploy to Vercel
|
|
109
|
+
|
|
110
|
+
The easiest way to deploy is using the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme).
|
|
111
|
+
|
|
112
|
+
## Data Format
|
|
113
|
+
|
|
114
|
+
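The application expects benchmark results in the format below. The example is abbreviated: the API routes also read `summary.averageTime` and `summary.averageTurns`, and use a top-level `endTime` on each result to decide which run is the most recent.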
|
|
115
|
+
|
|
116
|
+
```json
|
|
117
|
+
{
|
|
118
|
+
"config": {
|
|
119
|
+
"language": "javascript",
|
|
120
|
+
"model": "claude-sonnet-4",
|
|
121
|
+
"provider": "anthropic",
|
|
122
|
+
"maxExercises": 1,
|
|
123
|
+
"limits": {
|
|
124
|
+
"maxTurns": 20,
|
|
125
|
+
"maxTime": 300,
|
|
126
|
+
"maxCost": 1
|
|
127
|
+
}
|
|
128
|
+
},
|
|
129
|
+
"exercises": [
|
|
130
|
+
{
|
|
131
|
+
"exerciseName": "accumulate",
|
|
132
|
+
"status": "success",
|
|
133
|
+
"turns": 1,
|
|
134
|
+
"timeElapsed": 46.668,
|
|
135
|
+
"cost": 0.090424,
|
|
136
|
+
"startTime": "2025-08-02T07:26:04.029Z",
|
|
137
|
+
"endTime": "2025-08-02T07:26:50.697Z"
|
|
138
|
+
}
|
|
139
|
+
],
|
|
140
|
+
"summary": {
|
|
141
|
+
"totalExercises": 1,
|
|
142
|
+
"successCount": 1,
|
|
143
|
+
"totalTime": 46.668,
|
|
144
|
+
"totalCost": 0.090424,
|
|
145
|
+
"successRate": 1
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
```
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import { NextRequest, NextResponse } from 'next/server';
|
|
2
|
+
import { BenchmarkResults, LeaderboardEntry } from '@/types/benchmark';
|
|
3
|
+
import fs from 'fs';
|
|
4
|
+
import path from 'path';
|
|
5
|
+
|
|
6
|
+
/**
 * GET handler for the leaderboard API route.
 *
 * Loads every benchmark result file, aggregates the results into one
 * leaderboard entry per model/provider/language, and returns them as JSON.
 *
 * On failure the error is logged. Outside production a single placeholder
 * entry is returned so the UI can still be developed without result data; in
 * production a 500 response is returned instead, so fabricated numbers are
 * never presented as real benchmark results.
 *
 * @param request Incoming request (not inspected).
 * @returns JSON array of LeaderboardEntry, or `{ error }` with status 500.
 */
export async function GET(request: NextRequest) {
  try {
    const results = await loadAllBenchmarkResults();
    const leaderboardData = aggregateResults(results);
    return NextResponse.json(leaderboardData);
  } catch (error) {
    console.error('Error loading benchmark results:', error);

    if (process.env.NODE_ENV === 'production') {
      return NextResponse.json(
        { error: 'Internal server error' },
        { status: 500 }
      );
    }

    // Return mock data for development
    const mockData: LeaderboardEntry[] = [
      {
        model: 'sample-model',
        provider: 'sample-provider',
        language: 'javascript',
        successRate: 85.5,
        totalExercises: 6,
        averageCost: 0.05,
        averageTime: 145.2,
        averageTurns: 12.4,
        totalRuns: 1,
        lastRun: new Date().toISOString()
      }
    ];

    return NextResponse.json(mockData);
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * Recursively collects the paths of all `.json` files below `dir`.
 *
 * Only regular files whose name ends in `.json` are returned; subdirectories
 * are descended into. A directory that cannot be read is skipped with a
 * warning instead of aborting the whole scan, and whatever was collected
 * before the failure is still returned.
 *
 * @param dir Directory to scan.
 * @returns Full paths (each built by joining `dir` with the entry names).
 */
function findBenchmarkFiles(dir: string): string[] {
  const files: string[] = [];

  try {
    for (const item of fs.readdirSync(dir, { withFileTypes: true })) {
      const fullPath = path.join(dir, item.name);

      if (item.isDirectory()) {
        // Append one by one: spreading a very large array into push() can
        // exceed the engine's argument limit and throw a RangeError.
        for (const nested of findBenchmarkFiles(fullPath)) {
          files.push(nested);
        }
      } else if (item.isFile() && item.name.endsWith('.json')) {
        files.push(fullPath);
      }
    }
  } catch (error) {
    // Best effort: keep going, but make the skipped directory visible.
    console.warn(`Skipping unreadable directory ${dir}:`, error);
  }

  return files;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Loads every benchmark result stored below `<cwd>/../benchmarks/results`.
 *
 * All `.json` files found by findBenchmarkFiles are parsed; a file is kept
 * only when it has `config`, `summary` and `exercises`. Files that cannot be
 * read or parsed are logged and skipped. A missing results directory yields
 * an empty list (with a warning).
 *
 * @returns The parsed benchmark results, in the order the files were found.
 */
async function loadAllBenchmarkResults(): Promise<BenchmarkResults[]> {
  const loaded: BenchmarkResults[] = [];
  const resultsDir = path.join(process.cwd(), '..', 'benchmarks', 'results');

  if (!fs.existsSync(resultsDir)) {
    console.warn('Benchmark results directory not found:', resultsDir);
    return loaded;
  }

  // Recursive scan, so both flat and nested result layouts are handled.
  findBenchmarkFiles(resultsDir).forEach((jsonPath) => {
    try {
      const candidate = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));

      // Only keep files that look like a benchmark result.
      const looksLikeBenchmark =
        candidate.config && candidate.summary && candidate.exercises;
      if (looksLikeBenchmark) {
        loaded.push(candidate);
      }
    } catch (error) {
      console.error(`Error loading result file ${jsonPath}:`, error);
    }
  });

  return loaded;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Collapses benchmark runs into one leaderboard entry per
 * model/provider/language combination.
 *
 * For each combination `totalRuns` counts every run seen, while the
 * performance figures (success rate, exercise count, averages, `lastRun`)
 * come from the run with the greatest `endTime`.
 *
 * @param results Parsed benchmark results; `config`, `summary` and `endTime` are read.
 * @returns One entry per combination, in first-seen order.
 */
function aggregateResults(results: BenchmarkResults[]): LeaderboardEntry[] {
  const entriesMap = new Map<string, LeaderboardEntry>();

  for (const result of results) {
    const { model, provider, language } = result.config;
    const { summary } = result;

    // Model names routinely contain dashes, so a dash-joined key could merge
    // distinct combinations ("a-b" + "c" vs "a" + "b-c"); a JSON-encoded
    // tuple cannot collide.
    const key = JSON.stringify([model, provider, language]);

    const performance = {
      successRate: summary.successRate * 100, // Convert from decimal to percentage
      totalExercises: summary.totalExercises,
      // A run with no exercises would otherwise produce NaN, which is
      // serialised as null in the JSON response.
      averageCost:
        summary.totalExercises > 0
          ? summary.totalCost / summary.totalExercises
          : 0,
      averageTime: summary.averageTime,
      averageTurns: summary.averageTurns,
    };

    const existing = entriesMap.get(key);

    if (!existing) {
      entriesMap.set(key, {
        model,
        provider,
        language,
        ...performance,
        totalRuns: 1,
        lastRun: result.endTime,
      });
      continue;
    }

    // Every run counts, but only the most recent one supplies the figures.
    existing.totalRuns += 1;

    if (result.endTime > existing.lastRun) {
      Object.assign(existing, performance, { lastRun: result.endTime });
    }
  }

  return Array.from(entriesMap.values());
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { NextRequest, NextResponse } from 'next/server';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import { BenchmarkResults } from '@/types/benchmark';
|
|
5
|
+
|
|
6
|
+
/**
 * Recursively collects the paths of all `.json` files below `dir`.
 *
 * Only regular files whose name ends in `.json` are returned; subdirectories
 * are descended into. A directory that cannot be read is skipped with a
 * warning instead of aborting the whole scan, and whatever was collected
 * before the failure is still returned.
 *
 * @param dir Directory to scan.
 * @returns Full paths (each built by joining `dir` with the entry names).
 */
function findBenchmarkFiles(dir: string): string[] {
  const files: string[] = [];

  try {
    for (const item of fs.readdirSync(dir, { withFileTypes: true })) {
      const fullPath = path.join(dir, item.name);

      if (item.isDirectory()) {
        // Append one by one: spreading a very large array into push() can
        // exceed the engine's argument limit and throw a RangeError.
        for (const nested of findBenchmarkFiles(fullPath)) {
          files.push(nested);
        }
      } else if (item.isFile() && item.name.endsWith('.json')) {
        files.push(fullPath);
      }
    }
  } catch (error) {
    // Best effort: keep going, but make the skipped directory visible.
    console.warn(`Skipping unreadable directory ${dir}:`, error);
  }

  return files;
}
|
|
30
|
+
|
|
31
|
+
/**
 * GET handler returning the benchmark detail for one
 * model/provider/language combination.
 *
 * Query parameters:
 *   - `model`, `provider`, `language` (required): compared for exact equality
 *     with the `config` of every result file.
 *   - `timestamp` (optional): `endTime` of one specific run to return.
 *
 * Responses:
 *   - 400 when a required parameter is missing.
 *   - 404 when the results directory does not exist, when no file matches the
 *     combination, or when no matching file has `exercises` and `summary`.
 *   - 200 with `{ latest, history, totalRuns }` otherwise.
 *   - 500 when an unexpected error occurs.
 */
export async function GET(request: NextRequest) {
  const { searchParams } = new URL(request.url);
  const model = searchParams.get('model');
  const provider = searchParams.get('provider');
  const language = searchParams.get('language');
  const timestamp = searchParams.get('timestamp'); // Optional parameter to get specific run

  if (!model || !provider || !language) {
    return NextResponse.json(
      { error: 'Missing required parameters: model, provider, language' },
      { status: 400 }
    );
  }

  try {
    // Look for benchmark result files in the results directory
    const resultsDir = path.join(process.cwd(), '..', 'benchmarks', 'results');

    if (!fs.existsSync(resultsDir)) {
      return NextResponse.json(
        { error: 'Results directory not found' },
        { status: 404 }
      );
    }

    // Single pass over all JSON files: each file is read and parsed once, and
    // is both matched against the requested combination and validated.
    let matchingFileCount = 0;
    const allResults: BenchmarkResults[] = [];

    for (const filePath of findBenchmarkFiles(resultsDir)) {
      try {
        const benchmarkData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
        const config = benchmarkData.config;

        // Exact match on all three parameters
        if (
          !config ||
          config.model !== model ||
          config.provider !== provider ||
          config.language !== language
        ) {
          continue;
        }

        matchingFileCount += 1;

        // Validate that we have the expected structure
        if (benchmarkData.exercises && benchmarkData.summary) {
          allResults.push(benchmarkData);
        }
      } catch (parseError) {
        console.error(`Error parsing file ${filePath}:`, parseError);
        // Continue with other files
      }
    }

    if (matchingFileCount === 0) {
      return NextResponse.json(
        { error: 'No benchmark results found for the specified model, provider, and language' },
        { status: 404 }
      );
    }

    if (allResults.length === 0) {
      return NextResponse.json(
        { error: 'No valid benchmark results found' },
        { status: 404 }
      );
    }

    // Sort results by endTime (most recent first)
    allResults.sort((a, b) => new Date(b.endTime).getTime() - new Date(a.endTime).getTime());

    // If timestamp is provided, return that specific run. When no run has
    // that exact endTime, fall through and answer with the latest run.
    if (timestamp) {
      const specificRun = allResults.find(result => result.endTime === timestamp);

      if (specificRun) {
        return NextResponse.json({
          latest: specificRun,
          history: [], // Don't need history for specific run view
          totalRuns: allResults.length
        });
      }
    }

    // Get the most recent result as the main data
    const latestResult = allResults[0];

    // Summarise every run, newest first. The latest run is included in the
    // history as well as being returned in full as `latest`.
    const historicalRuns = allResults.map(result => ({
      endTime: result.endTime,
      successRate: result.summary.successRate * 100, // Convert to percentage
      totalExercises: result.summary.totalExercises,
      totalCost: result.summary.totalCost,
      averageTime: result.summary.averageTime,
      averageTurns: result.summary.averageTurns,
      // Include commit info if available
      commitHash: result.commitHash || 'unknown',
      // Average cost per exercise; 0 for a run without exercises, because a
      // division by zero would yield NaN, which JSON serialises as null.
      averageCost:
        result.summary.totalExercises > 0
          ? result.summary.totalCost / result.summary.totalExercises
          : 0
    }));

    // Return both the latest detailed result and the historical summary
    return NextResponse.json({
      // Latest detailed benchmark data
      latest: latestResult,
      // Historical performance summary
      history: historicalRuns,
      // Total number of runs
      totalRuns: allResults.length
    });
  } catch (error) {
    console.error('Error reading benchmark detail:', error);
    return NextResponse.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
|