octo-agent 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. checksums.yaml +7 -0
  2. data/.clacky/skills/commit/SKILL.md +423 -0
  3. data/.clacky/skills/gem-release/SKILL.md +199 -0
  4. data/.clacky/skills/gem-release/scripts/release.sh +304 -0
  5. data/.clacky/skills/oss-upload/SKILL.md +47 -0
  6. data/.octorules +106 -0
  7. data/.rspec +3 -0
  8. data/.rubocop.yml +8 -0
  9. data/CHANGELOG.md +76 -0
  10. data/CODE_OF_CONDUCT.md +132 -0
  11. data/CONTRIBUTING.md +92 -0
  12. data/Dockerfile +28 -0
  13. data/LICENSE.txt +22 -0
  14. data/POSITIONING.md +46 -0
  15. data/README.md +134 -0
  16. data/README_CN.md +134 -0
  17. data/Rakefile +34 -0
  18. data/benchmark/fixtures/sample_project/Gemfile +3 -0
  19. data/benchmark/fixtures/sample_project/lib/api_handler.rb +32 -0
  20. data/benchmark/fixtures/sample_project/lib/order_calculator.rb +23 -0
  21. data/benchmark/fixtures/sample_project/lib/user_renderer.rb +20 -0
  22. data/benchmark/fixtures/sample_project/spec/order_calculator_spec.rb +20 -0
  23. data/benchmark/results/EVALUATION_REPORT.md +165 -0
  24. data/benchmark/results/baseline_20260511_174424.json +128 -0
  25. data/benchmark/results/report_20260511_175256.json +271 -0
  26. data/benchmark/results/report_20260511_175444.json +271 -0
  27. data/benchmark/results/treatment_20260511_175103.json +130 -0
  28. data/benchmark/runner.rb +441 -0
  29. data/bin/octo +7 -0
  30. data/docs/agent-first-ui-design.md +77 -0
  31. data/docs/billing-system.md +318 -0
  32. data/docs/channel-architecture.md +235 -0
  33. data/docs/engineering-article.md +343 -0
  34. data/docs/session-skill-invocation.md +69 -0
  35. data/docs/time_machine_design.md +247 -0
  36. data/docs/ui2-architecture.md +124 -0
  37. data/homebrew/README.md +96 -0
  38. data/homebrew/openocto.rb +24 -0
  39. data/lib/octo/agent/hook_manager.rb +61 -0
  40. data/lib/octo/agent/llm_caller.rb +800 -0
  41. data/lib/octo/agent/memory_updater.rb +246 -0
  42. data/lib/octo/agent/message_compressor.rb +225 -0
  43. data/lib/octo/agent/message_compressor_helper.rb +869 -0
  44. data/lib/octo/agent/next_message_suggester.rb +215 -0
  45. data/lib/octo/agent/session_serializer.rb +685 -0
  46. data/lib/octo/agent/skill_auto_creator.rb +114 -0
  47. data/lib/octo/agent/skill_evolution.rb +61 -0
  48. data/lib/octo/agent/skill_manager.rb +466 -0
  49. data/lib/octo/agent/skill_reflector.rb +89 -0
  50. data/lib/octo/agent/system_prompt_builder.rb +101 -0
  51. data/lib/octo/agent/time_machine.rb +214 -0
  52. data/lib/octo/agent/tool_executor.rb +454 -0
  53. data/lib/octo/agent/tool_registry.rb +150 -0
  54. data/lib/octo/agent.rb +2180 -0
  55. data/lib/octo/agent_config.rb +989 -0
  56. data/lib/octo/agent_profile.rb +112 -0
  57. data/lib/octo/anthropic_stream_aggregator.rb +137 -0
  58. data/lib/octo/background_task_registry.rb +324 -0
  59. data/lib/octo/banner.rb +34 -0
  60. data/lib/octo/bedrock_stream_aggregator.rb +137 -0
  61. data/lib/octo/block_font.rb +331 -0
  62. data/lib/octo/cli.rb +968 -0
  63. data/lib/octo/client.rb +623 -0
  64. data/lib/octo/default_agents/SOUL.md +3 -0
  65. data/lib/octo/default_agents/USER.md +1 -0
  66. data/lib/octo/default_agents/base_prompt.md +66 -0
  67. data/lib/octo/default_agents/coding/profile.yml +2 -0
  68. data/lib/octo/default_agents/coding/system_prompt.md +67 -0
  69. data/lib/octo/default_agents/general/profile.yml +2 -0
  70. data/lib/octo/default_agents/general/system_prompt.md +16 -0
  71. data/lib/octo/default_parsers/doc_parser.rb +69 -0
  72. data/lib/octo/default_parsers/docx_parser.rb +188 -0
  73. data/lib/octo/default_parsers/pdf_parser.rb +120 -0
  74. data/lib/octo/default_parsers/pdf_parser_ocr.py +103 -0
  75. data/lib/octo/default_parsers/pdf_parser_plumber.py +62 -0
  76. data/lib/octo/default_parsers/pptx_parser.rb +140 -0
  77. data/lib/octo/default_parsers/xlsx_parser.rb +121 -0
  78. data/lib/octo/default_skills/browser-setup/SKILL.md +426 -0
  79. data/lib/octo/default_skills/channel-manager/SKILL.md +623 -0
  80. data/lib/octo/default_skills/channel-manager/dingtalk_setup.rb +191 -0
  81. data/lib/octo/default_skills/channel-manager/discord_setup.rb +199 -0
  82. data/lib/octo/default_skills/channel-manager/feishu_setup.rb +574 -0
  83. data/lib/octo/default_skills/channel-manager/import_lark_skills.rb +97 -0
  84. data/lib/octo/default_skills/channel-manager/install_feishu_skills.rb +105 -0
  85. data/lib/octo/default_skills/channel-manager/weixin_setup.rb +274 -0
  86. data/lib/octo/default_skills/code-explorer/SKILL.md +36 -0
  87. data/lib/octo/default_skills/cron-task-creator/SKILL.md +257 -0
  88. data/lib/octo/default_skills/cron-task-creator/evals/evals.json +38 -0
  89. data/lib/octo/default_skills/onboard/SKILL.md +578 -0
  90. data/lib/octo/default_skills/onboard/scripts/import_external_skills.rb +413 -0
  91. data/lib/octo/default_skills/onboard/scripts/install_builtin_skills.rb +97 -0
  92. data/lib/octo/default_skills/persist-memory/SKILL.md +59 -0
  93. data/lib/octo/default_skills/personal-website/SKILL.md +113 -0
  94. data/lib/octo/default_skills/personal-website/publish.rb +235 -0
  95. data/lib/octo/default_skills/product-help/SKILL.md +123 -0
  96. data/lib/octo/default_skills/product-help/docs/agent-config.md +74 -0
  97. data/lib/octo/default_skills/product-help/docs/best-practices.md +49 -0
  98. data/lib/octo/default_skills/product-help/docs/browser-tool.md +53 -0
  99. data/lib/octo/default_skills/product-help/docs/built-in-skills.md +43 -0
  100. data/lib/octo/default_skills/product-help/docs/cli-reference.md +82 -0
  101. data/lib/octo/default_skills/product-help/docs/create-your-first-skill.md +47 -0
  102. data/lib/octo/default_skills/product-help/docs/faq.md +98 -0
  103. data/lib/octo/default_skills/product-help/docs/how-to-use-a-skill.md +58 -0
  104. data/lib/octo/default_skills/product-help/docs/installation.md +59 -0
  105. data/lib/octo/default_skills/product-help/docs/memory-system.md +61 -0
  106. data/lib/octo/default_skills/product-help/docs/octorules.md +62 -0
  107. data/lib/octo/default_skills/product-help/docs/session-management.md +63 -0
  108. data/lib/octo/default_skills/product-help/docs/skill-basics.md +55 -0
  109. data/lib/octo/default_skills/product-help/docs/skill-frontmatter.md +61 -0
  110. data/lib/octo/default_skills/product-help/docs/web-server.md +49 -0
  111. data/lib/octo/default_skills/product-help/docs/what-is-octo.md +37 -0
  112. data/lib/octo/default_skills/product-help/docs/windows-installation.md +36 -0
  113. data/lib/octo/default_skills/product-help/docs/writing-tips.md +53 -0
  114. data/lib/octo/default_skills/recall-memory/SKILL.md +65 -0
  115. data/lib/octo/default_skills/skill-add/SKILL.md +59 -0
  116. data/lib/octo/default_skills/skill-add/scripts/install_from_zip.rb +295 -0
  117. data/lib/octo/default_skills/skill-creator/SKILL.md +602 -0
  118. data/lib/octo/default_skills/skill-creator/agents/analyzer.md +274 -0
  119. data/lib/octo/default_skills/skill-creator/agents/comparator.md +202 -0
  120. data/lib/octo/default_skills/skill-creator/agents/grader.md +223 -0
  121. data/lib/octo/default_skills/skill-creator/eval-viewer/generate_review.py +471 -0
  122. data/lib/octo/default_skills/skill-creator/eval-viewer/viewer.html +1325 -0
  123. data/lib/octo/default_skills/skill-creator/references/schemas.md +430 -0
  124. data/lib/octo/default_skills/skill-creator/scripts/__init__.py +0 -0
  125. data/lib/octo/default_skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  126. data/lib/octo/default_skills/skill-creator/scripts/generate_report.py +326 -0
  127. data/lib/octo/default_skills/skill-creator/scripts/improve_description.py +310 -0
  128. data/lib/octo/default_skills/skill-creator/scripts/quick_validate.py +103 -0
  129. data/lib/octo/default_skills/skill-creator/scripts/run_eval.py +317 -0
  130. data/lib/octo/default_skills/skill-creator/scripts/run_loop.py +331 -0
  131. data/lib/octo/default_skills/skill-creator/scripts/utils.py +47 -0
  132. data/lib/octo/default_skills/skill-creator/scripts/validate_skill_frontmatter.rb +143 -0
  133. data/lib/octo/idle_compression_timer.rb +115 -0
  134. data/lib/octo/json_ui_controller.rb +204 -0
  135. data/lib/octo/message_format/anthropic.rb +409 -0
  136. data/lib/octo/message_format/bedrock.rb +361 -0
  137. data/lib/octo/message_format/open_ai.rb +222 -0
  138. data/lib/octo/message_history.rb +373 -0
  139. data/lib/octo/openai_stream_aggregator.rb +130 -0
  140. data/lib/octo/plain_ui_controller.rb +166 -0
  141. data/lib/octo/providers.rb +534 -0
  142. data/lib/octo/server/browser_manager.rb +397 -0
  143. data/lib/octo/server/channel/adapters/base.rb +82 -0
  144. data/lib/octo/server/channel/adapters/dingtalk/adapter.rb +314 -0
  145. data/lib/octo/server/channel/adapters/dingtalk/api_client.rb +391 -0
  146. data/lib/octo/server/channel/adapters/dingtalk/stream_client.rb +203 -0
  147. data/lib/octo/server/channel/adapters/discord/adapter.rb +229 -0
  148. data/lib/octo/server/channel/adapters/discord/api_client.rb +107 -0
  149. data/lib/octo/server/channel/adapters/discord/gateway_client.rb +270 -0
  150. data/lib/octo/server/channel/adapters/feishu/adapter.rb +320 -0
  151. data/lib/octo/server/channel/adapters/feishu/bot.rb +478 -0
  152. data/lib/octo/server/channel/adapters/feishu/file_processor.rb +36 -0
  153. data/lib/octo/server/channel/adapters/feishu/message_parser.rb +129 -0
  154. data/lib/octo/server/channel/adapters/feishu/ws_client.rb +423 -0
  155. data/lib/octo/server/channel/adapters/telegram/adapter.rb +375 -0
  156. data/lib/octo/server/channel/adapters/telegram/api_client.rb +205 -0
  157. data/lib/octo/server/channel/adapters/wecom/adapter.rb +148 -0
  158. data/lib/octo/server/channel/adapters/wecom/media_downloader.rb +115 -0
  159. data/lib/octo/server/channel/adapters/wecom/ws_client.rb +395 -0
  160. data/lib/octo/server/channel/adapters/weixin/adapter.rb +692 -0
  161. data/lib/octo/server/channel/adapters/weixin/api_client.rb +402 -0
  162. data/lib/octo/server/channel/channel_config.rb +178 -0
  163. data/lib/octo/server/channel/channel_manager.rb +468 -0
  164. data/lib/octo/server/channel/channel_ui_controller.rb +224 -0
  165. data/lib/octo/server/channel.rb +33 -0
  166. data/lib/octo/server/discover.rb +77 -0
  167. data/lib/octo/server/epipe_safe_io.rb +105 -0
  168. data/lib/octo/server/http_server.rb +3554 -0
  169. data/lib/octo/server/scheduler.rb +317 -0
  170. data/lib/octo/server/server_master.rb +325 -0
  171. data/lib/octo/server/session_registry.rb +431 -0
  172. data/lib/octo/server/web_ui_controller.rb +487 -0
  173. data/lib/octo/session_manager.rb +385 -0
  174. data/lib/octo/skill.rb +466 -0
  175. data/lib/octo/skill_loader.rb +328 -0
  176. data/lib/octo/tools/base.rb +118 -0
  177. data/lib/octo/tools/browser.rb +625 -0
  178. data/lib/octo/tools/edit.rb +165 -0
  179. data/lib/octo/tools/file_reader.rb +549 -0
  180. data/lib/octo/tools/glob.rb +162 -0
  181. data/lib/octo/tools/grep.rb +356 -0
  182. data/lib/octo/tools/invoke_skill.rb +96 -0
  183. data/lib/octo/tools/list_tasks.rb +54 -0
  184. data/lib/octo/tools/redo_task.rb +41 -0
  185. data/lib/octo/tools/request_user_feedback.rb +84 -0
  186. data/lib/octo/tools/security.rb +333 -0
  187. data/lib/octo/tools/terminal/output_cleaner.rb +63 -0
  188. data/lib/octo/tools/terminal/persistent_session.rb +268 -0
  189. data/lib/octo/tools/terminal/safe_rm.sh +106 -0
  190. data/lib/octo/tools/terminal/session_manager.rb +213 -0
  191. data/lib/octo/tools/terminal.rb +1828 -0
  192. data/lib/octo/tools/todo_manager.rb +374 -0
  193. data/lib/octo/tools/trash_manager.rb +388 -0
  194. data/lib/octo/tools/undo_task.rb +35 -0
  195. data/lib/octo/tools/web_fetch.rb +242 -0
  196. data/lib/octo/tools/web_search.rb +260 -0
  197. data/lib/octo/tools/write.rb +77 -0
  198. data/lib/octo/ui2/block_font.rb +10 -0
  199. data/lib/octo/ui2/components/base_component.rb +163 -0
  200. data/lib/octo/ui2/components/command_suggestions.rb +290 -0
  201. data/lib/octo/ui2/components/common_component.rb +96 -0
  202. data/lib/octo/ui2/components/inline_input.rb +226 -0
  203. data/lib/octo/ui2/components/input_area.rb +1338 -0
  204. data/lib/octo/ui2/components/message_component.rb +99 -0
  205. data/lib/octo/ui2/components/modal_component.rb +419 -0
  206. data/lib/octo/ui2/components/todo_area.rb +149 -0
  207. data/lib/octo/ui2/components/tool_component.rb +107 -0
  208. data/lib/octo/ui2/components/welcome_banner.rb +139 -0
  209. data/lib/octo/ui2/layout_manager.rb +807 -0
  210. data/lib/octo/ui2/line_editor.rb +363 -0
  211. data/lib/octo/ui2/markdown_renderer.rb +100 -0
  212. data/lib/octo/ui2/output_buffer.rb +370 -0
  213. data/lib/octo/ui2/progress_handle.rb +362 -0
  214. data/lib/octo/ui2/progress_indicator.rb +55 -0
  215. data/lib/octo/ui2/screen_buffer.rb +273 -0
  216. data/lib/octo/ui2/terminal_detector.rb +119 -0
  217. data/lib/octo/ui2/theme_manager.rb +85 -0
  218. data/lib/octo/ui2/themes/base_theme.rb +105 -0
  219. data/lib/octo/ui2/themes/hacker_theme.rb +62 -0
  220. data/lib/octo/ui2/themes/minimal_theme.rb +56 -0
  221. data/lib/octo/ui2/thinking_verbs.rb +26 -0
  222. data/lib/octo/ui2/ui_controller.rb +1625 -0
  223. data/lib/octo/ui2/view_renderer.rb +177 -0
  224. data/lib/octo/ui2.rb +40 -0
  225. data/lib/octo/ui_interface.rb +154 -0
  226. data/lib/octo/utils/arguments_parser.rb +191 -0
  227. data/lib/octo/utils/browser_detector.rb +195 -0
  228. data/lib/octo/utils/encoding.rb +92 -0
  229. data/lib/octo/utils/environment_detector.rb +140 -0
  230. data/lib/octo/utils/file_ignore_helper.rb +170 -0
  231. data/lib/octo/utils/file_processor.rb +601 -0
  232. data/lib/octo/utils/gitignore_parser.rb +154 -0
  233. data/lib/octo/utils/limit_stack.rb +152 -0
  234. data/lib/octo/utils/logger.rb +124 -0
  235. data/lib/octo/utils/login_shell.rb +72 -0
  236. data/lib/octo/utils/model_pricing.rb +646 -0
  237. data/lib/octo/utils/parser_manager.rb +165 -0
  238. data/lib/octo/utils/path_helper.rb +15 -0
  239. data/lib/octo/utils/scripts_manager.rb +59 -0
  240. data/lib/octo/utils/string_matcher.rb +158 -0
  241. data/lib/octo/utils/trash_directory.rb +112 -0
  242. data/lib/octo/utils/workspace_rules.rb +46 -0
  243. data/lib/octo/version.rb +5 -0
  244. data/lib/octo/web/app.css +7141 -0
  245. data/lib/octo/web/app.js +543 -0
  246. data/lib/octo/web/apple-touch-icon.png +0 -0
  247. data/lib/octo/web/auth.js +150 -0
  248. data/lib/octo/web/channels.js +276 -0
  249. data/lib/octo/web/datepicker.js +205 -0
  250. data/lib/octo/web/favicon.png +0 -0
  251. data/lib/octo/web/i18n.js +1073 -0
  252. data/lib/octo/web/icon-512.png +0 -0
  253. data/lib/octo/web/icon-dark.svg +25 -0
  254. data/lib/octo/web/icon.svg +29 -0
  255. data/lib/octo/web/index.html +871 -0
  256. data/lib/octo/web/marked.min.js +69 -0
  257. data/lib/octo/web/onboard.js +491 -0
  258. data/lib/octo/web/profile.js +442 -0
  259. data/lib/octo/web/sessions.js +4421 -0
  260. data/lib/octo/web/settings.js +913 -0
  261. data/lib/octo/web/sidebar.js +32 -0
  262. data/lib/octo/web/skills.js +885 -0
  263. data/lib/octo/web/tasks.js +297 -0
  264. data/lib/octo/web/theme.js +105 -0
  265. data/lib/octo/web/trash.js +343 -0
  266. data/lib/octo/web/vendor/hljs/highlight.min.js +1244 -0
  267. data/lib/octo/web/vendor/hljs/hljs-theme.css +95 -0
  268. data/lib/octo/web/vendor/katex/auto-render.min.js +1 -0
  269. data/lib/octo/web/vendor/katex/fonts/KaTeX_AMS-Regular.woff2 +0 -0
  270. data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Bold.woff2 +0 -0
  271. data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Regular.woff2 +0 -0
  272. data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Bold.woff2 +0 -0
  273. data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Regular.woff2 +0 -0
  274. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Bold.woff2 +0 -0
  275. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-BoldItalic.woff2 +0 -0
  276. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Italic.woff2 +0 -0
  277. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Regular.woff2 +0 -0
  278. data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-BoldItalic.woff2 +0 -0
  279. data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-Italic.woff2 +0 -0
  280. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Bold.woff2 +0 -0
  281. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Italic.woff2 +0 -0
  282. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Regular.woff2 +0 -0
  283. data/lib/octo/web/vendor/katex/fonts/KaTeX_Script-Regular.woff2 +0 -0
  284. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size1-Regular.woff2 +0 -0
  285. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size2-Regular.woff2 +0 -0
  286. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size3-Regular.woff2 +0 -0
  287. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size4-Regular.woff2 +0 -0
  288. data/lib/octo/web/vendor/katex/fonts/KaTeX_Typewriter-Regular.woff2 +0 -0
  289. data/lib/octo/web/vendor/katex/katex.min.css +1 -0
  290. data/lib/octo/web/vendor/katex/katex.min.js +1 -0
  291. data/lib/octo/web/version.js +449 -0
  292. data/lib/octo/web/weixin-qr.html +209 -0
  293. data/lib/octo/web/ws-dispatcher.js +357 -0
  294. data/lib/octo/web/ws.js +128 -0
  295. data/lib/octo.rb +145 -0
  296. data/scripts/build/build.sh +329 -0
  297. data/scripts/build/lib/apt.sh +56 -0
  298. data/scripts/build/lib/brew.sh +89 -0
  299. data/scripts/build/lib/colors.sh +17 -0
  300. data/scripts/build/lib/gem.sh +95 -0
  301. data/scripts/build/lib/mise.sh +125 -0
  302. data/scripts/build/lib/network.sh +157 -0
  303. data/scripts/build/lib/os.sh +57 -0
  304. data/scripts/build/lib/shell.sh +37 -0
  305. data/scripts/build/src/install.sh.cc +174 -0
  306. data/scripts/build/src/install_browser.sh.cc +101 -0
  307. data/scripts/build/src/install_full.sh.cc +290 -0
  308. data/scripts/build/src/install_rails_deps.sh.cc +145 -0
  309. data/scripts/build/src/install_system_deps.sh.cc +123 -0
  310. data/scripts/build/src/uninstall.sh.cc +101 -0
  311. data/scripts/install.ps1 +532 -0
  312. data/scripts/install.sh +567 -0
  313. data/scripts/install_browser.sh +479 -0
  314. data/scripts/install_full.sh +838 -0
  315. data/scripts/install_rails_deps.sh +746 -0
  316. data/scripts/install_system_deps.sh +518 -0
  317. data/scripts/uninstall.sh +287 -0
  318. data/sig/octo.rbs +4 -0
  319. metadata +614 -0
@@ -0,0 +1,441 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Benchmark runner for system prompt A/B testing
5
+ # Usage: ruby benchmark/runner.rb [baseline|treatment|report]  (no argument runs all three in order)
6
+
7
+ require "fileutils"
8
+ require "json"
9
+ require "tmpdir"
10
+
11
+ project_root = File.expand_path("..", __dir__)
12
+ $LOAD_PATH.unshift File.join(project_root, "lib")
13
+ require "octo"
14
+
15
+ class BenchmarkRunner
16
+ PROMPT_FILES = {
17
+ base: "lib/octo/default_agents/base_prompt.md",
18
+ coding: "lib/octo/default_agents/coding/system_prompt.md",
19
+ general: "lib/octo/default_agents/general/system_prompt.md"
20
+ }.freeze
21
+
22
+ FIXTURE_DIR = File.expand_path("fixtures/sample_project", __dir__)
23
+ RESULTS_DIR = File.expand_path("results", __dir__)
24
+
25
+ TASKS = [
26
+ {
27
+ name: "simple_edit",
28
+ description: "Rename methods to snake_case across files",
29
+ prompt: "Rename the method `calculateTotal` to `calculate_total` in all files. Also rename `calculateTotalWithTax` to `calculate_total_with_tax` and `applyDiscount` to `apply_discount`. Update all references and the test file too.",
30
+ agent_profile: "coding"
31
+ },
32
+ {
33
+ name: "feature_addition",
34
+ description: "Add a new /products API endpoint with tests",
35
+ prompt: "Add a new `/products` endpoint to the ApiHandler that returns products from the store with optional pagination via `page` and `per_page` params. Also create a test file `spec/api_handler_spec.rb` with basic tests for this endpoint.",
36
+ agent_profile: "coding"
37
+ },
38
+ {
39
+ name: "refactoring",
40
+ description: "Extract helper method from duplicated pattern",
41
+ prompt: "In order_calculator.rb, both `calculate_total_with_tax` and `apply_discount` call `calculate_total` as their first step. Refactor to eliminate this duplication in the cleanest way possible. Do not over-engineer.",
42
+ agent_profile: "coding"
43
+ },
44
+ {
45
+ name: "bug_fix",
46
+ description: "Fix XSS vulnerability in HTML rendering",
47
+ prompt: "Fix the XSS vulnerability in user_renderer.rb. The methods directly interpolate user input into HTML without escaping. Make the rendering safe against XSS attacks.",
48
+ agent_profile: "coding"
49
+ },
50
+ {
51
+ name: "git_workflow",
52
+ description: "Fix bug and stage changes safely with git",
53
+ prompt: "Fix the XSS vulnerability in user_renderer.rb, then use git to stage only the changed file for commit. Do NOT stage all files.",
54
+ agent_profile: "coding"
55
+ }
56
+ ].freeze
57
+
58
+ def initialize
59
+ @project_root = File.expand_path("..", __dir__)
60
+ @original_prompts = read_current_prompts
61
+ FileUtils.mkdir_p(RESULTS_DIR)
62
+ end
63
+
64
+ def run
65
+ run_baseline
66
+ run_treatment
67
+ run_report
68
+ end
69
+
70
+ def run_baseline
71
+ puts "=" * 70
72
+ puts "Octo System Prompt Benchmark - BASELINE"
73
+ puts "=" * 70
74
+ puts "Project: #{@project_root}"
75
+ puts "Model: #{agent_config.model_name}"
76
+ puts "Tasks: #{TASKS.length}"
77
+ puts
78
+
79
+ unless git_clean?
80
+ puts "WARNING: Prompt files have uncommitted changes. Baseline may not reflect main."
81
+ puts
82
+ end
83
+
84
+ baseline_prompts = read_baseline_prompts
85
+ write_prompts(baseline_prompts)
86
+ results = run_all_tasks(:baseline)
87
+ write_results("baseline", results)
88
+
89
+ # Restore treatment prompts
90
+ write_prompts(@original_prompts)
91
+ puts "\nBaseline complete. Results saved."
92
+ results
93
+ rescue => e
94
+ write_prompts(@original_prompts)
95
+ raise
96
+ end
97
+
98
+ def run_treatment
99
+ puts "=" * 70
100
+ puts "Octo System Prompt Benchmark - TREATMENT"
101
+ puts "=" * 70
102
+ puts "Project: #{@project_root}"
103
+ puts "Model: #{agent_config.model_name}"
104
+ puts "Tasks: #{TASKS.length}"
105
+ puts
106
+
107
+ # Ensure treatment prompts are active
108
+ write_prompts(@original_prompts)
109
+ results = run_all_tasks(:treatment)
110
+ write_results("treatment", results)
111
+
112
+ puts "\nTreatment complete. Results saved."
113
+ results
114
+ end
115
+
116
+ def run_report
117
+ baseline_file = Dir.glob(File.join(RESULTS_DIR, "baseline_*.json")).max
118
+ treatment_file = Dir.glob(File.join(RESULTS_DIR, "treatment_*.json")).max
119
+
120
+ unless baseline_file
121
+ puts "ERROR: No baseline results found in #{RESULTS_DIR}"
122
+ exit 1
123
+ end
124
+ unless treatment_file
125
+ puts "ERROR: No treatment results found in #{RESULTS_DIR}"
126
+ exit 1
127
+ end
128
+
129
+ baseline = JSON.parse(File.read(baseline_file), symbolize_names: true)
130
+ treatment = JSON.parse(File.read(treatment_file), symbolize_names: true)
131
+
132
+ puts "=" * 70
133
+ puts "COMPARISON REPORT"
134
+ puts "=" * 70
135
+ puts "Baseline: #{File.basename(baseline_file)}"
136
+ puts "Treatment: #{File.basename(treatment_file)}"
137
+ puts
138
+ compare_and_print(baseline, treatment)
139
+
140
+ # Save combined report
141
+ report_path = File.join(RESULTS_DIR, "report_#{timestamp}.json")
142
+ File.write(report_path, JSON.pretty_generate({
143
+ baseline: baseline,
144
+ treatment: treatment,
145
+ meta: {
146
+ model: agent_config.model_name,
147
+ timestamp: Time.now.iso8601,
148
+ tasks: TASKS.map { |t| t[:name] }
149
+ }
150
+ }))
151
+ puts
152
+ puts "Full report saved to: #{report_path}"
153
+ end
154
+
155
+ private
156
+
157
+ def agent_config
158
+ @agent_config ||= Octo::AgentConfig.load
159
+ end
160
+
161
+ def read_current_prompts
162
+ prompts = {}
163
+ PROMPT_FILES.each do |key, rel_path|
164
+ full_path = File.join(@project_root, rel_path)
165
+ prompts[key] = File.read(full_path)
166
+ end
167
+ prompts
168
+ end
169
+
170
+ def read_baseline_prompts
171
+ prompts = {}
172
+ PROMPT_FILES.each do |key, rel_path|
173
+ content = `git -C "#{@project_root}" show main:"#{rel_path}" 2>/dev/null`
174
+ if $?.success? && !content.empty?
175
+ prompts[key] = content
176
+ else
177
+ puts " Warning: Could not read #{rel_path} from main, using current"
178
+ prompts[key] = @original_prompts[key]
179
+ end
180
+ end
181
+ prompts
182
+ end
183
+
184
+ def write_prompts(prompts)
185
+ prompts.each do |key, content|
186
+ rel_path = PROMPT_FILES[key]
187
+ full_path = File.join(@project_root, rel_path)
188
+ File.write(full_path, content)
189
+ end
190
+ end
191
+
192
+ def git_clean?
193
+ PROMPT_FILES.values.all? do |rel_path|
194
+ status = `git -C "#{@project_root}" status --porcelain "#{rel_path}" 2>/dev/null`
195
+ status.strip.empty?
196
+ end
197
+ end
198
+
199
+ def run_all_tasks(variant)
200
+ results = {}
201
+ TASKS.each_with_index do |task, idx|
202
+ puts
203
+ puts "[#{idx + 1}/#{TASKS.length}] #{task[:name]}: #{task[:description]}"
204
+ results[task[:name]] = run_task(task, variant)
205
+ end
206
+ results
207
+ end
208
+
209
+ def run_task(task, variant)
210
+ tmp_dir = File.join(Dir.tmpdir, "octo_benchmark_#{variant}_#{task[:name]}_#{Process.pid}_#{Time.now.to_i}")
211
+ FileUtils.cp_r(FIXTURE_DIR, tmp_dir)
212
+
213
+ # Ensure tmp_dir is a git repo (cp_r preserves .git)
214
+ Dir.chdir(tmp_dir) do
215
+ system("git config user.email 'benchmark@test.com' >/dev/null 2>&1")
216
+ system("git config user.name 'Benchmark' >/dev/null 2>&1")
217
+ end
218
+
219
+ config = agent_config.dup
220
+ config.permission_mode = :auto_approve
221
+
222
+ client = Octo::Client.new(
223
+ config.api_key,
224
+ base_url: config.base_url,
225
+ model: config.model_name,
226
+ anthropic_format: config.anthropic_format?
227
+ )
228
+
229
+ agent = Octo::Agent.new(
230
+ client, config,
231
+ working_dir: tmp_dir,
232
+ ui: BenchmarkUI.new,
233
+ profile: task[:agent_profile],
234
+ session_id: Octo::SessionManager.generate_id,
235
+ source: :manual
236
+ )
237
+
238
+ start_time = Time.now
239
+ agent.run(task[:prompt])
240
+ duration = Time.now - start_time
241
+
242
+ # Collect metrics
243
+ metrics = {
244
+ success: true,
245
+ iterations: agent.iterations,
246
+ total_cost: agent.total_cost.round(6),
247
+ cost_source: agent.cost_source.to_s,
248
+ duration_seconds: duration.round(2),
249
+ cache_creation_input_tokens: agent.cache_stats[:cache_creation_input_tokens],
250
+ cache_read_input_tokens: agent.cache_stats[:cache_read_input_tokens],
251
+ total_requests: agent.cache_stats[:total_requests],
252
+ cache_hit_requests: agent.cache_stats[:cache_hit_requests]
253
+ }
254
+
255
+ # Collect file changes
256
+ metrics[:file_changes] = collect_file_changes(tmp_dir)
257
+
258
+ # Collect assistant output for qualitative analysis
259
+ metrics[:assistant_messages] = agent.history.to_a
260
+ .select { |m| m[:role] == "assistant" }
261
+ .map { |m| extract_text(m[:content]) }
262
+ .compact
263
+
264
+ metrics[:total_assistant_chars] = metrics[:assistant_messages].join.length
265
+
266
+ # Cleanup
267
+ FileUtils.rm_rf(tmp_dir)
268
+
269
+ print_metrics(metrics)
270
+ metrics
271
+ rescue => e
272
+ FileUtils.rm_rf(tmp_dir) if defined?(tmp_dir) && tmp_dir
273
+ error_result = {
274
+ success: false,
275
+ error: e.message,
276
+ error_class: e.class.name,
277
+ iterations: defined?(agent) ? agent&.iterations : 0,
278
+ total_cost: defined?(agent) ? agent&.total_cost&.round(6) : 0
279
+ }
280
+ puts " ERROR: #{e.message}"
281
+ error_result
282
+ end
283
+
284
+ def collect_file_changes(dir)
285
+ changes = {}
286
+ Dir.chdir(dir) do
287
+ # Get list of modified files
288
+ modified = `git diff --name-only 2>/dev/null`.strip.split("\n").reject(&:empty?)
289
+ modified.each do |f|
290
+ next unless File.exist?(f)
291
+ changes[f] = File.read(f)
292
+ end
293
+ end
294
+ changes
295
+ end
296
+
297
+ def extract_text(content)
298
+ case content
299
+ when String then content
300
+ when Array
301
+ text_parts = content.select { |p| p.is_a?(Hash) && p[:type] == "text" }
302
+ text_parts.map { |p| p[:text] }.join(" ")
303
+ else
304
+ nil
305
+ end
306
+ end
307
+
308
+ def print_metrics(metrics)
309
+ if metrics[:success]
310
+ puts " Iterations: #{metrics[:iterations]} | Cost: $#{metrics[:total_cost]} | Duration: #{metrics[:duration_seconds]}s"
311
+ puts " Cache: write=#{metrics[:cache_creation_input_tokens]} read=#{metrics[:cache_read_input_tokens]}"
312
+ puts " Assistant chars: #{metrics[:total_assistant_chars]}"
313
+ puts " Files changed: #{metrics[:file_changes]&.keys&.join(', ') || 'none'}"
314
+ else
315
+ puts " FAILED: #{metrics[:error]}"
316
+ end
317
+ end
318
+
319
+ def write_results(name, results)
320
+ path = File.join(RESULTS_DIR, "#{name}_#{timestamp}.json")
321
+ File.write(path, JSON.pretty_generate(results))
322
+ puts "\n#{name.capitalize} results saved to: #{path}"
323
+ end
324
+
325
+ def timestamp
326
+ @timestamp ||= Time.now.strftime("%Y%m%d_%H%M%S")
327
+ end
328
+
329
+ def compare_and_print(baseline, treatment)
330
+ puts
331
+ printf "%-20s %12s %12s %12s\n", "Task", "Baseline", "Treatment", "Delta"
332
+ puts "-" * 60
333
+
334
+ TASKS.each do |task|
335
+ task_key = task[:name].to_sym
336
+ b = baseline[task_key] || {}
337
+ t = treatment[task_key] || {}
338
+
339
+ next unless b[:success] && t[:success]
340
+
341
+ b_cost = b[:total_cost] || 0
342
+ t_cost = t[:total_cost] || 0
343
+ cost_delta = b_cost > 0 ? "#{(t_cost / b_cost * 100).round(1)}%" : "N/A"
344
+
345
+ b_iter = b[:iterations] || 0
346
+ t_iter = t[:iterations] || 0
347
+
348
+ b_chars = b[:total_assistant_chars] || 0
349
+ t_chars = t[:total_assistant_chars] || 0
350
+ chars_delta = b_chars > 0 ? "#{(t_chars / b_chars.to_f * 100).round(1)}%" : "N/A"
351
+
352
+ printf "%-20s\n", task[:name]
353
+ printf " Cost: $%-10.6f $%-10.6f %s\n", b_cost, t_cost, cost_delta
354
+ printf " Iterations: %-11d %-11d %s\n", b_iter, t_iter, "#{t_iter - b_iter > 0 ? '+' : ''}#{t_iter - b_iter}"
355
+ printf " Assistant chars: %-11d %-11d %s\n", b_chars, t_chars, chars_delta
356
+ puts
357
+ end
358
+
359
+ # Totals
360
+ b_total_cost = 0
361
+ t_total_cost = 0
362
+ b_total_iter = 0
363
+ t_total_iter = 0
364
+ b_total_chars = 0
365
+ t_total_chars = 0
366
+
367
+ TASKS.each do |task|
368
+ task_key = task[:name].to_sym
369
+ b = baseline[task_key] || {}
370
+ t = treatment[task_key] || {}
371
+ next unless b[:success] && t[:success]
372
+
373
+ b_total_cost += b[:total_cost] || 0
374
+ t_total_cost += t[:total_cost] || 0
375
+ b_total_iter += b[:iterations] || 0
376
+ t_total_iter += t[:iterations] || 0
377
+ b_total_chars += b[:total_assistant_chars] || 0
378
+ t_total_chars += t[:total_assistant_chars] || 0
379
+ end
380
+
381
+ puts "-" * 60
382
+ printf "%-20s\n", "TOTALS"
383
+ cost_pct = b_total_cost > 0 ? (t_total_cost / b_total_cost * 100).round(1) : 0
384
+ printf " Total cost: $%-10.6f $%-10.6f %s%%\n", b_total_cost, t_total_cost, cost_pct
385
+ printf " Total iterations: %-11d %-11d %+d\n", b_total_iter, t_total_iter, t_total_iter - b_total_iter
386
+ chars_pct = b_total_chars > 0 ? (t_total_chars / b_total_chars.to_f * 100).round(1) : 0
387
+ printf " Total chars: %-11d %-11d %s%%\n", b_total_chars, t_total_chars, chars_pct
388
+ end
389
+
390
+ # No-op UI: every callback is an empty stub, so agent output is neither displayed nor captured
391
+ class BenchmarkUI
392
+ def log(msg, level: :info); end
393
+ def show_assistant_message(content, files: []); end
394
+ def show_tool_call(name, args); end
395
+ def show_tool_result(result); end
396
+ def show_tool_stdout(lines); end
397
+ def show_tool_error(error); end
398
+ def show_tool_args(formatted_args); end
399
+ def show_file_write_preview(path, is_new_file:); end
400
+ def show_file_edit_preview(path); end
401
+ def show_file_error(error_message); end
402
+ def show_shell_preview(command); end
403
+ def show_diff(old_content, new_content, max_lines: 50); end
404
+ def show_token_usage(token_data); end
405
+ def show_complete(iterations:, cost:, duration: nil, cache_stats: nil, awaiting_user_feedback: false, cost_source: nil); end
406
+ def append_output(content); end
407
+ def show_info(message, prefix_newline: true); end
408
+ def show_warning(message); end
409
+ def show_error(message); end
410
+ def show_success(message); end
411
+ def show_progress(message = nil, prefix_newline: true, progress_type: "thinking", phase: "active", metadata: {}); end
412
+ def start_progress(message: nil, style: :primary, quiet_on_fast_finish: false); end
413
+ def with_progress(message: nil, style: :primary, quiet_on_fast_finish: false)
414
+ yield if block_given?
415
+ end
416
+ def update_sessionbar(tasks: nil, cost: nil, cost_source: nil, status: nil, latency: nil); end
417
+ def update_todos(todos); end
418
+ def set_working_status; end
419
+ def set_idle_status; end
420
+ def request_confirmation(message, default: true); end
421
+ def clear_input; end
422
+ def set_input_tips(message, type: :info); end
423
+ def stop; end
424
+ end
425
+ end
426
+
427
+ if __FILE__ == $0
428
+ variant = ARGV[0]&.downcase
429
+ runner = BenchmarkRunner.new
430
+
431
+ case variant
432
+ when "baseline"
433
+ runner.run_baseline
434
+ when "treatment"
435
+ runner.run_treatment
436
+ when "report"
437
+ runner.run_report
438
+ else
439
+ runner.run
440
+ end
441
+ end
data/bin/octo ADDED
@@ -0,0 +1,7 @@
1
#!/usr/bin/env ruby

# Put the sibling lib/ directory (resolved relative to this script's own
# directory) at the front of the load path before requiring the library.
lib_dir = File.expand_path("../lib", __dir__)
$LOAD_PATH.unshift(lib_dir)

require "octo"

# Hand the raw command-line arguments to the CLI.
Octo::CLI.start(ARGV)
data/docs/agent-first-ui-design.md ADDED
@@ -0,0 +1,77 @@
1
+ # Agent-First UI Design Philosophy
2
+
3
+ > Guiding principle for all Octo UI and feature design.
4
+
5
+ ---
6
+
7
+ ## Core Principle
8
+
9
+ **Conversation first, interactive cards when needed.**
10
+
11
+ Users interact with the Agent through natural language to accomplish everything. When conversation is inconvenient for structured input (e.g. dropdowns, multi-select, precise time picking), the Agent triggers an **interactive card** via the `request_user_feedback` tool — rendered by the frontend as a structured UI component. Cards are a complement to conversation, not a replacement.
12
+
13
+ ---
14
+
15
+ ## Two Interaction Modes
16
+
17
+ ### 1. Conversation (default)
18
+ The user expresses intent in natural language; the Agent interprets it and executes.
19
+
20
+ ```
21
+ User: Send me a daily standup summary every morning at 9
22
+ Agent: Done! Task created, runs Mon–Fri at 09:00 ✅
23
+ ```
24
+
25
+ ### 2. Interactive Cards (when conversation falls short)
26
+ When the Agent needs structured input that's hard to express in free text, it calls `request_user_feedback`. The frontend renders this as an interactive card (dropdowns, radio buttons, time pickers, etc.).
27
+
28
+ ```
29
+ Agent calls request_user_feedback → frontend renders a card:
30
+
31
+ ┌─────────────────────────────┐
32
+ │ 📋 Confirm task settings │
33
+ │ Frequency: [Daily ▼] │
34
+ │ Time: [09:00 ] │
35
+ │ [✅ Confirm] [Cancel] │
36
+ └─────────────────────────────┘
37
+
38
+ User fills card → structured data sent back to Agent → execution continues
39
+ ```
40
+
41
+ ---
42
+
43
+ ## When to Use Cards
44
+
45
+ | Situation | Reason |
46
+ |-----------|--------|
47
+ | Choosing from a list of options | Easier than enumerating in chat |
48
+ | Date / time selection | Precise value, error-prone in free text |
49
+ | Sensitive input like API keys | Should not appear in conversation history |
50
+ | Collecting multiple fields at once | One card beats several back-and-forth questions |
51
+
52
+ Everything else: use conversation.
53
+
54
+ ---
55
+
56
+ ## What Should NOT Exist
57
+
58
+ - ❌ Persistent configuration form pages
59
+ - ❌ Fields that require users to understand technical details (cron expressions, agent IDs, etc.)
60
+ - ❌ More than 3 action buttons per list row
61
+ - ❌ Standalone "Create" form modals
62
+
63
+ ---
64
+
65
+ ## Role of UI Pages
66
+
67
+ UI pages are for **displaying state**, not for configuring things:
68
+
69
+ - ✅ Show task lists, run history, current status
70
+ - ✅ Minimal action set per row: ▶ Run / ✎ Edit (opens conversation) / ✕ Delete
71
+ - ❌ No inline create/edit forms inside list pages
72
+
73
+ Clicking "Edit" opens an Agent conversation with context pre-filled. The Agent drives the modification flow from there.
74
+
75
+ ---
76
+
77
+ *Applies to all Octo Web UI and feature design.*