octo-agent 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. checksums.yaml +7 -0
  2. data/.clacky/skills/commit/SKILL.md +423 -0
  3. data/.clacky/skills/gem-release/SKILL.md +199 -0
  4. data/.clacky/skills/gem-release/scripts/release.sh +304 -0
  5. data/.clacky/skills/oss-upload/SKILL.md +47 -0
  6. data/.octorules +106 -0
  7. data/.rspec +3 -0
  8. data/.rubocop.yml +8 -0
  9. data/CHANGELOG.md +76 -0
  10. data/CODE_OF_CONDUCT.md +132 -0
  11. data/CONTRIBUTING.md +92 -0
  12. data/Dockerfile +28 -0
  13. data/LICENSE.txt +22 -0
  14. data/POSITIONING.md +46 -0
  15. data/README.md +134 -0
  16. data/README_CN.md +134 -0
  17. data/Rakefile +34 -0
  18. data/benchmark/fixtures/sample_project/Gemfile +3 -0
  19. data/benchmark/fixtures/sample_project/lib/api_handler.rb +32 -0
  20. data/benchmark/fixtures/sample_project/lib/order_calculator.rb +23 -0
  21. data/benchmark/fixtures/sample_project/lib/user_renderer.rb +20 -0
  22. data/benchmark/fixtures/sample_project/spec/order_calculator_spec.rb +20 -0
  23. data/benchmark/results/EVALUATION_REPORT.md +165 -0
  24. data/benchmark/results/baseline_20260511_174424.json +128 -0
  25. data/benchmark/results/report_20260511_175256.json +271 -0
  26. data/benchmark/results/report_20260511_175444.json +271 -0
  27. data/benchmark/results/treatment_20260511_175103.json +130 -0
  28. data/benchmark/runner.rb +441 -0
  29. data/bin/octo +7 -0
  30. data/docs/agent-first-ui-design.md +77 -0
  31. data/docs/billing-system.md +318 -0
  32. data/docs/channel-architecture.md +235 -0
  33. data/docs/engineering-article.md +343 -0
  34. data/docs/session-skill-invocation.md +69 -0
  35. data/docs/time_machine_design.md +247 -0
  36. data/docs/ui2-architecture.md +124 -0
  37. data/homebrew/README.md +96 -0
  38. data/homebrew/openocto.rb +24 -0
  39. data/lib/octo/agent/hook_manager.rb +61 -0
  40. data/lib/octo/agent/llm_caller.rb +800 -0
  41. data/lib/octo/agent/memory_updater.rb +246 -0
  42. data/lib/octo/agent/message_compressor.rb +225 -0
  43. data/lib/octo/agent/message_compressor_helper.rb +869 -0
  44. data/lib/octo/agent/next_message_suggester.rb +215 -0
  45. data/lib/octo/agent/session_serializer.rb +685 -0
  46. data/lib/octo/agent/skill_auto_creator.rb +114 -0
  47. data/lib/octo/agent/skill_evolution.rb +61 -0
  48. data/lib/octo/agent/skill_manager.rb +466 -0
  49. data/lib/octo/agent/skill_reflector.rb +89 -0
  50. data/lib/octo/agent/system_prompt_builder.rb +101 -0
  51. data/lib/octo/agent/time_machine.rb +214 -0
  52. data/lib/octo/agent/tool_executor.rb +454 -0
  53. data/lib/octo/agent/tool_registry.rb +150 -0
  54. data/lib/octo/agent.rb +2180 -0
  55. data/lib/octo/agent_config.rb +989 -0
  56. data/lib/octo/agent_profile.rb +112 -0
  57. data/lib/octo/anthropic_stream_aggregator.rb +137 -0
  58. data/lib/octo/background_task_registry.rb +324 -0
  59. data/lib/octo/banner.rb +34 -0
  60. data/lib/octo/bedrock_stream_aggregator.rb +137 -0
  61. data/lib/octo/block_font.rb +331 -0
  62. data/lib/octo/cli.rb +968 -0
  63. data/lib/octo/client.rb +623 -0
  64. data/lib/octo/default_agents/SOUL.md +3 -0
  65. data/lib/octo/default_agents/USER.md +1 -0
  66. data/lib/octo/default_agents/base_prompt.md +66 -0
  67. data/lib/octo/default_agents/coding/profile.yml +2 -0
  68. data/lib/octo/default_agents/coding/system_prompt.md +67 -0
  69. data/lib/octo/default_agents/general/profile.yml +2 -0
  70. data/lib/octo/default_agents/general/system_prompt.md +16 -0
  71. data/lib/octo/default_parsers/doc_parser.rb +69 -0
  72. data/lib/octo/default_parsers/docx_parser.rb +188 -0
  73. data/lib/octo/default_parsers/pdf_parser.rb +120 -0
  74. data/lib/octo/default_parsers/pdf_parser_ocr.py +103 -0
  75. data/lib/octo/default_parsers/pdf_parser_plumber.py +62 -0
  76. data/lib/octo/default_parsers/pptx_parser.rb +140 -0
  77. data/lib/octo/default_parsers/xlsx_parser.rb +121 -0
  78. data/lib/octo/default_skills/browser-setup/SKILL.md +426 -0
  79. data/lib/octo/default_skills/channel-manager/SKILL.md +623 -0
  80. data/lib/octo/default_skills/channel-manager/dingtalk_setup.rb +191 -0
  81. data/lib/octo/default_skills/channel-manager/discord_setup.rb +199 -0
  82. data/lib/octo/default_skills/channel-manager/feishu_setup.rb +574 -0
  83. data/lib/octo/default_skills/channel-manager/import_lark_skills.rb +97 -0
  84. data/lib/octo/default_skills/channel-manager/install_feishu_skills.rb +105 -0
  85. data/lib/octo/default_skills/channel-manager/weixin_setup.rb +274 -0
  86. data/lib/octo/default_skills/code-explorer/SKILL.md +36 -0
  87. data/lib/octo/default_skills/cron-task-creator/SKILL.md +257 -0
  88. data/lib/octo/default_skills/cron-task-creator/evals/evals.json +38 -0
  89. data/lib/octo/default_skills/onboard/SKILL.md +578 -0
  90. data/lib/octo/default_skills/onboard/scripts/import_external_skills.rb +413 -0
  91. data/lib/octo/default_skills/onboard/scripts/install_builtin_skills.rb +97 -0
  92. data/lib/octo/default_skills/persist-memory/SKILL.md +59 -0
  93. data/lib/octo/default_skills/personal-website/SKILL.md +113 -0
  94. data/lib/octo/default_skills/personal-website/publish.rb +235 -0
  95. data/lib/octo/default_skills/product-help/SKILL.md +123 -0
  96. data/lib/octo/default_skills/product-help/docs/agent-config.md +74 -0
  97. data/lib/octo/default_skills/product-help/docs/best-practices.md +49 -0
  98. data/lib/octo/default_skills/product-help/docs/browser-tool.md +53 -0
  99. data/lib/octo/default_skills/product-help/docs/built-in-skills.md +43 -0
  100. data/lib/octo/default_skills/product-help/docs/cli-reference.md +82 -0
  101. data/lib/octo/default_skills/product-help/docs/create-your-first-skill.md +47 -0
  102. data/lib/octo/default_skills/product-help/docs/faq.md +98 -0
  103. data/lib/octo/default_skills/product-help/docs/how-to-use-a-skill.md +58 -0
  104. data/lib/octo/default_skills/product-help/docs/installation.md +59 -0
  105. data/lib/octo/default_skills/product-help/docs/memory-system.md +61 -0
  106. data/lib/octo/default_skills/product-help/docs/octorules.md +62 -0
  107. data/lib/octo/default_skills/product-help/docs/session-management.md +63 -0
  108. data/lib/octo/default_skills/product-help/docs/skill-basics.md +55 -0
  109. data/lib/octo/default_skills/product-help/docs/skill-frontmatter.md +61 -0
  110. data/lib/octo/default_skills/product-help/docs/web-server.md +49 -0
  111. data/lib/octo/default_skills/product-help/docs/what-is-octo.md +37 -0
  112. data/lib/octo/default_skills/product-help/docs/windows-installation.md +36 -0
  113. data/lib/octo/default_skills/product-help/docs/writing-tips.md +53 -0
  114. data/lib/octo/default_skills/recall-memory/SKILL.md +65 -0
  115. data/lib/octo/default_skills/skill-add/SKILL.md +59 -0
  116. data/lib/octo/default_skills/skill-add/scripts/install_from_zip.rb +295 -0
  117. data/lib/octo/default_skills/skill-creator/SKILL.md +602 -0
  118. data/lib/octo/default_skills/skill-creator/agents/analyzer.md +274 -0
  119. data/lib/octo/default_skills/skill-creator/agents/comparator.md +202 -0
  120. data/lib/octo/default_skills/skill-creator/agents/grader.md +223 -0
  121. data/lib/octo/default_skills/skill-creator/eval-viewer/generate_review.py +471 -0
  122. data/lib/octo/default_skills/skill-creator/eval-viewer/viewer.html +1325 -0
  123. data/lib/octo/default_skills/skill-creator/references/schemas.md +430 -0
  124. data/lib/octo/default_skills/skill-creator/scripts/__init__.py +0 -0
  125. data/lib/octo/default_skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  126. data/lib/octo/default_skills/skill-creator/scripts/generate_report.py +326 -0
  127. data/lib/octo/default_skills/skill-creator/scripts/improve_description.py +310 -0
  128. data/lib/octo/default_skills/skill-creator/scripts/quick_validate.py +103 -0
  129. data/lib/octo/default_skills/skill-creator/scripts/run_eval.py +317 -0
  130. data/lib/octo/default_skills/skill-creator/scripts/run_loop.py +331 -0
  131. data/lib/octo/default_skills/skill-creator/scripts/utils.py +47 -0
  132. data/lib/octo/default_skills/skill-creator/scripts/validate_skill_frontmatter.rb +143 -0
  133. data/lib/octo/idle_compression_timer.rb +115 -0
  134. data/lib/octo/json_ui_controller.rb +204 -0
  135. data/lib/octo/message_format/anthropic.rb +409 -0
  136. data/lib/octo/message_format/bedrock.rb +361 -0
  137. data/lib/octo/message_format/open_ai.rb +222 -0
  138. data/lib/octo/message_history.rb +373 -0
  139. data/lib/octo/openai_stream_aggregator.rb +130 -0
  140. data/lib/octo/plain_ui_controller.rb +166 -0
  141. data/lib/octo/providers.rb +534 -0
  142. data/lib/octo/server/browser_manager.rb +397 -0
  143. data/lib/octo/server/channel/adapters/base.rb +82 -0
  144. data/lib/octo/server/channel/adapters/dingtalk/adapter.rb +314 -0
  145. data/lib/octo/server/channel/adapters/dingtalk/api_client.rb +391 -0
  146. data/lib/octo/server/channel/adapters/dingtalk/stream_client.rb +203 -0
  147. data/lib/octo/server/channel/adapters/discord/adapter.rb +229 -0
  148. data/lib/octo/server/channel/adapters/discord/api_client.rb +107 -0
  149. data/lib/octo/server/channel/adapters/discord/gateway_client.rb +270 -0
  150. data/lib/octo/server/channel/adapters/feishu/adapter.rb +320 -0
  151. data/lib/octo/server/channel/adapters/feishu/bot.rb +478 -0
  152. data/lib/octo/server/channel/adapters/feishu/file_processor.rb +36 -0
  153. data/lib/octo/server/channel/adapters/feishu/message_parser.rb +129 -0
  154. data/lib/octo/server/channel/adapters/feishu/ws_client.rb +423 -0
  155. data/lib/octo/server/channel/adapters/telegram/adapter.rb +375 -0
  156. data/lib/octo/server/channel/adapters/telegram/api_client.rb +205 -0
  157. data/lib/octo/server/channel/adapters/wecom/adapter.rb +148 -0
  158. data/lib/octo/server/channel/adapters/wecom/media_downloader.rb +115 -0
  159. data/lib/octo/server/channel/adapters/wecom/ws_client.rb +395 -0
  160. data/lib/octo/server/channel/adapters/weixin/adapter.rb +692 -0
  161. data/lib/octo/server/channel/adapters/weixin/api_client.rb +402 -0
  162. data/lib/octo/server/channel/channel_config.rb +178 -0
  163. data/lib/octo/server/channel/channel_manager.rb +468 -0
  164. data/lib/octo/server/channel/channel_ui_controller.rb +224 -0
  165. data/lib/octo/server/channel.rb +33 -0
  166. data/lib/octo/server/discover.rb +77 -0
  167. data/lib/octo/server/epipe_safe_io.rb +105 -0
  168. data/lib/octo/server/http_server.rb +3554 -0
  169. data/lib/octo/server/scheduler.rb +317 -0
  170. data/lib/octo/server/server_master.rb +325 -0
  171. data/lib/octo/server/session_registry.rb +431 -0
  172. data/lib/octo/server/web_ui_controller.rb +487 -0
  173. data/lib/octo/session_manager.rb +385 -0
  174. data/lib/octo/skill.rb +466 -0
  175. data/lib/octo/skill_loader.rb +328 -0
  176. data/lib/octo/tools/base.rb +118 -0
  177. data/lib/octo/tools/browser.rb +625 -0
  178. data/lib/octo/tools/edit.rb +165 -0
  179. data/lib/octo/tools/file_reader.rb +549 -0
  180. data/lib/octo/tools/glob.rb +162 -0
  181. data/lib/octo/tools/grep.rb +356 -0
  182. data/lib/octo/tools/invoke_skill.rb +96 -0
  183. data/lib/octo/tools/list_tasks.rb +54 -0
  184. data/lib/octo/tools/redo_task.rb +41 -0
  185. data/lib/octo/tools/request_user_feedback.rb +84 -0
  186. data/lib/octo/tools/security.rb +333 -0
  187. data/lib/octo/tools/terminal/output_cleaner.rb +63 -0
  188. data/lib/octo/tools/terminal/persistent_session.rb +268 -0
  189. data/lib/octo/tools/terminal/safe_rm.sh +106 -0
  190. data/lib/octo/tools/terminal/session_manager.rb +213 -0
  191. data/lib/octo/tools/terminal.rb +1828 -0
  192. data/lib/octo/tools/todo_manager.rb +374 -0
  193. data/lib/octo/tools/trash_manager.rb +388 -0
  194. data/lib/octo/tools/undo_task.rb +35 -0
  195. data/lib/octo/tools/web_fetch.rb +242 -0
  196. data/lib/octo/tools/web_search.rb +260 -0
  197. data/lib/octo/tools/write.rb +77 -0
  198. data/lib/octo/ui2/block_font.rb +10 -0
  199. data/lib/octo/ui2/components/base_component.rb +163 -0
  200. data/lib/octo/ui2/components/command_suggestions.rb +290 -0
  201. data/lib/octo/ui2/components/common_component.rb +96 -0
  202. data/lib/octo/ui2/components/inline_input.rb +226 -0
  203. data/lib/octo/ui2/components/input_area.rb +1338 -0
  204. data/lib/octo/ui2/components/message_component.rb +99 -0
  205. data/lib/octo/ui2/components/modal_component.rb +419 -0
  206. data/lib/octo/ui2/components/todo_area.rb +149 -0
  207. data/lib/octo/ui2/components/tool_component.rb +107 -0
  208. data/lib/octo/ui2/components/welcome_banner.rb +139 -0
  209. data/lib/octo/ui2/layout_manager.rb +807 -0
  210. data/lib/octo/ui2/line_editor.rb +363 -0
  211. data/lib/octo/ui2/markdown_renderer.rb +100 -0
  212. data/lib/octo/ui2/output_buffer.rb +370 -0
  213. data/lib/octo/ui2/progress_handle.rb +362 -0
  214. data/lib/octo/ui2/progress_indicator.rb +55 -0
  215. data/lib/octo/ui2/screen_buffer.rb +273 -0
  216. data/lib/octo/ui2/terminal_detector.rb +119 -0
  217. data/lib/octo/ui2/theme_manager.rb +85 -0
  218. data/lib/octo/ui2/themes/base_theme.rb +105 -0
  219. data/lib/octo/ui2/themes/hacker_theme.rb +62 -0
  220. data/lib/octo/ui2/themes/minimal_theme.rb +56 -0
  221. data/lib/octo/ui2/thinking_verbs.rb +26 -0
  222. data/lib/octo/ui2/ui_controller.rb +1625 -0
  223. data/lib/octo/ui2/view_renderer.rb +177 -0
  224. data/lib/octo/ui2.rb +40 -0
  225. data/lib/octo/ui_interface.rb +154 -0
  226. data/lib/octo/utils/arguments_parser.rb +191 -0
  227. data/lib/octo/utils/browser_detector.rb +195 -0
  228. data/lib/octo/utils/encoding.rb +92 -0
  229. data/lib/octo/utils/environment_detector.rb +140 -0
  230. data/lib/octo/utils/file_ignore_helper.rb +170 -0
  231. data/lib/octo/utils/file_processor.rb +601 -0
  232. data/lib/octo/utils/gitignore_parser.rb +154 -0
  233. data/lib/octo/utils/limit_stack.rb +152 -0
  234. data/lib/octo/utils/logger.rb +124 -0
  235. data/lib/octo/utils/login_shell.rb +72 -0
  236. data/lib/octo/utils/model_pricing.rb +646 -0
  237. data/lib/octo/utils/parser_manager.rb +165 -0
  238. data/lib/octo/utils/path_helper.rb +15 -0
  239. data/lib/octo/utils/scripts_manager.rb +59 -0
  240. data/lib/octo/utils/string_matcher.rb +158 -0
  241. data/lib/octo/utils/trash_directory.rb +112 -0
  242. data/lib/octo/utils/workspace_rules.rb +46 -0
  243. data/lib/octo/version.rb +5 -0
  244. data/lib/octo/web/app.css +7141 -0
  245. data/lib/octo/web/app.js +543 -0
  246. data/lib/octo/web/apple-touch-icon.png +0 -0
  247. data/lib/octo/web/auth.js +150 -0
  248. data/lib/octo/web/channels.js +276 -0
  249. data/lib/octo/web/datepicker.js +205 -0
  250. data/lib/octo/web/favicon.png +0 -0
  251. data/lib/octo/web/i18n.js +1073 -0
  252. data/lib/octo/web/icon-512.png +0 -0
  253. data/lib/octo/web/icon-dark.svg +25 -0
  254. data/lib/octo/web/icon.svg +29 -0
  255. data/lib/octo/web/index.html +871 -0
  256. data/lib/octo/web/marked.min.js +69 -0
  257. data/lib/octo/web/onboard.js +491 -0
  258. data/lib/octo/web/profile.js +442 -0
  259. data/lib/octo/web/sessions.js +4421 -0
  260. data/lib/octo/web/settings.js +913 -0
  261. data/lib/octo/web/sidebar.js +32 -0
  262. data/lib/octo/web/skills.js +885 -0
  263. data/lib/octo/web/tasks.js +297 -0
  264. data/lib/octo/web/theme.js +105 -0
  265. data/lib/octo/web/trash.js +343 -0
  266. data/lib/octo/web/vendor/hljs/highlight.min.js +1244 -0
  267. data/lib/octo/web/vendor/hljs/hljs-theme.css +95 -0
  268. data/lib/octo/web/vendor/katex/auto-render.min.js +1 -0
  269. data/lib/octo/web/vendor/katex/fonts/KaTeX_AMS-Regular.woff2 +0 -0
  270. data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Bold.woff2 +0 -0
  271. data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Regular.woff2 +0 -0
  272. data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Bold.woff2 +0 -0
  273. data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Regular.woff2 +0 -0
  274. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Bold.woff2 +0 -0
  275. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-BoldItalic.woff2 +0 -0
  276. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Italic.woff2 +0 -0
  277. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Regular.woff2 +0 -0
  278. data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-BoldItalic.woff2 +0 -0
  279. data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-Italic.woff2 +0 -0
  280. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Bold.woff2 +0 -0
  281. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Italic.woff2 +0 -0
  282. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Regular.woff2 +0 -0
  283. data/lib/octo/web/vendor/katex/fonts/KaTeX_Script-Regular.woff2 +0 -0
  284. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size1-Regular.woff2 +0 -0
  285. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size2-Regular.woff2 +0 -0
  286. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size3-Regular.woff2 +0 -0
  287. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size4-Regular.woff2 +0 -0
  288. data/lib/octo/web/vendor/katex/fonts/KaTeX_Typewriter-Regular.woff2 +0 -0
  289. data/lib/octo/web/vendor/katex/katex.min.css +1 -0
  290. data/lib/octo/web/vendor/katex/katex.min.js +1 -0
  291. data/lib/octo/web/version.js +449 -0
  292. data/lib/octo/web/weixin-qr.html +209 -0
  293. data/lib/octo/web/ws-dispatcher.js +357 -0
  294. data/lib/octo/web/ws.js +128 -0
  295. data/lib/octo.rb +145 -0
  296. data/scripts/build/build.sh +329 -0
  297. data/scripts/build/lib/apt.sh +56 -0
  298. data/scripts/build/lib/brew.sh +89 -0
  299. data/scripts/build/lib/colors.sh +17 -0
  300. data/scripts/build/lib/gem.sh +95 -0
  301. data/scripts/build/lib/mise.sh +125 -0
  302. data/scripts/build/lib/network.sh +157 -0
  303. data/scripts/build/lib/os.sh +57 -0
  304. data/scripts/build/lib/shell.sh +37 -0
  305. data/scripts/build/src/install.sh.cc +174 -0
  306. data/scripts/build/src/install_browser.sh.cc +101 -0
  307. data/scripts/build/src/install_full.sh.cc +290 -0
  308. data/scripts/build/src/install_rails_deps.sh.cc +145 -0
  309. data/scripts/build/src/install_system_deps.sh.cc +123 -0
  310. data/scripts/build/src/uninstall.sh.cc +101 -0
  311. data/scripts/install.ps1 +532 -0
  312. data/scripts/install.sh +567 -0
  313. data/scripts/install_browser.sh +479 -0
  314. data/scripts/install_full.sh +838 -0
  315. data/scripts/install_rails_deps.sh +746 -0
  316. data/scripts/install_system_deps.sh +518 -0
  317. data/scripts/uninstall.sh +287 -0
  318. data/sig/octo.rbs +4 -0
  319. metadata +614 -0
@@ -0,0 +1,800 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Octo
4
+ class Agent
5
+ # LLM API call management
6
+ # Handles API calls with retry logic, fallback model support, and progress indication
7
+ module LlmCaller
8
+ # Number of consecutive RetryableError failures (503/429/5xx) before switching to fallback.
9
+ # Network-level errors (connection failures, timeouts) do NOT trigger fallback — they are
10
+ # retried on the primary model for the full max_retries budget, since they are likely
11
+ # transient infrastructure blips rather than a model-level outage.
12
+ RETRIES_BEFORE_FALLBACK = 3
13
+
14
+ # After switching to the fallback model, allow this many retries before giving up.
15
+ # Kept lower than max_retries (10) because we have already exhausted the primary model.
16
+ MAX_RETRIES_ON_FALLBACK = 5
17
+
18
+ # Execute LLM API call with retry/fallback logic, in-place retry progress updates, and token usage collection.
19
+ #
20
+ # Fallback / probing state machine (driven by AgentConfig):
21
+ #
22
+ # :primary_ok (nil)
23
+ # Normal operation — use the configured model.
24
+ # After RETRIES_BEFORE_FALLBACK consecutive failures → :fallback_active
25
+ #
26
+ # :fallback_active
27
+ # Use fallback model. After FALLBACK_COOLING_OFF_SECONDS (30 min) the
28
+ # config transitions to :probing on the next call_llm entry.
29
+ #
30
+ # :probing
31
+ # Silently attempt the primary model once.
32
+ # Success → config transitions back to :primary_ok, user notified.
33
+ # Failure → renew cooling-off clock, back to :fallback_active, then
34
+ # retry the *same* request with the fallback model so the
35
+ # user experiences no extra delay.
36
+ #
37
+ # @return [Hash] API response with :content, :tool_calls, :usage, etc.
38
+ # NOTE on progress lifecycle:
39
+ # call_llm intentionally does NOT start or stop the progress indicator.
40
+ # Ownership lives with the caller (Agent#think for normal/compression
41
+ # paths, Agent#trigger_idle_compression for idle compression). This
42
+ # avoids nested active/done pairs clobbering each other — a bug that
43
+ # silently dropped the idle-compression summary line.
44
+ #
45
+ # Inside call_llm we only *update in place* during retries, so the
46
+ # already-live progress slot shows meaningful transient status
47
+ # ("Network failed… attempt 2/10", etc.).
48
+ private def call_llm
49
+ # Transition :fallback_active → :probing if cooling-off has expired.
50
+ @config.maybe_start_probing
51
+
52
+ tools_to_send = @tool_registry.all_definitions
53
+
54
+ max_retries = 10
55
+ retry_delay = 5
56
+ retries = 0
57
+
58
+ # Track whether any of the retry/fallback branches below opened a
59
+ # "retrying" progress slot via show_progress(progress_type:
60
+ # "retrying", phase: "active"). If so, we MUST close it before
61
+ # leaving call_llm — otherwise the UI's legacy shim in
62
+ # UI2::UIController keeps the :quiet ProgressHandle alive, its
63
+ # ticker thread keeps running, and the user sees a frozen
64
+ # "Network failed: ... (681s)" line long after the task finished.
65
+ #
66
+ # The close is done in the outer ensure below so it runs on:
67
+ # - normal success (response returned)
68
+ # - unrecoverable failure (raise propagates out)
69
+ # - BadRequestError reasoning-content retry success
70
+ retrying_progress_opened = false
71
+ # One-shot flag set by the BadRequestError rescue below when the server
72
+ # complained about missing reasoning_content. The subsequent retry will
73
+ # pad every assistant message's reasoning_content, which satisfies
74
+ # DeepSeek / Kimi thinking-mode providers even when the earlier turns
75
+ # were produced by a different provider (e.g. MiniMax keeps thinking
76
+ # inline in content and never emits a reasoning_content field, so the
77
+ # history-evidence heuristic in MessageHistory can't infer thinking
78
+ # mode on its own). We retry at most once — if padding doesn't fix it,
79
+ # the error is something else and we let it propagate.
80
+ force_reasoning_content_pad = false
81
+ thinking_retry_attempted = false
82
+ # One-shot flag for context-overflow recovery. When the server complains
83
+ # the input exceeds the model's context window, we run a forced
84
+ # compression with pull_back_from_tail: 1 (preserves the model's
85
+ # two-checkpoint prompt cache) and retry the original request once.
86
+ # We retry at most once — if still overflowing afterward, the issue is
87
+ # something else (e.g. tool schemas alone exceed the window) and we let
88
+ # the error propagate.
89
+ context_overflow_retry_attempted = false
90
+
91
+ begin
92
+ begin
93
+ # Use active_messages (Time Machine) when undone, otherwise send full history.
94
+ # to_api strips internal fields and handles orphaned tool_calls.
95
+ messages_to_send = if respond_to?(:active_messages)
96
+ active_messages(force_reasoning_content_pad: force_reasoning_content_pad)
97
+ else
98
+ @history.to_api(force_reasoning_content_pad: force_reasoning_content_pad)
99
+ end
100
+
101
+ response = @client.send_messages_with_tools(
102
+ messages_to_send,
103
+ model: current_model,
104
+ tools: tools_to_send,
105
+ max_tokens: @config.max_tokens,
106
+ enable_caching: @config.enable_prompt_caching,
107
+ reasoning_effort: @reasoning_effort,
108
+ on_chunk: build_progress_on_chunk
109
+ )
110
+
111
+ # Successful response — if we were probing, confirm primary is healthy.
112
+ handle_probe_success if @config.probing?
113
+
114
+ # ── Upstream truncation detector ──────────────────────────────────
115
+ # OpenRouter / Bedrock and other routers sometimes close the SSE
116
+ # stream mid-tool_use: we receive finish_reason="stop" together with
117
+ # a syntactically valid tool_call whose `arguments` JSON is empty,
118
+ # "{}" (placeholder before any key was streamed), or otherwise
119
+ # unparseable. Treat this as retryable — otherwise the agent would
120
+ # execute a tool with empty args (often failing cryptically) or
121
+ # silently exit thinking the task is done.
122
+ #
123
+ # Raises UpstreamTruncatedError (a RetryableError) so the rescue
124
+ # block below handles retry + fallback identically to 5xx/429.
125
+ detect_upstream_truncation!(response)
126
+
127
+ rescue Faraday::TimeoutError => e
128
+ # ── Read-timeout path (distinct from connection-level failures) ──
129
+ # Faraday::TimeoutError on our non-streaming POST almost always means
130
+ # the *response* took longer than the 300s read-timeout to come back —
131
+ # i.e. the model is trying to produce a huge output in one shot
132
+ # (e.g. "write me a 2000-line snake game"). Blindly retrying the same
133
+ # request with the same prompt reproduces the same timeout.
134
+ #
135
+ # Strategy:
136
+ # 1. On the FIRST timeout in a task, inject a `[SYSTEM]` user message
137
+ # telling the model to break the work into smaller steps, then
138
+ # retry. The history edit changes the prompt, so the retry is
139
+ # materially different from the failed attempt.
140
+ # 2. On subsequent timeouts in the same task, fall back to the
141
+ # generic "just retry" behaviour (the model may have ignored
142
+ # the hint; don't pile on duplicate hints).
143
+ # 3. Probing-mode timeouts still go through handle_probe_failure.
144
+ retries += 1
145
+
146
+ if @config.probing?
147
+ handle_probe_failure
148
+ retry
149
+ end
150
+
151
+ if retries <= max_retries
152
+ inject_large_output_hint_if_first_timeout(e)
153
+ @ui&.show_progress(
154
+ "Response too slow (likely generating too much at once): #{e.message}",
155
+ progress_type: "retrying",
156
+ phase: "active",
157
+ metadata: { attempt: retries, total: max_retries }
158
+ )
159
+ retrying_progress_opened = true
160
+ sleep retry_delay
161
+ retry
162
+ else
163
+ raise AgentError, "[LLM] Request timed out after #{max_retries} retries: #{e.message}"
164
+ end
165
+
166
+ rescue Faraday::ConnectionFailed, Faraday::SSLError, Errno::ECONNREFUSED, Errno::ETIMEDOUT => e
167
+ retries += 1
168
+
169
+ # Probing failure: primary still down — renew cooling-off and retry with fallback.
170
+ if @config.probing?
171
+ handle_probe_failure
172
+ retry
173
+ end
174
+
175
+ # Connection-level errors (DNS, TCP refused, open-timeout, TLS) are
176
+ # transient infrastructure blips — do NOT trigger fallback, and do
177
+ # NOT inject the "break into steps" hint (the model did nothing wrong).
178
+ # Just retry on the current model up to max_retries.
179
+ if retries <= max_retries
180
+ @ui&.show_progress(
181
+ "Network failed: #{e.message}",
182
+ progress_type: "retrying",
183
+ phase: "active",
184
+ metadata: { attempt: retries, total: max_retries }
185
+ )
186
+ retrying_progress_opened = true
187
+ sleep retry_delay
188
+ retry
189
+ else
190
+ # Don't show_error here — let the outer rescue block handle it to avoid duplicates.
191
+ # The caller cleans up its own progress slot (via its own ensure block); the "retrying" slot is closed by call_llm's outer ensure.
192
+ raise AgentError, "[LLM] Network connection failed after #{max_retries} retries: #{e.message}"
193
+ end
194
+
195
+ rescue RetryableError => e
196
+ retries += 1
197
+
198
+ # Probing failure: primary still down — renew cooling-off and retry with fallback.
199
+ if @config.probing?
200
+ handle_probe_failure
201
+ retry
202
+ end
203
+
204
+ # RetryableError (503/429/5xx/ThrottlingException) signals a service-level outage.
205
+ # After RETRIES_BEFORE_FALLBACK attempts, switch to the fallback model and reset the
206
+ # retry counter — but cap fallback retries at MAX_RETRIES_ON_FALLBACK (< max_retries)
207
+ # since we have already confirmed the primary is struggling.
208
+ current_max = @config.fallback_active? ? MAX_RETRIES_ON_FALLBACK : max_retries
209
+
210
+ if retries <= current_max
211
+ if retries == RETRIES_BEFORE_FALLBACK && !@config.fallback_active?
212
+ if try_activate_fallback(current_model)
213
+ retries = 0
214
+ retry
215
+ end
216
+ end
217
+ @ui&.show_progress(
218
+ e.message,
219
+ progress_type: "retrying",
220
+ phase: "active",
221
+ metadata: { attempt: retries, total: current_max }
222
+ )
223
+ retrying_progress_opened = true
224
+ sleep retry_delay
225
+ retry
226
+ else
227
+ # Don't show_error here — let the outer rescue block handle it to avoid duplicates.
228
+ # The caller cleans up its own progress slot (via its own ensure block); the "retrying" slot is closed by call_llm's outer ensure.
229
+ raise AgentError, "[LLM] Service unavailable after #{current_max} retries"
230
+ end
231
+
232
+ rescue Octo::BadRequestError => e
233
+ # One-shot recovery for "context too long" errors. The model's
234
+ # context window is exceeded by the current history+tools+system
235
+ # prompt. We run a forced compression with pull_back_from_tail: 1
236
+ # (preserves the two-checkpoint prompt cache so the compression
237
+ # call itself still hits cache#A on the second-to-last position),
238
+ # then retry the original request once.
239
+ if !context_overflow_retry_attempted &&
240
+ !@compressing_for_overflow &&
241
+ context_too_long_error?(e) &&
242
+ respond_to?(:compress_messages_if_needed, true)
243
+ context_overflow_retry_attempted = true
244
+ Octo::Logger.info(
245
+ "[context-overflow] caught BadRequestError, attempting forced compression with pull-back",
246
+ error_message: e.message[0, 200],
247
+ history_size: @history.size,
248
+ previous_total_tokens: @previous_total_tokens
249
+ )
250
+ # Layer 1: standard cache-preserving compression (pull_back: 1).
251
+ # Handles 99% of real overflow cases (newest message tipped the
252
+ # request just past the window).
253
+ if perform_context_overflow_compression(mode: :standard)
254
+ retry
255
+ end
256
+
257
+ # Layer 2: aggressive fallback. The Layer 1 compression call
258
+ # itself overflowed — happens when a single newly-appended
259
+ # message is enormous (huge tool_result, pasted file, etc.) so
260
+ # popping just K=1 didn't bring the request below the window.
261
+ # Pop ~half the history this time; sacrifices prompt cache to
262
+ # guarantee the compression call fits.
263
+ Octo::Logger.warn(
264
+ "[context-overflow] standard compression failed, escalating to aggressive mode"
265
+ )
266
+ if perform_context_overflow_compression(mode: :aggressive)
267
+ retry
268
+ end
269
+
270
+ # Both layers exhausted. Let the original error propagate so the
271
+ # user sees the underlying provider message. This should be
272
+ # extremely rare — would require both halves of the history to
273
+ # individually exceed the window, which is essentially impossible
274
+ # under the "previous turn succeeded" invariant.
275
+ Octo::Logger.error(
276
+ "[context-overflow] both standard and aggressive compression failed; " \
277
+ "propagating original error"
278
+ )
279
+ raise
280
+ end
281
+
282
+ # One-shot recovery for thinking-mode providers (DeepSeek V4, Kimi K2)
283
+ # that require every assistant message in the history to carry a
284
+ # reasoning_content field. The history-evidence heuristic in
285
+ # MessageHistory#to_api can miss this when the preceding turns came
286
+ # from a different thinking style (e.g. MiniMax keeps <think>...</think>
287
+ # inline in content and never emits reasoning_content) — so we detect
288
+ # the error here and retry once with forced padding.
289
+ if !thinking_retry_attempted && reasoning_content_missing_error?(e)
290
+ thinking_retry_attempted = true
291
+ force_reasoning_content_pad = true
292
+ Octo::Logger.info(
293
+ "[thinking-mode] retrying with forced reasoning_content padding " \
294
+ "(model=#{@config.model_name.inspect} base_url=#{@config.base_url.inspect})"
295
+ )
296
+ retry
297
+ end
298
+ raise
299
+ end
300
+
301
+ # Collect token usage data from API response (no cost tracking)
302
+ token_data = collect_iteration_tokens(response[:usage])
303
+ response[:token_usage] = token_data
304
+
305
+ # [DIAG] Log raw client response shape. Only emit when we see the
306
+ # "finish_reason=stop + non-empty tool_calls" combo, or when any
307
+ # tool_call's arguments look empty/unparseable — both indicate the
308
+ # upstream (Bedrock/relay/model) cut the tool_use stream short.
309
+ # Normal responses produce no log line (too noisy).
310
+ begin
311
+ tool_calls = response[:tool_calls] || []
312
+ if !tool_calls.empty?
313
+ raw_tcs = tool_calls.map do |c|
314
+ args_str = c[:arguments].is_a?(String) ? c[:arguments] : c[:arguments].to_s
315
+ parseable = begin
316
+ JSON.parse(args_str)
317
+ true
318
+ rescue StandardError
319
+ false
320
+ end
321
+ {
322
+ name: c[:name].to_s,
323
+ args_len: args_str.length,
324
+ args_parseable: parseable,
325
+ args_head: args_str[0, 120]
326
+ }
327
+ end
328
+ truncated_call = raw_tcs.any? { |t| t[:args_len] == 0 || t[:args_len] == 2 || !t[:args_parseable] }
329
+ suspicious = response[:finish_reason] == "stop"
330
+
331
+ if suspicious || truncated_call
332
+ Octo::Logger.warn("llm.response_suspicious",
333
+ model: current_model,
334
+ finish_reason: response[:finish_reason].to_s,
335
+ tool_calls_count: raw_tcs.size,
336
+ tool_calls: raw_tcs,
337
+ completion_tokens: token_data[:completion_tokens],
338
+ ttft_ms: response.dig(:latency, :ttft_ms),
339
+ combo_stop_with_toolcalls: suspicious,
340
+ has_truncated_args: truncated_call
341
+ )
342
+ end
343
+ end
344
+ rescue StandardError => e
345
+ Octo::Logger.warn("llm.response_log_failed", error: e.message)
346
+ end
347
+
348
+ response
349
+ ensure
350
+ # Close any "retrying" progress slot that was opened during the
351
+ # retry/fallback loop above. The legacy UI shim allocates a
352
+ # separate :quiet ProgressHandle under the "retrying" key; if it
353
+ # is never finished its ticker thread keeps running and the user
354
+ # sees a stale "Network failed: ... (NNN s)" line long after the
355
+ # task has completed. This ensure runs on:
356
+ # - successful retry → close the slot, message is "Recovered"
357
+ # so the final frame is informative rather than blank
358
+ # - unrecoverable failure that raises out → close the slot so
359
+ # the spinner doesn't linger while the error bubbles up
360
+ if retrying_progress_opened
361
+ @ui&.show_progress(progress_type: "retrying", phase: "done")
362
+ end
363
+ end
364
+ end
365
+
366
+ # Attempt to activate the provider fallback model for the given primary model.
367
+ # Shows a user-visible warning when switching. Returns true if a fallback was found
368
+ # and activated, false if no fallback is configured.
369
+ # @param failed_model [String] the model name that is currently failing
370
+ # @return [Boolean]
371
private def try_activate_fallback(failed_model)
  # Ask the config which model (if any) is registered as backup for the
  # one that is failing.
  replacement = @config.fallback_model_for(failed_model)

  if replacement
    @config.activate_fallback!(replacement)
    # @ui may be absent (headless use); the warning is then skipped.
    @ui&.show_warning(
      "Model #{failed_model} appears unavailable. " \
      "Automatically switching to fallback model: #{replacement}"
    )
    true
  else
    # Nothing configured — caller keeps treating the error as-is.
    false
  end
end
382
+
383
+ # Called when a probe attempt (testing primary after cooling-off) succeeds.
384
+ # Resets the state machine to :primary_ok and notifies the user.
385
private def handle_probe_success
  # Read the model name first, then confirm — the name is captured before
  # the config's fallback state is changed by confirm_fallback_ok!.
  recovered_model = @config.model_name
  @config.confirm_fallback_ok!
  return unless @ui

  @ui.show_warning("Primary model #{recovered_model} is healthy again. Switched back automatically.")
end
390
+
391
+ # Called when a probe attempt fails.
392
+ # Renews the cooling-off clock (back to :fallback_active) so the *same*
393
+ # request is immediately retried with the fallback model — no extra delay.
394
private def handle_probe_failure
  # The fallback model is read straight from the config's ivar.
  # NOTE(review): reaching into @fallback_model bypasses the config's
  # public interface — confirm whether a reader exists and prefer it.
  backup_model = @config.instance_variable_get(:@fallback_model)
  primary_model = @config.model_name

  # Re-activating the same fallback renews @fallback_since (per the
  # original author's note), restarting the cooling-off window.
  @config.activate_fallback!(backup_model)

  @ui&.show_warning(
    "Primary model #{primary_model} still unavailable. " \
    "Continuing with fallback model: #{backup_model}"
  )
end
403
+
404
+ # Run a forced compression to recover from a context-overflow error.
405
+ # Called by the BadRequestError rescue when context_too_long_error?
406
+ # returns true.
407
+ #
408
+ # Two-layer defence:
409
+ # ────────────────────────────────────────────────────────────────────
410
+ # Layer 1 (mode: :standard, default) — preserves prompt cache.
411
+ # Pop K=1 message from @history tail, then run compression. This
412
+ # frees just enough token budget for the compression LLM call
413
+ # itself to fit, while preserving the model's two-checkpoint prompt
414
+ # cache (cache#A at second-to-last position is still hit). The
415
+ # popped message is reattached to the rebuilt history's tail by
416
+ # handle_compression_response, so recent task progress is not lost.
417
+ # Handles 99% of real-world cases where overflow is caused by the
418
+ # newest message pushing total just past the window.
419
+ #
420
+ # Layer 2 (mode: :aggressive) — sacrifices prompt cache to survive.
421
+ # Pop ~half the history (capped) from the tail. This dramatically
422
+ # shrinks the compression call's input regardless of how big any
423
+ # single message is. Used as a fallback when Layer 1 itself raises
424
+ # context_too_long — i.e. a single newly-appended message is so
425
+ # large (e.g. >50K-token tool_result, pasted huge file) that even
426
+ # removing it didn't bring the request under the window, OR the
427
+ # popped message was small but earlier history grew past the limit.
428
+ # Pulled-back messages are still reattached after compression so no
429
+ # user content is silently dropped.
430
+ #
431
+ # @param mode [Symbol] :standard or :aggressive
432
+ # @return [Boolean] true if compression succeeded (caller should retry
433
+ # the original request), false if compression was unable to run
434
+ # (compression disabled, history too short, etc.) or itself failed
435
+ # — caller decides whether to escalate to the next layer or
436
+ # propagate the original error.
437
private def perform_context_overflow_compression(mode: :standard)
  # Hosts that do not provide the compression helper cannot recover.
  return false unless respond_to?(:compress_messages_if_needed, true)

  pull_back = overflow_pull_back_count(mode)

  # Flag is visible to the recursive call_llm below and is always cleared
  # again in the ensure clause, whatever the outcome.
  @compressing_for_overflow = true
  compression_context = nil

  begin
    compression_context = compress_messages_if_needed(
      force: true,
      pull_back_from_tail: pull_back
    )
    # nil means the compressor declined to run; nothing was appended.
    return false if compression_context.nil?

    compression_message = compression_context[:compression_message]
    @history.append(compression_message)

    response = call_llm # recursive — guarded by @compressing_for_overflow
    handle_compression_response(response, compression_context)
    Octo::Logger.info(
      "[context-overflow] compression succeeded",
      mode: mode,
      pull_back: pull_back
    )
    true
  rescue => e
    # Compression failed mid-flight. Restore @history to a sensible state:
    # roll back the compression instruction we appended, and re-append the
    # pulled-back messages so the user's recent work isn't silently lost.
    if compression_context
      cm = compression_context[:compression_message]
      @history.rollback_before(cm) if cm
      (compression_context[:pulled_back_messages] || []).each do |m|
        @history.append(m)
      end
    end
    Octo::Logger.warn(
      "[context-overflow] compression failed during overflow recovery",
      mode: mode,
      pull_back: pull_back,
      error_class: e.class.name,
      error_message: e.message[0, 200]
    )
    false
  ensure
    @compressing_for_overflow = false
  end
end

# Number of tail messages to pull back from @history before compressing.
#
# Standard mode always pulls back exactly one message. Aggressive mode
# pulls back about half the history: at least 4, at most
# (history size - 2), and never more than 64.
#
# The (history size - 2) ceiling turns negative when the history holds
# fewer than two entries; the result is clamped to zero so the compressor
# is never handed a negative count.
#
# @param mode [Symbol] :standard or :aggressive
# @return [Integer] non-negative pull-back count
private def overflow_pull_back_count(mode)
  return 1 unless mode == :aggressive

  history_size = @history.size
  wanted = [history_size / 2, 4].max
  ceiling = [history_size - 2, 64].min
  [[wanted, ceiling].min, 0].max
end
498
+
499
+ # True when a 400 BadRequestError is specifically about a missing
500
+ # reasoning_content field in thinking mode (DeepSeek V4, Kimi K2 thinking).
501
+ # We require TWO distinct substrings to avoid false positives — a generic
502
+ # 400 that happens to mention "reasoning_content" in passing (e.g. a
503
+ # validation hint in some unrelated provider) must NOT trigger the pad
504
+ # retry, which would silently add an empty field to every assistant
505
+ # message in the history.
506
private def reasoning_content_missing_error?(err)
  return false unless err.is_a?(Octo::BadRequestError)

  text = err.message.to_s.downcase
  # The field name alone is not enough; a second, corroborating phrase
  # must also be present before the padded retry is considered.
  return false unless text.include?("reasoning_content")

  ["thinking", "must be passed back", "must be provided"].any? do |marker|
    text.include?(marker)
  end
end
514
+
515
+ # True when a 400 BadRequestError indicates the request exceeded the
516
+ # model's context window (i.e. the conversation history is too long).
517
+ #
518
+ # We deliberately favour broad detection over narrow precision:
519
+ # - False positive cost: one extra (no-op) compression cycle.
520
+ # - False negative cost: user is stuck — every retry hits the same wall.
521
+ # So the matcher is intentionally permissive.
522
+ #
523
+ # Coverage (verified against real production error strings):
524
+ #
525
+ # OpenAI:
526
+ # "This model's maximum context length is 128000 tokens. However
527
+ # you requested ... Please reduce the length of the messages."
528
+ # error.code == "context_length_exceeded"
529
+ #
530
+ # Anthropic:
531
+ # "prompt is too long: 218849 tokens > 200000 maximum"
532
+ #
533
+ # Qwen / Alibaba (DashScope):
534
+ # "You passed 117345 input tokens and requested 8192 output tokens.
535
+ # However the model's context length is only 125536 tokens, resulting
536
+ # in a maximum input length of 117344 tokens. Please reduce the length
537
+ # of the input prompt. (parameter=input_tokens, value=117345)"
538
+ #
539
+ # Qwen / Alibaba (DashScope) — newer/terser format (qwen3.6 series):
540
+ # "InternalError.Algo.InvalidParameter: Range of input length should be [1, 229376]"
541
+ #
542
+ # DeepSeek / Kimi / MiniMax / most OpenAI-compatible relays:
543
+ # Variants of OpenAI-style "context length" / "tokens exceeds" wording.
544
+ #
545
+ # Generic gateways (Portkey, OpenRouter):
546
+ # "The total number of tokens exceeds the model's maximum context length"
547
private def context_too_long_error?(err)
  return false unless err.is_a?(Octo::BadRequestError)

  text = err.message.to_s.downcase

  # Multi-word phrases; a single hit is treated as conclusive.
  conclusive = [
    "context length",                    # OpenAI / Qwen / many compat APIs
    "context_length_exceeded",           # OpenAI error.code
    "maximum context",                   # OpenAI variant
    "maximum input length",              # Qwen
    "prompt is too long",                # Anthropic
    "input is too long",                 # Anthropic-compat relays
    "exceeds the maximum context",       # Portkey & generic gateways
    "exceeds the model's context",       # Generic
    "exceeds the model's maximum",       # Generic
    "reduce the length of the input",    # Qwen action hint
    "reduce the length of the messages", # OpenAI action hint
    "reduce the length of your",         # Generic action hint
    "reduce the length of the prompt",   # Generic action hint
    "range of input length"              # Qwen DashScope qwen3.6+ terse format
  ]

  # Checked in order: phrase list, then the Anthropic-style
  # "<N> tokens > <N>" comparison, then Qwen's structured field marker.
  conclusive.any? { |phrase| text.include?(phrase) } ||
    text.match?(/\d+\s*tokens?\s*>\s*\d+/) ||
    text.include?("parameter=input_tokens")
end
580
+
581
+ # Detect upstream tool-call truncation and raise UpstreamTruncatedError
582
+ # so the standard RetryableError rescue (with fallback model support)
583
+ # handles retry identically to 5xx/429.
584
+ #
585
+ # Background: OpenRouter routes to Anthropic/Bedrock/etc. and passes
586
+ # through whatever the upstream sends. If the upstream closes the SSE
587
+ # stream mid-tool_use (observed with Anthropic at ~127 s TTFT under
588
+ # load), OpenRouter does NOT surface an error — it emits a valid
589
+ # `tool_calls[]` whose `arguments` is empty, `"{}"`, or non-parseable
590
+ # JSON. Without this check the agent would either execute the tool with
591
+ # empty args or (worse) silently exit thinking the task finished.
592
+ #
593
+ # Rule is deliberately narrow: we only intercept the case where the
594
+ # model streamed literally nothing into the tool_call arguments —
595
+ # i.e. `nil`, empty string, or the placeholder `"{}"`. Partial/invalid
596
+ # JSON (e.g. `{"path": "/tmp/x"`) is left to the existing
597
+ # ArgumentsParser → BadArgumentsError path, because the model already
598
+ # committed to specific values and feeding the parse error back as a
599
+ # tool_result lets it self-correct in one round-trip (faster than a
600
+ # blind retry from scratch).
601
private def detect_upstream_truncation!(response)
  calls = response[:tool_calls]
  return if calls.nil? || calls.empty?

  # First tool_call whose arguments look completely empty, if any.
  offender = calls.detect { |call| tool_call_args_truncated?(call[:arguments]) }
  return unless offender

  # Arguments may not be a String (e.g. nil); stringify for logging.
  raw_args = offender[:arguments]
  args_text = raw_args.is_a?(String) ? raw_args : raw_args.to_s

  Octo::Logger.warn(
    "llm.upstream_truncation_detected",
    model: current_model,
    tool_name: offender[:name].to_s,
    args_len: args_text.length,
    args_head: args_text[0, 80],
    finish_reason: response[:finish_reason].to_s,
    completion_tokens: response.dig(:token_usage, :completion_tokens),
    ttft_ms: response.dig(:latency, :ttft_ms)
  )

  # One-shot hint appended to history before the error is raised, so the
  # retried request already carries the "use smaller arguments" advice.
  inject_upstream_truncation_hint_if_first(offender)

  failure = "[LLM] Upstream truncated tool_call `#{offender[:name]}` " \
            "(args=#{args_text[0, 40].inspect}). Retrying..."
  raise Octo::UpstreamTruncatedError, failure
end
630
+
631
+ # True when a tool_call's arguments field looks COMPLETELY empty —
632
+ # i.e. the upstream stream was cut before the model wrote any real
633
+ # content into the arguments JSON.
634
+ #
635
+ # Rules:
636
+ # - nil / non-String / empty string → truncated (nothing at all)
637
+ # - parses to {} (empty object) → truncated (placeholder only)
638
+ # - anything else (including partial/invalid JSON like `{"path":
639
+ # "/tmp/x"` where the model already started writing) → NOT
640
+ # truncated by this detector
641
+ #
642
+ # Partial-JSON cases are deliberately left to the existing
643
+ # ArgumentsParser → BadArgumentsError path, which surfaces the parse
644
+ # error back to the LLM as a tool_result so it can self-correct. That
645
+ # is more efficient than a blind retry when the model already wrote
646
+ # most of the args.
647
private def tool_call_args_truncated?(args)
  # nil, any non-String value, and "" all count as "nothing was streamed".
  return true unless args.is_a?(String) && !args.empty?

  begin
    decoded = JSON.parse(args)
  rescue JSON::ParserError
    # Partial/invalid JSON is not this detector's concern.
    return false
  end

  # Only an empty JSON object is treated as a placeholder.
  decoded.is_a?(Hash) && decoded.empty?
end
661
+
662
+ # On the FIRST Faraday::TimeoutError within a task, append a [SYSTEM]
663
+ # user message to the history instructing the model to break its work
664
+ # into smaller steps. Subsequent timeouts in the same task are ignored
665
+ # here (caller just retries) so we don't pollute history with duplicate
666
+ # hints.
667
+ #
668
+ # The injected message carries `system_injected: true` so it is:
669
+ # - Hidden from UI replay (session_serializer / replay_history filters)
670
+ # - Skipped by prompt-caching marker placement (client.rb)
671
+ # - Skipped by message compression's "recent user turn" protection
672
+ # (message_compressor_helper.rb)
673
+ #
674
+ # Reset per-task via Agent#run (see @task_timeout_hint_injected = false).
675
private def inject_large_output_hint_if_first_timeout(err)
  # One-shot latch: later calls return before touching the history.
  return if @task_timeout_hint_injected

  @task_timeout_hint_injected = true

  preamble = "[SYSTEM] The previous LLM response timed out (read timeout after ~300s). " \
             "This usually means the model was trying to produce too much output in a single response. " \
             "Please change your approach:"
  advice = [
    "- Break the task into multiple smaller steps, each producing a short response.",
    "- For long files: first create a skeleton with `write` (structure + placeholder comments only), " \
      "then fill in each section with separate `edit` calls.",
    "- Keep each single tool-call argument (especially file content) well under ~500 lines.",
    "- Do NOT attempt to output the entire deliverable in one response."
  ]
  hint = [preamble, *advice].join("\n")

  # Stored as a user-role message flagged system_injected and tagged with
  # the current task id.
  injected = { role: "user", content: hint, system_injected: true, task_id: @current_task_id }
  @history.append(injected)

  Octo::Logger.info(
    "[llm_caller] Read-timeout detected — injected 'break into smaller steps' hint " \
    "(error=#{err.class}: #{err.message})"
  )

  @ui&.show_warning(
    "LLM response timed out — asking model to break the task into smaller steps and retrying..."
  )
end
705
+
706
+ # On the FIRST upstream-truncation detection within a task, append a
707
+ # [SYSTEM] user message nudging the model toward smaller tool_call args.
708
+ # This guards against the (real but rare) case where the upstream SSE
709
+ # cut correlates with large tool_call payloads — a plain retry on the
710
+ # same oversized args would keep tripping the same wire.
711
+ #
712
+ # For purely infrastructural truncations (Anthropic edge blip, router
713
+ # hiccup), the hint is harmless — the retry will succeed and the hint
714
+ # just sits unused in history. Cheaper than letting the agent burn
715
+ # through its retry budget on the same oversized payload.
716
+ #
717
+ # Same plumbing as inject_large_output_hint_if_first_timeout: one-shot
718
+ # per task, carries `system_injected: true` so it's hidden from UI
719
+ # replay and skipped by compression/caching placement logic. Reset per
720
+ # task via Agent#run (see @task_upstream_truncation_hint_injected).
721
private def inject_upstream_truncation_hint_if_first(truncated_call)
  # One-shot latch: later calls return before touching the history.
  return if @task_upstream_truncation_hint_injected

  @task_upstream_truncation_hint_injected = true

  tool_name = truncated_call[:name].to_s

  preamble = "[SYSTEM] The previous response was cut short by the upstream provider " \
             "before the `#{tool_name}` tool_call finished streaming. " \
             "The partial tool_call has been discarded. To avoid the same problem on retry, " \
             "please adapt your approach:"
  advice = [
    "- Prefer smaller tool_call arguments — large single-shot payloads are more likely to be truncated.",
    "- For long file content: create the file first with a minimal skeleton via `write`, " \
      "then append sections one at a time with `edit`.",
    "- Break large tasks into multiple smaller tool calls instead of one big one.",
    "- Keep each tool-call argument comfortably under ~2000 characters when possible."
  ]
  hint = [preamble, *advice].join("\n")

  # Stored as a user-role message flagged system_injected and tagged with
  # the current task id.
  injected = { role: "user", content: hint, system_injected: true, task_id: @current_task_id }
  @history.append(injected)

  Octo::Logger.info(
    "[llm_caller] Upstream truncation — injected 'smaller tool_call args' hint " \
    "(tool=#{tool_name.inspect})"
  )

  @ui&.show_warning(
    "Upstream response was truncated mid tool-call — asking model to use smaller steps and retrying..."
  )
end
753
+
754
+ # Build a streaming progress callback for Client#send_messages_with_tools.
755
+ # Returns nil when no UI is attached, so the client skips the streaming
756
+ # plumbing entirely. Callback throttles UI updates to avoid flooding the
757
+ # progress handle on fast streams.
758
private def build_progress_on_chunk
  return nil unless @ui

  throttle_seconds = 0.25
  # Timestamp of the last forwarded update; captured by the lambda below
  # so each built callback keeps its own throttle state.
  previous_emit = 0.0

  lambda do |input_tokens:, output_tokens:|
    tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    # Drop updates that arrive too soon after the previous one, except
    # when no output tokens have been reported yet — those always pass.
    next if tick - previous_emit < throttle_seconds && output_tokens > 0

    previous_emit = tick
    @ui.stream_thinking_progress(input_tokens: input_tokens, output_tokens: output_tokens)
  end
end
769
+
770
+ # Collect token usage data for current iteration and return it.
771
+ # Does NOT calculate cost — cost tracking has been removed.
772
+ # @param usage [Hash] Usage data from API
773
+ # @return [Hash] token_data ready for show_token_usage
774
def collect_iteration_tokens(usage)
  # A response without a usage payload is treated as an empty one, so
  # every counter falls back to zero instead of raising on nil.
  usage ||= {}

  prompt_tokens = usage[:prompt_tokens] || 0
  completion_tokens = usage[:completion_tokens] || 0
  # Prefer the reported total; otherwise derive it from the two parts.
  total_tokens = usage[:total_tokens] || (prompt_tokens + completion_tokens)
  cache_write = usage[:cache_creation_input_tokens] || 0
  cache_read = usage[:cache_read_input_tokens] || 0

  # When the total is flagged as per-turn it already is the delta;
  # otherwise the delta is the growth since the previously recorded total.
  # An unset previous total counts as zero.
  delta_tokens =
    if usage[:total_is_per_turn]
      total_tokens
    else
      total_tokens - (@previous_total_tokens || 0)
    end
  @previous_total_tokens = total_tokens

  {
    delta_tokens: delta_tokens,
    prompt_tokens: prompt_tokens,
    completion_tokens: completion_tokens,
    total_tokens: total_tokens,
    cache_write: cache_write,
    cache_read: cache_read
  }
end
798
+ end
799
+ end
800
+ end