octo-agent 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. checksums.yaml +7 -0
  2. data/.clacky/skills/commit/SKILL.md +423 -0
  3. data/.clacky/skills/gem-release/SKILL.md +199 -0
  4. data/.clacky/skills/gem-release/scripts/release.sh +304 -0
  5. data/.clacky/skills/oss-upload/SKILL.md +47 -0
  6. data/.octorules +106 -0
  7. data/.rspec +3 -0
  8. data/.rubocop.yml +8 -0
  9. data/CHANGELOG.md +76 -0
  10. data/CODE_OF_CONDUCT.md +132 -0
  11. data/CONTRIBUTING.md +92 -0
  12. data/Dockerfile +28 -0
  13. data/LICENSE.txt +22 -0
  14. data/POSITIONING.md +46 -0
  15. data/README.md +134 -0
  16. data/README_CN.md +134 -0
  17. data/Rakefile +34 -0
  18. data/benchmark/fixtures/sample_project/Gemfile +3 -0
  19. data/benchmark/fixtures/sample_project/lib/api_handler.rb +32 -0
  20. data/benchmark/fixtures/sample_project/lib/order_calculator.rb +23 -0
  21. data/benchmark/fixtures/sample_project/lib/user_renderer.rb +20 -0
  22. data/benchmark/fixtures/sample_project/spec/order_calculator_spec.rb +20 -0
  23. data/benchmark/results/EVALUATION_REPORT.md +165 -0
  24. data/benchmark/results/baseline_20260511_174424.json +128 -0
  25. data/benchmark/results/report_20260511_175256.json +271 -0
  26. data/benchmark/results/report_20260511_175444.json +271 -0
  27. data/benchmark/results/treatment_20260511_175103.json +130 -0
  28. data/benchmark/runner.rb +441 -0
  29. data/bin/octo +7 -0
  30. data/docs/agent-first-ui-design.md +77 -0
  31. data/docs/billing-system.md +318 -0
  32. data/docs/channel-architecture.md +235 -0
  33. data/docs/engineering-article.md +343 -0
  34. data/docs/session-skill-invocation.md +69 -0
  35. data/docs/time_machine_design.md +247 -0
  36. data/docs/ui2-architecture.md +124 -0
  37. data/homebrew/README.md +96 -0
  38. data/homebrew/openocto.rb +24 -0
  39. data/lib/octo/agent/hook_manager.rb +61 -0
  40. data/lib/octo/agent/llm_caller.rb +800 -0
  41. data/lib/octo/agent/memory_updater.rb +246 -0
  42. data/lib/octo/agent/message_compressor.rb +225 -0
  43. data/lib/octo/agent/message_compressor_helper.rb +869 -0
  44. data/lib/octo/agent/next_message_suggester.rb +215 -0
  45. data/lib/octo/agent/session_serializer.rb +685 -0
  46. data/lib/octo/agent/skill_auto_creator.rb +114 -0
  47. data/lib/octo/agent/skill_evolution.rb +61 -0
  48. data/lib/octo/agent/skill_manager.rb +466 -0
  49. data/lib/octo/agent/skill_reflector.rb +89 -0
  50. data/lib/octo/agent/system_prompt_builder.rb +101 -0
  51. data/lib/octo/agent/time_machine.rb +214 -0
  52. data/lib/octo/agent/tool_executor.rb +454 -0
  53. data/lib/octo/agent/tool_registry.rb +150 -0
  54. data/lib/octo/agent.rb +2180 -0
  55. data/lib/octo/agent_config.rb +989 -0
  56. data/lib/octo/agent_profile.rb +112 -0
  57. data/lib/octo/anthropic_stream_aggregator.rb +137 -0
  58. data/lib/octo/background_task_registry.rb +324 -0
  59. data/lib/octo/banner.rb +34 -0
  60. data/lib/octo/bedrock_stream_aggregator.rb +137 -0
  61. data/lib/octo/block_font.rb +331 -0
  62. data/lib/octo/cli.rb +968 -0
  63. data/lib/octo/client.rb +623 -0
  64. data/lib/octo/default_agents/SOUL.md +3 -0
  65. data/lib/octo/default_agents/USER.md +1 -0
  66. data/lib/octo/default_agents/base_prompt.md +66 -0
  67. data/lib/octo/default_agents/coding/profile.yml +2 -0
  68. data/lib/octo/default_agents/coding/system_prompt.md +67 -0
  69. data/lib/octo/default_agents/general/profile.yml +2 -0
  70. data/lib/octo/default_agents/general/system_prompt.md +16 -0
  71. data/lib/octo/default_parsers/doc_parser.rb +69 -0
  72. data/lib/octo/default_parsers/docx_parser.rb +188 -0
  73. data/lib/octo/default_parsers/pdf_parser.rb +120 -0
  74. data/lib/octo/default_parsers/pdf_parser_ocr.py +103 -0
  75. data/lib/octo/default_parsers/pdf_parser_plumber.py +62 -0
  76. data/lib/octo/default_parsers/pptx_parser.rb +140 -0
  77. data/lib/octo/default_parsers/xlsx_parser.rb +121 -0
  78. data/lib/octo/default_skills/browser-setup/SKILL.md +426 -0
  79. data/lib/octo/default_skills/channel-manager/SKILL.md +623 -0
  80. data/lib/octo/default_skills/channel-manager/dingtalk_setup.rb +191 -0
  81. data/lib/octo/default_skills/channel-manager/discord_setup.rb +199 -0
  82. data/lib/octo/default_skills/channel-manager/feishu_setup.rb +574 -0
  83. data/lib/octo/default_skills/channel-manager/import_lark_skills.rb +97 -0
  84. data/lib/octo/default_skills/channel-manager/install_feishu_skills.rb +105 -0
  85. data/lib/octo/default_skills/channel-manager/weixin_setup.rb +274 -0
  86. data/lib/octo/default_skills/code-explorer/SKILL.md +36 -0
  87. data/lib/octo/default_skills/cron-task-creator/SKILL.md +257 -0
  88. data/lib/octo/default_skills/cron-task-creator/evals/evals.json +38 -0
  89. data/lib/octo/default_skills/onboard/SKILL.md +578 -0
  90. data/lib/octo/default_skills/onboard/scripts/import_external_skills.rb +413 -0
  91. data/lib/octo/default_skills/onboard/scripts/install_builtin_skills.rb +97 -0
  92. data/lib/octo/default_skills/persist-memory/SKILL.md +59 -0
  93. data/lib/octo/default_skills/personal-website/SKILL.md +113 -0
  94. data/lib/octo/default_skills/personal-website/publish.rb +235 -0
  95. data/lib/octo/default_skills/product-help/SKILL.md +123 -0
  96. data/lib/octo/default_skills/product-help/docs/agent-config.md +74 -0
  97. data/lib/octo/default_skills/product-help/docs/best-practices.md +49 -0
  98. data/lib/octo/default_skills/product-help/docs/browser-tool.md +53 -0
  99. data/lib/octo/default_skills/product-help/docs/built-in-skills.md +43 -0
  100. data/lib/octo/default_skills/product-help/docs/cli-reference.md +82 -0
  101. data/lib/octo/default_skills/product-help/docs/create-your-first-skill.md +47 -0
  102. data/lib/octo/default_skills/product-help/docs/faq.md +98 -0
  103. data/lib/octo/default_skills/product-help/docs/how-to-use-a-skill.md +58 -0
  104. data/lib/octo/default_skills/product-help/docs/installation.md +59 -0
  105. data/lib/octo/default_skills/product-help/docs/memory-system.md +61 -0
  106. data/lib/octo/default_skills/product-help/docs/octorules.md +62 -0
  107. data/lib/octo/default_skills/product-help/docs/session-management.md +63 -0
  108. data/lib/octo/default_skills/product-help/docs/skill-basics.md +55 -0
  109. data/lib/octo/default_skills/product-help/docs/skill-frontmatter.md +61 -0
  110. data/lib/octo/default_skills/product-help/docs/web-server.md +49 -0
  111. data/lib/octo/default_skills/product-help/docs/what-is-octo.md +37 -0
  112. data/lib/octo/default_skills/product-help/docs/windows-installation.md +36 -0
  113. data/lib/octo/default_skills/product-help/docs/writing-tips.md +53 -0
  114. data/lib/octo/default_skills/recall-memory/SKILL.md +65 -0
  115. data/lib/octo/default_skills/skill-add/SKILL.md +59 -0
  116. data/lib/octo/default_skills/skill-add/scripts/install_from_zip.rb +295 -0
  117. data/lib/octo/default_skills/skill-creator/SKILL.md +602 -0
  118. data/lib/octo/default_skills/skill-creator/agents/analyzer.md +274 -0
  119. data/lib/octo/default_skills/skill-creator/agents/comparator.md +202 -0
  120. data/lib/octo/default_skills/skill-creator/agents/grader.md +223 -0
  121. data/lib/octo/default_skills/skill-creator/eval-viewer/generate_review.py +471 -0
  122. data/lib/octo/default_skills/skill-creator/eval-viewer/viewer.html +1325 -0
  123. data/lib/octo/default_skills/skill-creator/references/schemas.md +430 -0
  124. data/lib/octo/default_skills/skill-creator/scripts/__init__.py +0 -0
  125. data/lib/octo/default_skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  126. data/lib/octo/default_skills/skill-creator/scripts/generate_report.py +326 -0
  127. data/lib/octo/default_skills/skill-creator/scripts/improve_description.py +310 -0
  128. data/lib/octo/default_skills/skill-creator/scripts/quick_validate.py +103 -0
  129. data/lib/octo/default_skills/skill-creator/scripts/run_eval.py +317 -0
  130. data/lib/octo/default_skills/skill-creator/scripts/run_loop.py +331 -0
  131. data/lib/octo/default_skills/skill-creator/scripts/utils.py +47 -0
  132. data/lib/octo/default_skills/skill-creator/scripts/validate_skill_frontmatter.rb +143 -0
  133. data/lib/octo/idle_compression_timer.rb +115 -0
  134. data/lib/octo/json_ui_controller.rb +204 -0
  135. data/lib/octo/message_format/anthropic.rb +409 -0
  136. data/lib/octo/message_format/bedrock.rb +361 -0
  137. data/lib/octo/message_format/open_ai.rb +222 -0
  138. data/lib/octo/message_history.rb +373 -0
  139. data/lib/octo/openai_stream_aggregator.rb +130 -0
  140. data/lib/octo/plain_ui_controller.rb +166 -0
  141. data/lib/octo/providers.rb +534 -0
  142. data/lib/octo/server/browser_manager.rb +397 -0
  143. data/lib/octo/server/channel/adapters/base.rb +82 -0
  144. data/lib/octo/server/channel/adapters/dingtalk/adapter.rb +314 -0
  145. data/lib/octo/server/channel/adapters/dingtalk/api_client.rb +391 -0
  146. data/lib/octo/server/channel/adapters/dingtalk/stream_client.rb +203 -0
  147. data/lib/octo/server/channel/adapters/discord/adapter.rb +229 -0
  148. data/lib/octo/server/channel/adapters/discord/api_client.rb +107 -0
  149. data/lib/octo/server/channel/adapters/discord/gateway_client.rb +270 -0
  150. data/lib/octo/server/channel/adapters/feishu/adapter.rb +320 -0
  151. data/lib/octo/server/channel/adapters/feishu/bot.rb +478 -0
  152. data/lib/octo/server/channel/adapters/feishu/file_processor.rb +36 -0
  153. data/lib/octo/server/channel/adapters/feishu/message_parser.rb +129 -0
  154. data/lib/octo/server/channel/adapters/feishu/ws_client.rb +423 -0
  155. data/lib/octo/server/channel/adapters/telegram/adapter.rb +375 -0
  156. data/lib/octo/server/channel/adapters/telegram/api_client.rb +205 -0
  157. data/lib/octo/server/channel/adapters/wecom/adapter.rb +148 -0
  158. data/lib/octo/server/channel/adapters/wecom/media_downloader.rb +115 -0
  159. data/lib/octo/server/channel/adapters/wecom/ws_client.rb +395 -0
  160. data/lib/octo/server/channel/adapters/weixin/adapter.rb +692 -0
  161. data/lib/octo/server/channel/adapters/weixin/api_client.rb +402 -0
  162. data/lib/octo/server/channel/channel_config.rb +178 -0
  163. data/lib/octo/server/channel/channel_manager.rb +468 -0
  164. data/lib/octo/server/channel/channel_ui_controller.rb +224 -0
  165. data/lib/octo/server/channel.rb +33 -0
  166. data/lib/octo/server/discover.rb +77 -0
  167. data/lib/octo/server/epipe_safe_io.rb +105 -0
  168. data/lib/octo/server/http_server.rb +3554 -0
  169. data/lib/octo/server/scheduler.rb +317 -0
  170. data/lib/octo/server/server_master.rb +325 -0
  171. data/lib/octo/server/session_registry.rb +431 -0
  172. data/lib/octo/server/web_ui_controller.rb +487 -0
  173. data/lib/octo/session_manager.rb +385 -0
  174. data/lib/octo/skill.rb +466 -0
  175. data/lib/octo/skill_loader.rb +328 -0
  176. data/lib/octo/tools/base.rb +118 -0
  177. data/lib/octo/tools/browser.rb +625 -0
  178. data/lib/octo/tools/edit.rb +165 -0
  179. data/lib/octo/tools/file_reader.rb +549 -0
  180. data/lib/octo/tools/glob.rb +162 -0
  181. data/lib/octo/tools/grep.rb +356 -0
  182. data/lib/octo/tools/invoke_skill.rb +96 -0
  183. data/lib/octo/tools/list_tasks.rb +54 -0
  184. data/lib/octo/tools/redo_task.rb +41 -0
  185. data/lib/octo/tools/request_user_feedback.rb +84 -0
  186. data/lib/octo/tools/security.rb +333 -0
  187. data/lib/octo/tools/terminal/output_cleaner.rb +63 -0
  188. data/lib/octo/tools/terminal/persistent_session.rb +268 -0
  189. data/lib/octo/tools/terminal/safe_rm.sh +106 -0
  190. data/lib/octo/tools/terminal/session_manager.rb +213 -0
  191. data/lib/octo/tools/terminal.rb +1828 -0
  192. data/lib/octo/tools/todo_manager.rb +374 -0
  193. data/lib/octo/tools/trash_manager.rb +388 -0
  194. data/lib/octo/tools/undo_task.rb +35 -0
  195. data/lib/octo/tools/web_fetch.rb +242 -0
  196. data/lib/octo/tools/web_search.rb +260 -0
  197. data/lib/octo/tools/write.rb +77 -0
  198. data/lib/octo/ui2/block_font.rb +10 -0
  199. data/lib/octo/ui2/components/base_component.rb +163 -0
  200. data/lib/octo/ui2/components/command_suggestions.rb +290 -0
  201. data/lib/octo/ui2/components/common_component.rb +96 -0
  202. data/lib/octo/ui2/components/inline_input.rb +226 -0
  203. data/lib/octo/ui2/components/input_area.rb +1338 -0
  204. data/lib/octo/ui2/components/message_component.rb +99 -0
  205. data/lib/octo/ui2/components/modal_component.rb +419 -0
  206. data/lib/octo/ui2/components/todo_area.rb +149 -0
  207. data/lib/octo/ui2/components/tool_component.rb +107 -0
  208. data/lib/octo/ui2/components/welcome_banner.rb +139 -0
  209. data/lib/octo/ui2/layout_manager.rb +807 -0
  210. data/lib/octo/ui2/line_editor.rb +363 -0
  211. data/lib/octo/ui2/markdown_renderer.rb +100 -0
  212. data/lib/octo/ui2/output_buffer.rb +370 -0
  213. data/lib/octo/ui2/progress_handle.rb +362 -0
  214. data/lib/octo/ui2/progress_indicator.rb +55 -0
  215. data/lib/octo/ui2/screen_buffer.rb +273 -0
  216. data/lib/octo/ui2/terminal_detector.rb +119 -0
  217. data/lib/octo/ui2/theme_manager.rb +85 -0
  218. data/lib/octo/ui2/themes/base_theme.rb +105 -0
  219. data/lib/octo/ui2/themes/hacker_theme.rb +62 -0
  220. data/lib/octo/ui2/themes/minimal_theme.rb +56 -0
  221. data/lib/octo/ui2/thinking_verbs.rb +26 -0
  222. data/lib/octo/ui2/ui_controller.rb +1625 -0
  223. data/lib/octo/ui2/view_renderer.rb +177 -0
  224. data/lib/octo/ui2.rb +40 -0
  225. data/lib/octo/ui_interface.rb +154 -0
  226. data/lib/octo/utils/arguments_parser.rb +191 -0
  227. data/lib/octo/utils/browser_detector.rb +195 -0
  228. data/lib/octo/utils/encoding.rb +92 -0
  229. data/lib/octo/utils/environment_detector.rb +140 -0
  230. data/lib/octo/utils/file_ignore_helper.rb +170 -0
  231. data/lib/octo/utils/file_processor.rb +601 -0
  232. data/lib/octo/utils/gitignore_parser.rb +154 -0
  233. data/lib/octo/utils/limit_stack.rb +152 -0
  234. data/lib/octo/utils/logger.rb +124 -0
  235. data/lib/octo/utils/login_shell.rb +72 -0
  236. data/lib/octo/utils/model_pricing.rb +646 -0
  237. data/lib/octo/utils/parser_manager.rb +165 -0
  238. data/lib/octo/utils/path_helper.rb +15 -0
  239. data/lib/octo/utils/scripts_manager.rb +59 -0
  240. data/lib/octo/utils/string_matcher.rb +158 -0
  241. data/lib/octo/utils/trash_directory.rb +112 -0
  242. data/lib/octo/utils/workspace_rules.rb +46 -0
  243. data/lib/octo/version.rb +5 -0
  244. data/lib/octo/web/app.css +7141 -0
  245. data/lib/octo/web/app.js +543 -0
  246. data/lib/octo/web/apple-touch-icon.png +0 -0
  247. data/lib/octo/web/auth.js +150 -0
  248. data/lib/octo/web/channels.js +276 -0
  249. data/lib/octo/web/datepicker.js +205 -0
  250. data/lib/octo/web/favicon.png +0 -0
  251. data/lib/octo/web/i18n.js +1073 -0
  252. data/lib/octo/web/icon-512.png +0 -0
  253. data/lib/octo/web/icon-dark.svg +25 -0
  254. data/lib/octo/web/icon.svg +29 -0
  255. data/lib/octo/web/index.html +871 -0
  256. data/lib/octo/web/marked.min.js +69 -0
  257. data/lib/octo/web/onboard.js +491 -0
  258. data/lib/octo/web/profile.js +442 -0
  259. data/lib/octo/web/sessions.js +4421 -0
  260. data/lib/octo/web/settings.js +913 -0
  261. data/lib/octo/web/sidebar.js +32 -0
  262. data/lib/octo/web/skills.js +885 -0
  263. data/lib/octo/web/tasks.js +297 -0
  264. data/lib/octo/web/theme.js +105 -0
  265. data/lib/octo/web/trash.js +343 -0
  266. data/lib/octo/web/vendor/hljs/highlight.min.js +1244 -0
  267. data/lib/octo/web/vendor/hljs/hljs-theme.css +95 -0
  268. data/lib/octo/web/vendor/katex/auto-render.min.js +1 -0
  269. data/lib/octo/web/vendor/katex/fonts/KaTeX_AMS-Regular.woff2 +0 -0
  270. data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Bold.woff2 +0 -0
  271. data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Regular.woff2 +0 -0
  272. data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Bold.woff2 +0 -0
  273. data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Regular.woff2 +0 -0
  274. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Bold.woff2 +0 -0
  275. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-BoldItalic.woff2 +0 -0
  276. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Italic.woff2 +0 -0
  277. data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Regular.woff2 +0 -0
  278. data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-BoldItalic.woff2 +0 -0
  279. data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-Italic.woff2 +0 -0
  280. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Bold.woff2 +0 -0
  281. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Italic.woff2 +0 -0
  282. data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Regular.woff2 +0 -0
  283. data/lib/octo/web/vendor/katex/fonts/KaTeX_Script-Regular.woff2 +0 -0
  284. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size1-Regular.woff2 +0 -0
  285. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size2-Regular.woff2 +0 -0
  286. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size3-Regular.woff2 +0 -0
  287. data/lib/octo/web/vendor/katex/fonts/KaTeX_Size4-Regular.woff2 +0 -0
  288. data/lib/octo/web/vendor/katex/fonts/KaTeX_Typewriter-Regular.woff2 +0 -0
  289. data/lib/octo/web/vendor/katex/katex.min.css +1 -0
  290. data/lib/octo/web/vendor/katex/katex.min.js +1 -0
  291. data/lib/octo/web/version.js +449 -0
  292. data/lib/octo/web/weixin-qr.html +209 -0
  293. data/lib/octo/web/ws-dispatcher.js +357 -0
  294. data/lib/octo/web/ws.js +128 -0
  295. data/lib/octo.rb +145 -0
  296. data/scripts/build/build.sh +329 -0
  297. data/scripts/build/lib/apt.sh +56 -0
  298. data/scripts/build/lib/brew.sh +89 -0
  299. data/scripts/build/lib/colors.sh +17 -0
  300. data/scripts/build/lib/gem.sh +95 -0
  301. data/scripts/build/lib/mise.sh +125 -0
  302. data/scripts/build/lib/network.sh +157 -0
  303. data/scripts/build/lib/os.sh +57 -0
  304. data/scripts/build/lib/shell.sh +37 -0
  305. data/scripts/build/src/install.sh.cc +174 -0
  306. data/scripts/build/src/install_browser.sh.cc +101 -0
  307. data/scripts/build/src/install_full.sh.cc +290 -0
  308. data/scripts/build/src/install_rails_deps.sh.cc +145 -0
  309. data/scripts/build/src/install_system_deps.sh.cc +123 -0
  310. data/scripts/build/src/uninstall.sh.cc +101 -0
  311. data/scripts/install.ps1 +532 -0
  312. data/scripts/install.sh +567 -0
  313. data/scripts/install_browser.sh +479 -0
  314. data/scripts/install_full.sh +838 -0
  315. data/scripts/install_rails_deps.sh +746 -0
  316. data/scripts/install_system_deps.sh +518 -0
  317. data/scripts/uninstall.sh +287 -0
  318. data/sig/octo.rbs +4 -0
  319. metadata +614 -0
@@ -0,0 +1,66 @@
1
+ ## General Behavior
2
+
3
+ - Ask clarifying questions if requirements are unclear.
4
+ - Break down complex tasks into manageable steps.
5
+ - **USE TOOLS to create/modify files** — don't just return content.
6
+ - When the user asks to send/download a file or you generate one for them, append `[filename](file://~/path/to/file)` at the end of your reply.
7
+
8
+ ## Tool Usage Rules
9
+
10
+ - **ALWAYS use `glob` tool to find files — NEVER use shell `find` command for file discovery**
11
+ - **All operations default to the working directory** (shown in session context)
12
+
13
+ ## Response Style
14
+
15
+ - Keep responses short and concise. One sentence per update is almost always enough.
16
+ - Do not use a colon before tool calls (e.g., "Let me read the file:" → "Let me read the file.")
17
+ - Don't narrate your internal deliberation. User-facing text should be relevant communication, not a running commentary.
18
+ - Don't summarize what you just did at the end of every response. The user can read the diff.
19
+ - Only use emojis if the user explicitly requests it. Avoid emojis in all communication unless asked.
20
+
21
+ ## Task Tracking
22
+
23
+ Use `todo_manager` to plan and track work on complex tasks (3+ steps).
24
+ - Exactly ONE task must be `in_progress` at any time.
25
+ - Mark tasks complete IMMEDIATELY after finishing — don't batch completions.
26
+ - Complete current tasks before starting new ones.
27
+
28
+ Adding todos is NOT completion — it's just the planning phase. After creating the TODO list, START EXECUTING each task immediately. NEVER stop after just adding todos without executing them!
29
+
30
+ ## Terminal Commands
31
+
32
+ **Two modes only:**
33
+
34
+ - **Sync (default)** — `terminal(command: "...")`. Quick commands return immediately with `{exit_code, output}`. Slow build/test/install commands are auto-routed to async by the harness — you'll get a handle back without thinking about it. If the command hits an interactive prompt, you also get a handle so you can answer it.
35
+
36
+ - **Async** — `terminal(command: "...", async: true)`. Returns a handle immediately. Use for any long task you intend to leave running (build, deploy, dev server, REPL, watcher, side quest). One flag for all of them — no separate "background" vs "fire-and-forget".
37
+
38
+ **Five operations on a handle** (the `handle_id` returned from any async call, or from a sync call that was auto-routed to async or stopped at an interactive prompt):
39
+
40
+ - `Read(output_file)` — read the task's full stdout, both during run and after exit. The `<output-file>` tag is included in every handle response AND in every `<task-notification>`. Notifications don't inline output — they ship a `<summary>` (often the last useful line) plus the path. If summary is enough, skip the Read. Raw PTY log (may contain ANSI escapes).
41
+ - `terminal(handle_id: "<id>")` — query current status (running/completed/cancelled/exited + elapsed time + exit code).
42
+ - `terminal(handle_id: "<id>", input: "y\n")` — send input to the underlying PTY (answer a prompt, drive a REPL).
43
+ - `terminal(handle_id: "<id>", kill: true)` — terminate the underlying process.
44
+ - **Wait for `<task-notification>`** — when the task exits, the harness pushes a notification into your context with the same `handle_id`. You don't need to poll.
45
+
46
+ **Examples:**
47
+ ✅ `terminal(command: "npm run build")` — harness recognises this is slow → async automatically → you get a handle, do other work, notification fires on completion.
48
+ ✅ `terminal(command: "rails s", async: true)` — dev server, you'll kill it later. Same async path; the handle gives you `terminal(handle_id:, kill: true)`.
49
+ ✅ `terminal(command: "deploy-staging.sh", async: true)` — long task you want to fire off and continue with other work.
50
+ ✅ `terminal(command: "apt install foo")` → hits `[Y/n]` prompt → returns handle with `state: "waiting"` → `terminal(handle_id:, input: "y\n")` to answer.
51
+ ❌ Polling `terminal(handle_id:)` in a tight loop while waiting — wait for the notification, or `Read(output_file)` once to peek.
52
+
53
+ **When an async task is started, do NOT poll it.** Do not query its status in a tight loop, and do not start another instance of the same command. The harness will push a `<task-notification>` when the task exits — that is your cue to resume.
54
+
55
+ Whether to continue with other work while waiting depends on dependency:
56
+ - If your next step **requires** the task's result (e.g., you need test output to decide the next fix), STOP and wait for the notification.
57
+ - If your next step is **independent** (e.g., modify unrelated files, review another module, draft the next change, ask the user a clarifying question), you MAY continue. Treat the running task as background — it does not block unrelated work.
58
+
59
+ **When multiple async tasks are running concurrently, proactively keep the user informed.** Before starting unrelated new work that the user did not explicitly request, send a one-line status: "I have N tasks running (build, tests, …); doing X next while they finish."
60
+
61
+ ## Long-term Memory
62
+
63
+ Topical knowledge lives in `~/.octo/memories/`.
64
+
65
+ - **Recall** with `invoke_skill("recall-memory", "<topic>")` when the user expects you to already know something — they reference prior context as shared knowledge, mention an unfamiliar name/path/decision, or ask you to recall.
66
+ - **Persist** when the user asks you to remember or note something: `invoke_skill("persist-memory", "<what to remember>")` immediately.
@@ -0,0 +1,2 @@
1
+ name: coding
2
+ description: AI coding assistant and technical co-founder
@@ -0,0 +1,67 @@
1
+ You are an AI coding assistant and technical co-founder, designed to help non-technical
2
+ users complete software development projects. You are responsible for development in the current project.
3
+
4
+ Your role is to:
5
+ - Understand project requirements and translate them into technical solutions
6
+ - Write clean, maintainable code
7
+ - Follow best practices and industry standards
8
+ - Explain technical concepts in simple terms when needed
9
+ - Proactively identify potential issues and suggest improvements
10
+ - Help with debugging, testing, and deployment
11
+
12
+ Working process:
13
+ 1. Always read existing code before making changes (use file_reader/glob/grep or invoke code-explorer skill)
14
+ 2. Write code that is secure, efficient, and easy to understand
15
+ 3. You should frequently refer to the existing codebase. For unclear instructions,
16
+ prioritize understanding the codebase first before answering or taking action.
17
+ Always read relevant code files to understand the project structure, patterns, and conventions.
18
+
19
+ ## Code Style
20
+
21
+ - **Default to writing no comments.** Only add one when the WHY is non-obvious: a hidden constraint, a subtle invariant, a workaround for a specific bug, or behavior that would surprise a reader.
22
+ - Don't explain WHAT the code does — well-named identifiers already do that.
23
+ - Don't reference the current task, fix, or callers ("used by X", "added for the Y flow", "handles the case from issue #123"). These belong in the PR description and rot as the codebase evolves.
24
+ - Never write multi-paragraph docstrings or multi-line comment blocks — one short line max.
25
+
26
+ ## File Modification Rules
27
+
28
+ - **ALWAYS prefer `edit` over `write`.** Use `write` only for creating entirely new files or complete rewrites.
29
+ - When editing text from `file_reader` output, preserve the exact indentation (tabs/spaces) as it appears AFTER the line number prefix.
30
+ - Ensure `old_string` is unique in the file. If not, provide a larger string with more surrounding context to make it unique.
31
+ - Use `replace_all` only when you genuinely need to change every occurrence.
32
+ - When referencing specific functions or pieces of code, include `file_path:line_number` to help the user navigate.
33
+
34
+ ## Git Safety Protocol
35
+
36
+ - NEVER update git config (user.name, user.email, etc.)
37
+ - NEVER run destructive commands: `git push --force`, `git reset --hard`, `git checkout .`, `git clean -f`
38
+ - NEVER skip hooks or commit signing (`--no-verify`, `--no-gpg-sign`)
39
+ - When staging files, prefer `git add <specific-file>` over `git add -A` or `git add .`
40
+ - Always create NEW commits rather than amending existing ones
41
+ - Never amend published commits
42
+ - Only create commits when requested by the user. If unclear, ask first.
43
+
44
+ ## Error Handling
45
+
46
+ - Don't add error handling, fallbacks, or validation for scenarios that can't happen. Trust internal code and framework guarantees.
47
+ - Only validate at system boundaries (user input, external APIs).
48
+ - Don't use feature flags or backwards-compatibility shims when you can just change the code.
49
+
50
+ ## Security
51
+
52
+ - Be careful not to introduce security vulnerabilities such as command injection, XSS, SQL injection, and other OWASP top 10 vulnerabilities.
53
+ - If you notice insecure code, immediately fix it.
54
+ - Prioritize writing safe, secure, and correct code.
55
+
56
+ ## Testing
57
+
58
+ - For UI or frontend changes, start the dev server and verify in a browser before reporting the task as complete.
59
+ - Type checking and test suites verify code correctness, not feature correctness — if you can't test the UI, say so explicitly rather than claiming success.
60
+ - When the user asks you to run tests, do so and report the results.
61
+
62
+ ## Code Quality
63
+
64
+ - Don't add features, refactor, or introduce abstractions beyond what the task requires.
65
+ - A bug fix doesn't need surrounding cleanup; a one-shot operation doesn't need a helper.
66
+ - Three similar lines is better than a premature abstraction.
67
+ - No half-finished implementations either.
@@ -0,0 +1,2 @@
1
+ name: general
2
+ description: A versatile digital employee living on your computer
@@ -0,0 +1,16 @@
1
+ You are a versatile digital employee living on the user's computer,
2
+ capable of handling a wide range of tasks autonomously.
3
+
4
+ Your role is to:
5
+ - Execute tasks autonomously with minimal interruption
6
+ - Manage files, run commands, and interact with the system on behalf of the user
7
+ - Research, summarize, and synthesize information from the web
8
+ - Handle scheduling and automated workflows
9
+ - Communicate clearly and concisely about what you did and what you found
10
+
11
+ Working style:
12
+ - Proactive: if you see a better way to do something, suggest it
13
+ - Efficient: complete tasks with the fewest steps necessary
14
+ - Reliable: always confirm task completion with a clear summary
15
+ - When a task is ambiguous, ask ONE clarifying question before starting
16
+ - Prefer action over planning for simple tasks
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+ #
4
+ # Octo DOC Parser — CLI interface
5
+ #
6
+ # Usage:
7
+ # ruby doc_parser.rb <file_path>
8
+ #
9
+ # Output:
10
+ # stdout — extracted text content (UTF-8)
11
+ # stderr — error messages
12
+ # exit 0 — success
13
+ # exit 1 — failure
14
+ #
15
+ # This file lives in ~/.octo/parsers/ and can be modified by the LLM
16
+ # to add new capabilities (e.g. LibreOffice conversion).
17
+ #
18
+ # VERSION: 1
19
+
20
+ require "open3"
21
+
22
+ MIN_CONTENT_BYTES = 20 # extractor output smaller than this many bytes (after strip) is treated as a failed extraction
23
+
24
+ # Convert a .doc file to plain text with the macOS `textutil` CLI. Returns the stripped text, or nil when textutil is missing, exits unsuccessfully, or yields fewer than MIN_CONTENT_BYTES bytes.
25
+ def try_textutil(path)
26
+ stdout, _stderr, status = Open3.capture3("textutil", "-convert", "txt", "-stdout", path) # stderr is captured but deliberately unused
27
+ return nil unless status.success?
28
+ text = stdout.strip
29
+ return nil if text.bytesize < MIN_CONTENT_BYTES # too little content: report failure so the caller can try another extractor
30
+ text
31
+ rescue Errno::ENOENT
32
+ nil # the textutil executable was not found (e.g. non-macOS host)
33
+ end
34
+
35
+ # Extract plain text from a .doc file with the `antiword` CLI (Linux/WSL). Returns the stripped text, or nil when antiword is missing, exits unsuccessfully, or yields fewer than MIN_CONTENT_BYTES bytes.
36
+ def try_antiword(path)
37
+ stdout, _stderr, status = Open3.capture3("antiword", path) # stderr is captured but deliberately unused
38
+ return nil unless status.success?
39
+ text = stdout.strip
40
+ return nil if text.bytesize < MIN_CONTENT_BYTES # too little content: report failure to the caller
41
+ text
42
+ rescue Errno::ENOENT
43
+ nil # the antiword executable was not found (not installed)
44
+ end
45
+
46
+ # --- main: validate the path argument, try each extractor in order, print the first non-nil result ---
47
+
48
+ path = ARGV[0] # only the first CLI argument is read
49
+
50
+ if path.nil? || path.empty?
51
+ warn "Usage: ruby doc_parser.rb <file_path>"
52
+ exit 1
53
+ end
54
+
55
+ unless File.exist?(path)
56
+ warn "File not found: #{path}"
57
+ exit 1
58
+ end
59
+
60
+ text = try_textutil(path) || try_antiword(path) # antiword runs only when textutil returned nil
61
+
62
+ if text
63
+ print text # print, not puts: no trailing newline is appended to the extracted text
64
+ exit 0
65
+ else
66
+ warn "Could not extract text from .doc file."
67
+ warn "Tip: on macOS textutil should work. On Linux/WSL try: apt install antiword"
68
+ exit 1
69
+ end
@@ -0,0 +1,188 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+ # encoding: utf-8
4
+
5
+ Encoding.default_external = Encoding::UTF_8
6
+ Encoding.default_internal = Encoding::UTF_8
7
+
8
+ #
9
+ # Octo DOCX Parser — CLI interface
10
+ #
11
+ # Usage:
12
+ # ruby docx_parser.rb <file_path>
13
+ #
14
+ # Output:
15
+ # stdout — extracted text in Markdown (UTF-8)
16
+ # stderr — error messages
17
+ # exit 0 — success
18
+ # exit 1 — failure
19
+ #
20
+ # Dependencies: rubyzip gem (gem install rubyzip)
21
+ #
22
+ # This file lives in ~/.octo/parsers/ and can be modified by the LLM.
23
+ #
24
+ # VERSION: 1
25
+
26
+ require "zip"
27
+ require "rexml/document"
28
+ require "stringio"
29
+
30
# Return +str+ as a valid UTF-8 string without mutating the argument.
#
# The bytes are first reinterpreted as UTF-8 (lossless when the content
# already is valid UTF-8). If that fails validation, only the invalid byte
# sequences are removed with String#scrub; every valid multi-byte
# character is kept.
#
# The previous fallback transcoded from "binary" with `undef: :replace`,
# which discards every byte >= 0x80. A single bad byte therefore wiped all
# non-ASCII text (CJK, accented letters, ...) from the entry.
def safe_utf8(str)
  utf8 = str.dup.force_encoding(Encoding::UTF_8)
  return utf8 if utf8.valid_encoding?

  utf8.scrub("")
end
37
+
38
# Read one member of an in-memory ZIP archive.
#
# body - String holding the raw bytes of the archive.
# name - path of the member inside the archive (e.g. "word/document.xml").
#
# Returns the member's content passed through safe_utf8, or nil when the
# archive has no member with that name.
def read_zip_entry(body, name)
  contents = nil
  buffer = StringIO.new(body)
  Zip::File.open_buffer(buffer) do |archive|
    member = archive.find_entry(name)
    next if member.nil?

    contents = safe_utf8(member.get_input_stream.read)
  end
  contents
end
46
+
47
# Fetch the main document part ("word/document.xml") from the archive.
#
# Returns the XML as a String. Raises RuntimeError when the part is
# missing from the archive.
def read_document_xml(body)
  read_zip_entry(body, "word/document.xml") ||
    raise("Could not extract content — possibly encrypted or invalid format")
end
52
+
53
# Build a lookup of list-numbering definitions from "word/numbering.xml".
#
# Returns a Hash of the form
#   { abstractNumId (String) => { ilvl (Integer) => { fmt: String } } }
# where fmt is the level's w:numFmt value, defaulting to "bullet" when the
# level has none. Best effort: a missing part or any StandardError while
# parsing yields an empty Hash.
def read_numbering(body)
  xml = read_zip_entry(body, "word/numbering.xml")
  return {} unless xml

  doc = REXML::Document.new(xml)
  REXML::XPath.match(doc, "//w:abstractNum").each_with_object({}) do |abstract_num, table|
    level_formats = {}
    REXML::XPath.match(abstract_num, "w:lvl").each do |level|
      format_node = REXML::XPath.first(level, "w:numFmt")
      format = format_node ? format_node.attributes["w:val"] : nil
      level_formats[level.attributes["w:ilvl"].to_i] = { fmt: format || "bullet" }
    end
    table[abstract_num.attributes["w:abstractNumId"]] = level_formats
  end
rescue
  {}
end
72
+
73
# Map heading style ids to their heading level using "word/styles.xml".
#
# A style counts as a heading when its w:name value starts (case
# insensitively) with "heading " followed by a digit; that digit is the
# level.
#
# Returns { styleId (String) => { heading: Integer } }. Best effort: a
# missing part or any StandardError while parsing yields an empty Hash.
def read_styles(body)
  xml = read_zip_entry(body, "word/styles.xml")
  return {} unless xml

  headings = {}
  REXML::XPath.match(REXML::Document.new(xml), "//w:style").each do |style_node|
    name_node = REXML::XPath.first(style_node, "w:name")
    display_name = name_node ? name_node.attributes["w:val"].to_s : ""
    matched = /^heading (\d)/i.match(display_name)
    next unless matched

    headings[style_node.attributes["w:styleId"]] = { heading: matched[1].to_i }
  end
  headings
rescue
  {}
end
89
+
90
# True when the run's properties switch bold on.
#
# w:b is an on/off toggle: a bare <w:b/> (or w:val "true"/"1"/"on") means
# bold, while w:val "false"/"0"/"off" explicitly disables it, typically
# to override a bold paragraph or character style.
def bold_run?(run)
  props = REXML::XPath.first(run, "w:rPr")
  return false unless props

  toggle = REXML::XPath.first(props, "w:b")
  return false unless toggle

  !%w[false 0 off].include?(toggle.attributes["w:val"].to_s.downcase)
end

# Concatenate the text of a paragraph's direct child runs (w:r).
#
# Runs with no text are skipped; bold runs are wrapped in Markdown
# `**...**`. Returns the joined String ("" when there is no text).
def extract_runs(para_node)
  parts = []
  REXML::XPath.each(para_node, "w:r") do |run|
    text = REXML::XPath.match(run, "w:t").map(&:text).compact.join
    next if text.empty?

    parts << (bold_run?(run) ? "**#{text}**" : text)
  end
  parts.join
end
101
+
102
# Render one w:p element as a line of Markdown.
#
# node      - the paragraph element.
# styles    - Hash of styleId => { heading: level }, as built by read_styles.
# numbering - accepted for interface compatibility; not consulted here.
#
# Returns nil for paragraphs with no visible text, "#"-prefixed text for
# heading styles, an indented "- " item for paragraphs carrying w:numPr,
# and the plain run text otherwise.
def parse_paragraph(node, styles, numbering)
  style_id = nil
  list_props = nil
  props = REXML::XPath.first(node, "w:pPr")
  if props
    style_node = REXML::XPath.first(props, "w:pStyle")
    style_id = style_node.attributes["w:val"] if style_node
    list_props = REXML::XPath.first(props, "w:numPr")
  end

  content = extract_runs(node)
  return nil if content.strip.empty?

  # A heading style takes precedence over list formatting.
  heading = style_id && styles[style_id]
  return "#{"#" * heading[:heading]} #{content}" if heading

  return content unless list_props

  level_node = REXML::XPath.first(list_props, "w:ilvl")
  depth = level_node ? level_node.attributes["w:val"].to_i : 0
  "#{" " * depth}- #{content}"
end
123
+
124
# Render a w:tbl element as a Markdown pipe table.
#
# Each w:tr becomes one row; a cell's text is all of its descendant w:t
# nodes joined by single spaces. The first row is treated as the header.
# Short rows are padded with empty cells to the widest row.
#
# Literal "|" characters in cell text are escaped as "\|", because an
# unescaped pipe would be read as a column separator and shift every
# following cell.
#
# Returns the table as a String, or "" when the table has no rows or no
# cells at all (which previously produced a malformed "|  |" / "|" table).
def parse_table(tbl_node)
  rows = REXML::XPath.match(tbl_node, "w:tr").map do |tr|
    REXML::XPath.match(tr, "w:tc").map do |tc|
      cell = REXML::XPath.match(tc, ".//w:t").map(&:text).compact.join(" ").strip
      # Block form avoids gsub's backslash handling in replacement strings.
      cell.gsub("|") { "\\|" }
    end
  end
  return "" if rows.empty?

  col_count = rows.map(&:size).max
  return "" if col_count.zero?

  lines = []
  rows.each_with_index do |row, index|
    padded = row + [""] * (col_count - row.size)
    lines << "| #{padded.join(" | ")} |"
    # Markdown requires a delimiter row right after the header row.
    lines << "|#{" --- |" * col_count}" if index.zero?
  end
  lines.join("\n")
end
143
+
144
+ # --- main ---
145
+
146
# Entry point: validate the CLI argument, then convert the DOCX body to
# Markdown and print it on stdout.
path = ARGV[0]

# ARGV[0] is nil when no argument was given; nil.to_s is "".
if path.to_s.empty?
  warn "Usage: ruby docx_parser.rb <file_path>"
  exit 1
end

if !File.exist?(path)
  warn "File not found: #{path}"
  exit 1
end

begin
  body = File.binread(path)
  doc = REXML::Document.new(read_document_xml(body))
  numbering = read_numbering(body)
  styles = read_styles(body)

  # Walk the direct children of w:body in document order; only paragraphs
  # and tables are rendered, everything else is ignored.
  rendered = []
  REXML::XPath.match(doc, "//w:body/*").each do |child|
    if child.name == "p"
      paragraph = parse_paragraph(child, styles, numbering)
      rendered << paragraph if paragraph
    elsif child.name == "tbl"
      rendered << parse_table(child)
    end
  end

  markdown = rendered.join("\n").strip
  if markdown.empty?
    warn "Document appears to be empty"
    exit 1
  end

  print markdown
  exit 0
rescue => e
  # `exit` raises SystemExit, which is not a StandardError, so the exits
  # above are not intercepted here.
  warn "Failed to parse DOCX: #{e.message}"
  warn "Tip: ensure rubyzip is installed: gem install rubyzip"
  exit 1
end
@@ -0,0 +1,120 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+ #
4
+ # Octo PDF Parser — CLI interface
5
+ #
6
+ # Usage:
7
+ # ruby pdf_parser.rb <file_path>
8
+ #
9
+ # Output:
10
+ # stdout — extracted text content (UTF-8)
11
+ # stderr — error messages
12
+ # exit 0 — success
13
+ # exit 1 — failure
14
+ #
15
+ # This file lives in ~/.octo/parsers/ and can be modified by the LLM.
16
+ #
17
+ # Extraction pipeline (first successful step wins):
18
+ # 1. pdftotext (poppler) — fastest, text-based PDFs
19
+ # 2. pdfplumber (Python) — handles more layouts
20
+ # (→ pdf_parser_plumber.py)
21
+ # 3. OCR (tesseract) — scanned / image-only PDFs
22
+ # (→ pdf_parser_ocr.py)
23
+ #
24
+ # Each extractor is a plain, self-contained function. Python-backed steps
25
+ # shell out to a sibling .py script so the LLM can edit them directly
26
+ # (with proper syntax highlighting, linters, and per-file run/debug)
27
+ # instead of wrestling with embedded heredocs.
28
+ #
29
+ # VERSION: 3
30
+
31
+ require "open3"
32
+
33
+ # Minimum useful output (in bytes). Below this, a step is considered a
34
+ # miss and the next fallback is tried.
35
+ MIN_CONTENT_BYTES = 20
36
+
37
+ # Script directory — resolve sibling .py helpers relative to this file
38
+ # so it works both from the gem's default_parsers/ dir and from the
39
+ # copied-to-user ~/.octo/parsers/ dir.
40
+ SCRIPT_DIR = File.dirname(File.expand_path(__FILE__))
41
+
42
# Extract text with poppler's `pdftotext` (layout-preserving, UTF-8).
#
# Returns the stripped text, or nil when pdftotext exits non-zero, when
# the output is shorter than MIN_CONTENT_BYTES bytes, or when the binary
# is not installed (Errno::ENOENT).
def try_pdftotext(path)
  output, _errors, result = Open3.capture3("pdftotext", "-layout", "-enc", "UTF-8", path, "-")
  if result.success?
    content = output.strip
    content.bytesize >= MIN_CONTENT_BYTES ? content : nil
  end
rescue Errno::ENOENT
  nil # pdftotext not installed
end
51
+
52
# Extract text by running the sibling pdf_parser_plumber.py helper.
#
# Returns the stripped text, or nil when the helper script is absent, when
# it exits non-zero, when the output is shorter than MIN_CONTENT_BYTES
# bytes, or when python3 cannot be spawned (Errno::ENOENT).
def try_pdfplumber(path)
  helper = File.join(SCRIPT_DIR, "pdf_parser_plumber.py")
  return nil unless File.exist?(helper)

  output, _errors, result = Open3.capture3("python3", helper, path)
  return nil unless result.success?

  content = output.strip
  content.bytesize < MIN_CONTENT_BYTES ? nil : content
rescue Errno::ENOENT
  nil # python3 not available
end
64
+
65
# OCR fallback for scanned/image-only PDFs; the extraction itself lives
# in the sibling pdf_parser_ocr.py script.
#
# Installation hints (also printed on final failure):
#   macOS: brew install tesseract tesseract-lang poppler
#          pip3 install pytesseract pdf2image
#   Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
#          pip3 install pytesseract pdf2image
#
# Returns the stripped OCR text, or nil when tesseract is unusable, the
# helper script is absent, the helper fails (its stderr is forwarded via
# warn), or the output is shorter than MIN_CONTENT_BYTES bytes.
def try_ocr(path)
  # Cheap capability probe so python is not spawned when tesseract is missing.
  _out, _err, probe = Open3.capture3("tesseract", "--version")
  return nil unless probe.success?

  helper = File.join(SCRIPT_DIR, "pdf_parser_ocr.py")
  return nil unless File.exist?(helper)

  output, errors, result = Open3.capture3("python3", helper, path)
  if result.success?
    content = output.strip
    return content.bytesize >= MIN_CONTENT_BYTES ? content : nil
  end

  # Surface the helper's own diagnostics before reporting a miss.
  diagnostics = errors.strip
  warn diagnostics unless diagnostics.empty?
  nil
rescue Errno::ENOENT
  nil # tesseract or python3 not available
end
92
+
93
+ # --- main ---
94
+
95
# Entry point: validate the CLI argument, then run the extraction
# pipeline (pdftotext, then pdfplumber, then OCR).
path = ARGV[0]

# ARGV[0] is nil when no argument was given; nil.to_s is "".
if path.to_s.empty?
  warn "Usage: ruby pdf_parser.rb <file_path>"
  exit 1
end

if !File.exist?(path)
  warn "File not found: #{path}"
  exit 1
end

# `||` short-circuits, so later (slower) extractors only run when the
# earlier ones returned nil.
extracted = try_pdftotext(path) || try_pdfplumber(path) || try_ocr(path)

unless extracted
  warn "Could not extract text from PDF."
  warn "For text-based PDFs, install poppler: brew install poppler (macOS) / apt install poppler-utils (Linux)"
  warn "For scanned PDFs (OCR):"
  warn " macOS: brew install tesseract tesseract-lang poppler && pip3 install pytesseract pdf2image"
  warn " Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils && pip3 install pytesseract pdf2image"
  exit 1
end

print extracted
exit 0
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ pdf_parser_ocr.py — extract text from a scanned/image-only PDF using OCR.
4
+
5
+ Usage:
6
+ python3 pdf_parser_ocr.py <file_path>
7
+
8
+ Output:
9
+ stdout — extracted text, one block per page, separated by blank lines
10
+ stderr — error messages
11
+ exit 0 — success (text was extracted)
12
+ exit 1 — failure / no text found
13
+ exit 2 — dependency missing (pytesseract or pdf2image)
14
+ exit 3 — pdf2image couldn't rasterise the PDF (usually missing poppler)
15
+
16
+ Called from pdf_parser.rb as the third-tier fallback (after pdftotext and
17
+ pdfplumber). This script is copied into ~/.octo/parsers/ and can be
18
+ edited freely by the LLM — common tweaks:
19
+ - Change DPI (higher = better accuracy, slower + more memory)
20
+ - Change OCR_LANG to match your document (e.g. "jpn+eng")
21
+ - Add image preprocessing (deskew, contrast, threshold) before OCR
22
+ - Adjust MAX_PAGES for very large scans
23
+
24
+ Environment variable overrides:
25
+ OCTO_OCR_LANG — override OCR_LANG (e.g. "eng", "jpn+eng")
26
+ OCTO_OCR_MAX_PAGES — override MAX_PAGES
27
+ OCTO_OCR_DPI — override DPI
28
+
29
+ Install:
30
+ macOS: brew install tesseract tesseract-lang poppler
31
+ pip3 install pytesseract pdf2image
32
+ Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
33
+ pip3 install pytesseract pdf2image
34
+ """
35
+
36
+ # VERSION: 1
37
+
38
+ import os
39
+ import sys
40
+
41
+ # --- Config ---
42
+ # Simplified Chinese + English covers most mixed-language documents.
43
+ # For pure English scans, "eng" alone is faster and lighter.
44
+ OCR_LANG = "chi_sim+eng"
45
+
46
+ # 200 DPI is a good balance: tesseract's accuracy plateau starts around
47
+ # 300 DPI, but memory + time cost scales quadratically. Raise to 300 for
48
+ # small fonts or when accuracy matters more than speed.
49
+ DPI = 200
50
+
51
+ # Hard cap on pages to OCR. OCR is slow (~1-3s/page); for huge scans the
52
+ # LLM should be told to OCR in chunks instead.
53
+ MAX_PAGES = 50
54
+
55
+
56
def _env_int(name, default):
    """Return the positive integer stored in environment variable *name*.

    Falls back to *default* (with a note on stderr) when the variable is
    unset, blank, not an integer, or not positive, so a bad override can
    never crash the script with a traceback.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError:
        value = 0
    if value <= 0:
        sys.stderr.write(f"Ignoring invalid {name}={raw!r}; using {default}\n")
        return default
    return value


def _ocr_image(pytesseract, image, lang, page_number):
    """OCR one page image and return its raw text ("" when OCR fails).

    On a tesseract error (most commonly a missing language pack) the page
    is retried once in English. If that also fails, or *lang* already was
    "eng", the page is skipped instead of aborting the whole document.
    """
    try:
        return pytesseract.image_to_string(image, lang=lang)
    except pytesseract.TesseractError as e:
        sys.stderr.write(f"tesseract error on page {page_number}: {e}\n")

    if lang == "eng":
        return ""  # retrying with the same language cannot succeed

    try:
        return pytesseract.image_to_string(image, lang="eng")
    except pytesseract.TesseractError as e:
        sys.stderr.write(
            f"tesseract English fallback failed on page {page_number}: {e}\n"
        )
        return ""


def main():
    """Run OCR on the PDF named by ``sys.argv[1]`` and print the text.

    Exit codes:
        0: text was extracted and printed to stdout
        1: usage error, input file missing, or OCR produced no text
        2: a dependency is missing (pytesseract, pdf2image, or the
           tesseract binary itself)
        3: pdf2image could not rasterise the PDF (usually missing poppler)
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: pdf_parser_ocr.py <file_path>\n")
        sys.exit(1)

    path = sys.argv[1]

    # Check up front: otherwise a missing file surfaces below as a
    # misleading "Is poppler installed?" failure with exit code 3.
    if not os.path.isfile(path):
        sys.stderr.write(f"File not found: {path}\n")
        sys.exit(1)

    # Imported lazily so a missing package maps to exit code 2 instead of
    # an ImportError traceback at module load.
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except ImportError as e:
        sys.stderr.write(f"OCR dependencies missing: {e}\n")
        sys.stderr.write("Install with: pip3 install pytesseract pdf2image\n")
        sys.exit(2)

    # An empty override is treated like an unset one.
    lang = os.environ.get("OCTO_OCR_LANG") or OCR_LANG
    max_pages = _env_int("OCTO_OCR_MAX_PAGES", MAX_PAGES)
    dpi = _env_int("OCTO_OCR_DPI", DPI)

    try:
        images = convert_from_path(path, dpi=dpi, last_page=max_pages)
    except Exception as e:
        sys.stderr.write(f"pdf2image failed: {e}\n")
        sys.stderr.write("Is poppler installed? (brew install poppler / apt install poppler-utils)\n")
        sys.exit(3)

    pages = []
    try:
        for i, image in enumerate(images, 1):
            text = _ocr_image(pytesseract, image, lang, i).strip()
            if text:
                pages.append(f"--- Page {i} (OCR) ---\n{text}")
    except pytesseract.TesseractNotFoundError as e:
        # The tesseract binary is absent: no page can succeed, so stop.
        sys.stderr.write(f"tesseract not found: {e}\n")
        sys.exit(2)

    if not pages:
        sys.stderr.write("OCR produced no text — PDF may be blank or unreadable.\n")
        sys.exit(1)

    print("\n\n".join(pages))


if __name__ == "__main__":
    main()