octo-agent 0.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.clacky/skills/commit/SKILL.md +423 -0
- data/.clacky/skills/gem-release/SKILL.md +199 -0
- data/.clacky/skills/gem-release/scripts/release.sh +304 -0
- data/.clacky/skills/oss-upload/SKILL.md +47 -0
- data/.octorules +106 -0
- data/.rspec +3 -0
- data/.rubocop.yml +8 -0
- data/CHANGELOG.md +76 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/CONTRIBUTING.md +92 -0
- data/Dockerfile +28 -0
- data/LICENSE.txt +22 -0
- data/POSITIONING.md +46 -0
- data/README.md +134 -0
- data/README_CN.md +134 -0
- data/Rakefile +34 -0
- data/benchmark/fixtures/sample_project/Gemfile +3 -0
- data/benchmark/fixtures/sample_project/lib/api_handler.rb +32 -0
- data/benchmark/fixtures/sample_project/lib/order_calculator.rb +23 -0
- data/benchmark/fixtures/sample_project/lib/user_renderer.rb +20 -0
- data/benchmark/fixtures/sample_project/spec/order_calculator_spec.rb +20 -0
- data/benchmark/results/EVALUATION_REPORT.md +165 -0
- data/benchmark/results/baseline_20260511_174424.json +128 -0
- data/benchmark/results/report_20260511_175256.json +271 -0
- data/benchmark/results/report_20260511_175444.json +271 -0
- data/benchmark/results/treatment_20260511_175103.json +130 -0
- data/benchmark/runner.rb +441 -0
- data/bin/octo +7 -0
- data/docs/agent-first-ui-design.md +77 -0
- data/docs/billing-system.md +318 -0
- data/docs/channel-architecture.md +235 -0
- data/docs/engineering-article.md +343 -0
- data/docs/session-skill-invocation.md +69 -0
- data/docs/time_machine_design.md +247 -0
- data/docs/ui2-architecture.md +124 -0
- data/homebrew/README.md +96 -0
- data/homebrew/openocto.rb +24 -0
- data/lib/octo/agent/hook_manager.rb +61 -0
- data/lib/octo/agent/llm_caller.rb +800 -0
- data/lib/octo/agent/memory_updater.rb +246 -0
- data/lib/octo/agent/message_compressor.rb +225 -0
- data/lib/octo/agent/message_compressor_helper.rb +869 -0
- data/lib/octo/agent/next_message_suggester.rb +215 -0
- data/lib/octo/agent/session_serializer.rb +685 -0
- data/lib/octo/agent/skill_auto_creator.rb +114 -0
- data/lib/octo/agent/skill_evolution.rb +61 -0
- data/lib/octo/agent/skill_manager.rb +466 -0
- data/lib/octo/agent/skill_reflector.rb +89 -0
- data/lib/octo/agent/system_prompt_builder.rb +101 -0
- data/lib/octo/agent/time_machine.rb +214 -0
- data/lib/octo/agent/tool_executor.rb +454 -0
- data/lib/octo/agent/tool_registry.rb +150 -0
- data/lib/octo/agent.rb +2180 -0
- data/lib/octo/agent_config.rb +989 -0
- data/lib/octo/agent_profile.rb +112 -0
- data/lib/octo/anthropic_stream_aggregator.rb +137 -0
- data/lib/octo/background_task_registry.rb +324 -0
- data/lib/octo/banner.rb +34 -0
- data/lib/octo/bedrock_stream_aggregator.rb +137 -0
- data/lib/octo/block_font.rb +331 -0
- data/lib/octo/cli.rb +968 -0
- data/lib/octo/client.rb +623 -0
- data/lib/octo/default_agents/SOUL.md +3 -0
- data/lib/octo/default_agents/USER.md +1 -0
- data/lib/octo/default_agents/base_prompt.md +66 -0
- data/lib/octo/default_agents/coding/profile.yml +2 -0
- data/lib/octo/default_agents/coding/system_prompt.md +67 -0
- data/lib/octo/default_agents/general/profile.yml +2 -0
- data/lib/octo/default_agents/general/system_prompt.md +16 -0
- data/lib/octo/default_parsers/doc_parser.rb +69 -0
- data/lib/octo/default_parsers/docx_parser.rb +188 -0
- data/lib/octo/default_parsers/pdf_parser.rb +120 -0
- data/lib/octo/default_parsers/pdf_parser_ocr.py +103 -0
- data/lib/octo/default_parsers/pdf_parser_plumber.py +62 -0
- data/lib/octo/default_parsers/pptx_parser.rb +140 -0
- data/lib/octo/default_parsers/xlsx_parser.rb +121 -0
- data/lib/octo/default_skills/browser-setup/SKILL.md +426 -0
- data/lib/octo/default_skills/channel-manager/SKILL.md +623 -0
- data/lib/octo/default_skills/channel-manager/dingtalk_setup.rb +191 -0
- data/lib/octo/default_skills/channel-manager/discord_setup.rb +199 -0
- data/lib/octo/default_skills/channel-manager/feishu_setup.rb +574 -0
- data/lib/octo/default_skills/channel-manager/import_lark_skills.rb +97 -0
- data/lib/octo/default_skills/channel-manager/install_feishu_skills.rb +105 -0
- data/lib/octo/default_skills/channel-manager/weixin_setup.rb +274 -0
- data/lib/octo/default_skills/code-explorer/SKILL.md +36 -0
- data/lib/octo/default_skills/cron-task-creator/SKILL.md +257 -0
- data/lib/octo/default_skills/cron-task-creator/evals/evals.json +38 -0
- data/lib/octo/default_skills/onboard/SKILL.md +578 -0
- data/lib/octo/default_skills/onboard/scripts/import_external_skills.rb +413 -0
- data/lib/octo/default_skills/onboard/scripts/install_builtin_skills.rb +97 -0
- data/lib/octo/default_skills/persist-memory/SKILL.md +59 -0
- data/lib/octo/default_skills/personal-website/SKILL.md +113 -0
- data/lib/octo/default_skills/personal-website/publish.rb +235 -0
- data/lib/octo/default_skills/product-help/SKILL.md +123 -0
- data/lib/octo/default_skills/product-help/docs/agent-config.md +74 -0
- data/lib/octo/default_skills/product-help/docs/best-practices.md +49 -0
- data/lib/octo/default_skills/product-help/docs/browser-tool.md +53 -0
- data/lib/octo/default_skills/product-help/docs/built-in-skills.md +43 -0
- data/lib/octo/default_skills/product-help/docs/cli-reference.md +82 -0
- data/lib/octo/default_skills/product-help/docs/create-your-first-skill.md +47 -0
- data/lib/octo/default_skills/product-help/docs/faq.md +98 -0
- data/lib/octo/default_skills/product-help/docs/how-to-use-a-skill.md +58 -0
- data/lib/octo/default_skills/product-help/docs/installation.md +59 -0
- data/lib/octo/default_skills/product-help/docs/memory-system.md +61 -0
- data/lib/octo/default_skills/product-help/docs/octorules.md +62 -0
- data/lib/octo/default_skills/product-help/docs/session-management.md +63 -0
- data/lib/octo/default_skills/product-help/docs/skill-basics.md +55 -0
- data/lib/octo/default_skills/product-help/docs/skill-frontmatter.md +61 -0
- data/lib/octo/default_skills/product-help/docs/web-server.md +49 -0
- data/lib/octo/default_skills/product-help/docs/what-is-octo.md +37 -0
- data/lib/octo/default_skills/product-help/docs/windows-installation.md +36 -0
- data/lib/octo/default_skills/product-help/docs/writing-tips.md +53 -0
- data/lib/octo/default_skills/recall-memory/SKILL.md +65 -0
- data/lib/octo/default_skills/skill-add/SKILL.md +59 -0
- data/lib/octo/default_skills/skill-add/scripts/install_from_zip.rb +295 -0
- data/lib/octo/default_skills/skill-creator/SKILL.md +602 -0
- data/lib/octo/default_skills/skill-creator/agents/analyzer.md +274 -0
- data/lib/octo/default_skills/skill-creator/agents/comparator.md +202 -0
- data/lib/octo/default_skills/skill-creator/agents/grader.md +223 -0
- data/lib/octo/default_skills/skill-creator/eval-viewer/generate_review.py +471 -0
- data/lib/octo/default_skills/skill-creator/eval-viewer/viewer.html +1325 -0
- data/lib/octo/default_skills/skill-creator/references/schemas.md +430 -0
- data/lib/octo/default_skills/skill-creator/scripts/__init__.py +0 -0
- data/lib/octo/default_skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- data/lib/octo/default_skills/skill-creator/scripts/generate_report.py +326 -0
- data/lib/octo/default_skills/skill-creator/scripts/improve_description.py +310 -0
- data/lib/octo/default_skills/skill-creator/scripts/quick_validate.py +103 -0
- data/lib/octo/default_skills/skill-creator/scripts/run_eval.py +317 -0
- data/lib/octo/default_skills/skill-creator/scripts/run_loop.py +331 -0
- data/lib/octo/default_skills/skill-creator/scripts/utils.py +47 -0
- data/lib/octo/default_skills/skill-creator/scripts/validate_skill_frontmatter.rb +143 -0
- data/lib/octo/idle_compression_timer.rb +115 -0
- data/lib/octo/json_ui_controller.rb +204 -0
- data/lib/octo/message_format/anthropic.rb +409 -0
- data/lib/octo/message_format/bedrock.rb +361 -0
- data/lib/octo/message_format/open_ai.rb +222 -0
- data/lib/octo/message_history.rb +373 -0
- data/lib/octo/openai_stream_aggregator.rb +130 -0
- data/lib/octo/plain_ui_controller.rb +166 -0
- data/lib/octo/providers.rb +534 -0
- data/lib/octo/server/browser_manager.rb +397 -0
- data/lib/octo/server/channel/adapters/base.rb +82 -0
- data/lib/octo/server/channel/adapters/dingtalk/adapter.rb +314 -0
- data/lib/octo/server/channel/adapters/dingtalk/api_client.rb +391 -0
- data/lib/octo/server/channel/adapters/dingtalk/stream_client.rb +203 -0
- data/lib/octo/server/channel/adapters/discord/adapter.rb +229 -0
- data/lib/octo/server/channel/adapters/discord/api_client.rb +107 -0
- data/lib/octo/server/channel/adapters/discord/gateway_client.rb +270 -0
- data/lib/octo/server/channel/adapters/feishu/adapter.rb +320 -0
- data/lib/octo/server/channel/adapters/feishu/bot.rb +478 -0
- data/lib/octo/server/channel/adapters/feishu/file_processor.rb +36 -0
- data/lib/octo/server/channel/adapters/feishu/message_parser.rb +129 -0
- data/lib/octo/server/channel/adapters/feishu/ws_client.rb +423 -0
- data/lib/octo/server/channel/adapters/telegram/adapter.rb +375 -0
- data/lib/octo/server/channel/adapters/telegram/api_client.rb +205 -0
- data/lib/octo/server/channel/adapters/wecom/adapter.rb +148 -0
- data/lib/octo/server/channel/adapters/wecom/media_downloader.rb +115 -0
- data/lib/octo/server/channel/adapters/wecom/ws_client.rb +395 -0
- data/lib/octo/server/channel/adapters/weixin/adapter.rb +692 -0
- data/lib/octo/server/channel/adapters/weixin/api_client.rb +402 -0
- data/lib/octo/server/channel/channel_config.rb +178 -0
- data/lib/octo/server/channel/channel_manager.rb +468 -0
- data/lib/octo/server/channel/channel_ui_controller.rb +224 -0
- data/lib/octo/server/channel.rb +33 -0
- data/lib/octo/server/discover.rb +77 -0
- data/lib/octo/server/epipe_safe_io.rb +105 -0
- data/lib/octo/server/http_server.rb +3554 -0
- data/lib/octo/server/scheduler.rb +317 -0
- data/lib/octo/server/server_master.rb +325 -0
- data/lib/octo/server/session_registry.rb +431 -0
- data/lib/octo/server/web_ui_controller.rb +487 -0
- data/lib/octo/session_manager.rb +385 -0
- data/lib/octo/skill.rb +466 -0
- data/lib/octo/skill_loader.rb +328 -0
- data/lib/octo/tools/base.rb +118 -0
- data/lib/octo/tools/browser.rb +625 -0
- data/lib/octo/tools/edit.rb +165 -0
- data/lib/octo/tools/file_reader.rb +549 -0
- data/lib/octo/tools/glob.rb +162 -0
- data/lib/octo/tools/grep.rb +356 -0
- data/lib/octo/tools/invoke_skill.rb +96 -0
- data/lib/octo/tools/list_tasks.rb +54 -0
- data/lib/octo/tools/redo_task.rb +41 -0
- data/lib/octo/tools/request_user_feedback.rb +84 -0
- data/lib/octo/tools/security.rb +333 -0
- data/lib/octo/tools/terminal/output_cleaner.rb +63 -0
- data/lib/octo/tools/terminal/persistent_session.rb +268 -0
- data/lib/octo/tools/terminal/safe_rm.sh +106 -0
- data/lib/octo/tools/terminal/session_manager.rb +213 -0
- data/lib/octo/tools/terminal.rb +1828 -0
- data/lib/octo/tools/todo_manager.rb +374 -0
- data/lib/octo/tools/trash_manager.rb +388 -0
- data/lib/octo/tools/undo_task.rb +35 -0
- data/lib/octo/tools/web_fetch.rb +242 -0
- data/lib/octo/tools/web_search.rb +260 -0
- data/lib/octo/tools/write.rb +77 -0
- data/lib/octo/ui2/block_font.rb +10 -0
- data/lib/octo/ui2/components/base_component.rb +163 -0
- data/lib/octo/ui2/components/command_suggestions.rb +290 -0
- data/lib/octo/ui2/components/common_component.rb +96 -0
- data/lib/octo/ui2/components/inline_input.rb +226 -0
- data/lib/octo/ui2/components/input_area.rb +1338 -0
- data/lib/octo/ui2/components/message_component.rb +99 -0
- data/lib/octo/ui2/components/modal_component.rb +419 -0
- data/lib/octo/ui2/components/todo_area.rb +149 -0
- data/lib/octo/ui2/components/tool_component.rb +107 -0
- data/lib/octo/ui2/components/welcome_banner.rb +139 -0
- data/lib/octo/ui2/layout_manager.rb +807 -0
- data/lib/octo/ui2/line_editor.rb +363 -0
- data/lib/octo/ui2/markdown_renderer.rb +100 -0
- data/lib/octo/ui2/output_buffer.rb +370 -0
- data/lib/octo/ui2/progress_handle.rb +362 -0
- data/lib/octo/ui2/progress_indicator.rb +55 -0
- data/lib/octo/ui2/screen_buffer.rb +273 -0
- data/lib/octo/ui2/terminal_detector.rb +119 -0
- data/lib/octo/ui2/theme_manager.rb +85 -0
- data/lib/octo/ui2/themes/base_theme.rb +105 -0
- data/lib/octo/ui2/themes/hacker_theme.rb +62 -0
- data/lib/octo/ui2/themes/minimal_theme.rb +56 -0
- data/lib/octo/ui2/thinking_verbs.rb +26 -0
- data/lib/octo/ui2/ui_controller.rb +1625 -0
- data/lib/octo/ui2/view_renderer.rb +177 -0
- data/lib/octo/ui2.rb +40 -0
- data/lib/octo/ui_interface.rb +154 -0
- data/lib/octo/utils/arguments_parser.rb +191 -0
- data/lib/octo/utils/browser_detector.rb +195 -0
- data/lib/octo/utils/encoding.rb +92 -0
- data/lib/octo/utils/environment_detector.rb +140 -0
- data/lib/octo/utils/file_ignore_helper.rb +170 -0
- data/lib/octo/utils/file_processor.rb +601 -0
- data/lib/octo/utils/gitignore_parser.rb +154 -0
- data/lib/octo/utils/limit_stack.rb +152 -0
- data/lib/octo/utils/logger.rb +124 -0
- data/lib/octo/utils/login_shell.rb +72 -0
- data/lib/octo/utils/model_pricing.rb +646 -0
- data/lib/octo/utils/parser_manager.rb +165 -0
- data/lib/octo/utils/path_helper.rb +15 -0
- data/lib/octo/utils/scripts_manager.rb +59 -0
- data/lib/octo/utils/string_matcher.rb +158 -0
- data/lib/octo/utils/trash_directory.rb +112 -0
- data/lib/octo/utils/workspace_rules.rb +46 -0
- data/lib/octo/version.rb +5 -0
- data/lib/octo/web/app.css +7141 -0
- data/lib/octo/web/app.js +543 -0
- data/lib/octo/web/apple-touch-icon.png +0 -0
- data/lib/octo/web/auth.js +150 -0
- data/lib/octo/web/channels.js +276 -0
- data/lib/octo/web/datepicker.js +205 -0
- data/lib/octo/web/favicon.png +0 -0
- data/lib/octo/web/i18n.js +1073 -0
- data/lib/octo/web/icon-512.png +0 -0
- data/lib/octo/web/icon-dark.svg +25 -0
- data/lib/octo/web/icon.svg +29 -0
- data/lib/octo/web/index.html +871 -0
- data/lib/octo/web/marked.min.js +69 -0
- data/lib/octo/web/onboard.js +491 -0
- data/lib/octo/web/profile.js +442 -0
- data/lib/octo/web/sessions.js +4421 -0
- data/lib/octo/web/settings.js +913 -0
- data/lib/octo/web/sidebar.js +32 -0
- data/lib/octo/web/skills.js +885 -0
- data/lib/octo/web/tasks.js +297 -0
- data/lib/octo/web/theme.js +105 -0
- data/lib/octo/web/trash.js +343 -0
- data/lib/octo/web/vendor/hljs/highlight.min.js +1244 -0
- data/lib/octo/web/vendor/hljs/hljs-theme.css +95 -0
- data/lib/octo/web/vendor/katex/auto-render.min.js +1 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_AMS-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Bold.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Bold.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Bold.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-BoldItalic.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Italic.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-BoldItalic.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-Italic.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Bold.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Italic.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Script-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Size1-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Size2-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Size3-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Size4-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/fonts/KaTeX_Typewriter-Regular.woff2 +0 -0
- data/lib/octo/web/vendor/katex/katex.min.css +1 -0
- data/lib/octo/web/vendor/katex/katex.min.js +1 -0
- data/lib/octo/web/version.js +449 -0
- data/lib/octo/web/weixin-qr.html +209 -0
- data/lib/octo/web/ws-dispatcher.js +357 -0
- data/lib/octo/web/ws.js +128 -0
- data/lib/octo.rb +145 -0
- data/scripts/build/build.sh +329 -0
- data/scripts/build/lib/apt.sh +56 -0
- data/scripts/build/lib/brew.sh +89 -0
- data/scripts/build/lib/colors.sh +17 -0
- data/scripts/build/lib/gem.sh +95 -0
- data/scripts/build/lib/mise.sh +125 -0
- data/scripts/build/lib/network.sh +157 -0
- data/scripts/build/lib/os.sh +57 -0
- data/scripts/build/lib/shell.sh +37 -0
- data/scripts/build/src/install.sh.cc +174 -0
- data/scripts/build/src/install_browser.sh.cc +101 -0
- data/scripts/build/src/install_full.sh.cc +290 -0
- data/scripts/build/src/install_rails_deps.sh.cc +145 -0
- data/scripts/build/src/install_system_deps.sh.cc +123 -0
- data/scripts/build/src/uninstall.sh.cc +101 -0
- data/scripts/install.ps1 +532 -0
- data/scripts/install.sh +567 -0
- data/scripts/install_browser.sh +479 -0
- data/scripts/install_full.sh +838 -0
- data/scripts/install_rails_deps.sh +746 -0
- data/scripts/install_system_deps.sh +518 -0
- data/scripts/uninstall.sh +287 -0
- data/sig/octo.rbs +4 -0
- metadata +614 -0
|
@@ -0,0 +1,800 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Octo
|
|
4
|
+
class Agent
|
|
5
|
+
# LLM API call management
|
|
6
|
+
# Handles API calls with retry logic, fallback model support, and progress indication
|
|
7
|
+
module LlmCaller
|
|
8
|
+
# Number of consecutive RetryableError failures (503/429/5xx) before switching to fallback.
|
|
9
|
+
# Network-level errors (connection failures, timeouts) do NOT trigger fallback — they are
|
|
10
|
+
# retried on the primary model for the full max_retries budget, since they are likely
|
|
11
|
+
# transient infrastructure blips rather than a model-level outage.
|
|
12
|
+
RETRIES_BEFORE_FALLBACK = 3
|
|
13
|
+
|
|
14
|
+
# After switching to the fallback model, allow this many retries before giving up.
|
|
15
|
+
# Kept lower than max_retries (10) because the primary model has already failed repeatedly by the time the fallback is active.
|
|
16
|
+
MAX_RETRIES_ON_FALLBACK = 5
|
|
17
|
+
|
|
18
|
+
# Execute LLM API call with progress indicator, retry logic, and cost tracking.
|
|
19
|
+
#
|
|
20
|
+
# Fallback / probing state machine (driven by AgentConfig):
|
|
21
|
+
#
|
|
22
|
+
# :primary_ok (nil)
|
|
23
|
+
# Normal operation — use the configured model.
|
|
24
|
+
# After RETRIES_BEFORE_FALLBACK consecutive failures → :fallback_active
|
|
25
|
+
#
|
|
26
|
+
# :fallback_active
|
|
27
|
+
# Use fallback model. After FALLBACK_COOLING_OFF_SECONDS (30 min) the
|
|
28
|
+
# config transitions to :probing on the next call_llm entry.
|
|
29
|
+
#
|
|
30
|
+
# :probing
|
|
31
|
+
# Silently attempt the primary model once.
|
|
32
|
+
# Success → config transitions back to :primary_ok, user notified.
|
|
33
|
+
# Failure → renew cooling-off clock, back to :fallback_active, then
|
|
34
|
+
# retry the *same* request with the fallback model so the
|
|
35
|
+
# user experiences no extra delay.
|
|
36
|
+
#
|
|
37
|
+
# @return [Hash] API response with :content, :tool_calls, :usage, etc.
|
|
38
|
+
# NOTE on progress lifecycle:
|
|
39
|
+
# call_llm intentionally does NOT start or stop the progress indicator.
|
|
40
|
+
# Ownership lives with the caller (Agent#think for normal/compression
|
|
41
|
+
# paths, Agent#trigger_idle_compression for idle compression). This
|
|
42
|
+
# avoids nested active/done pairs clobbering each other — a bug that
|
|
43
|
+
# silently dropped the idle-compression summary line.
|
|
44
|
+
#
|
|
45
|
+
# Inside call_llm we only *update in place* during retries, so the
|
|
46
|
+
# already-live progress slot shows meaningful transient status
|
|
47
|
+
# ("Network failed… attempt 2/10", etc.).
|
|
48
|
+
private def call_llm
|
|
49
|
+
# Transition :fallback_active → :probing if cooling-off has expired.
|
|
50
|
+
@config.maybe_start_probing
|
|
51
|
+
|
|
52
|
+
tools_to_send = @tool_registry.all_definitions
|
|
53
|
+
|
|
54
|
+
max_retries = 10
|
|
55
|
+
retry_delay = 5
|
|
56
|
+
retries = 0
|
|
57
|
+
|
|
58
|
+
# Track whether any of the retry/fallback branches below opened a
|
|
59
|
+
# "retrying" progress slot via show_progress(progress_type:
|
|
60
|
+
# "retrying", phase: "active"). If so, we MUST close it before
|
|
61
|
+
# leaving call_llm — otherwise the UI's legacy shim in
|
|
62
|
+
# UI2::UIController keeps the :quiet ProgressHandle alive, its
|
|
63
|
+
# ticker thread keeps running, and the user sees a frozen
|
|
64
|
+
# "Network failed: ... (681s)" line long after the task finished.
|
|
65
|
+
#
|
|
66
|
+
# The close is done in the outer ensure below so it runs on:
|
|
67
|
+
# - normal success (response returned)
|
|
68
|
+
# - unrecoverable failure (raise propagates out)
|
|
69
|
+
# - BadRequestError reasoning-content retry success
|
|
70
|
+
retrying_progress_opened = false
|
|
71
|
+
# One-shot flag set by the BadRequestError rescue below when the server
|
|
72
|
+
# complained about missing reasoning_content. The subsequent retry will
|
|
73
|
+
# pad every assistant message's reasoning_content, which satisfies
|
|
74
|
+
# DeepSeek / Kimi thinking-mode providers even when the earlier turns
|
|
75
|
+
# were produced by a different provider (e.g. MiniMax keeps thinking
|
|
76
|
+
# inline in content and never emits a reasoning_content field, so the
|
|
77
|
+
# history-evidence heuristic in MessageHistory can't infer thinking
|
|
78
|
+
# mode on its own). We retry at most once — if padding doesn't fix it,
|
|
79
|
+
# the error is something else and we let it propagate.
|
|
80
|
+
force_reasoning_content_pad = false
|
|
81
|
+
thinking_retry_attempted = false
|
|
82
|
+
# One-shot flag for context-overflow recovery. When the server complains
|
|
83
|
+
# the input exceeds the model's context window, we run a forced
|
|
84
|
+
# compression with pull_back_from_tail: 1 (preserves the model's
|
|
85
|
+
# two-checkpoint prompt cache) and retry the original request once.
|
|
86
|
+
# We retry at most once — if still overflowing afterward, the issue is
|
|
87
|
+
# something else (e.g. tool schemas alone exceed the window) and we let
|
|
88
|
+
# the error propagate.
|
|
89
|
+
context_overflow_retry_attempted = false
|
|
90
|
+
|
|
91
|
+
begin
|
|
92
|
+
begin
|
|
93
|
+
# Use active_messages (Time Machine) when undone, otherwise send full history.
|
|
94
|
+
# to_api strips internal fields and handles orphaned tool_calls.
|
|
95
|
+
messages_to_send = if respond_to?(:active_messages)
|
|
96
|
+
active_messages(force_reasoning_content_pad: force_reasoning_content_pad)
|
|
97
|
+
else
|
|
98
|
+
@history.to_api(force_reasoning_content_pad: force_reasoning_content_pad)
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
response = @client.send_messages_with_tools(
|
|
102
|
+
messages_to_send,
|
|
103
|
+
model: current_model,
|
|
104
|
+
tools: tools_to_send,
|
|
105
|
+
max_tokens: @config.max_tokens,
|
|
106
|
+
enable_caching: @config.enable_prompt_caching,
|
|
107
|
+
reasoning_effort: @reasoning_effort,
|
|
108
|
+
on_chunk: build_progress_on_chunk
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
# Successful response — if we were probing, confirm primary is healthy.
|
|
112
|
+
handle_probe_success if @config.probing?
|
|
113
|
+
|
|
114
|
+
# ── Upstream truncation detector ──────────────────────────────────
|
|
115
|
+
# OpenRouter / Bedrock and other routers sometimes close the SSE
|
|
116
|
+
# stream mid-tool_use: we receive finish_reason="stop" together with
|
|
117
|
+
# a syntactically valid tool_call whose `arguments` JSON is empty,
|
|
118
|
+
# "{}" (placeholder before any key was streamed), or otherwise
|
|
119
|
+
# unparseable. Treat this as retryable — otherwise the agent would
|
|
120
|
+
# execute a tool with empty args (often failing cryptically) or
|
|
121
|
+
# silently exit thinking the task is done.
|
|
122
|
+
#
|
|
123
|
+
# Raises UpstreamTruncatedError (a RetryableError) so the rescue
|
|
124
|
+
# block below handles retry + fallback identically to 5xx/429.
|
|
125
|
+
detect_upstream_truncation!(response)
|
|
126
|
+
|
|
127
|
+
rescue Faraday::TimeoutError => e
|
|
128
|
+
# ── Read-timeout path (distinct from connection-level failures) ──
|
|
129
|
+
# Faraday::TimeoutError on our non-streaming POST almost always means
|
|
130
|
+
# the *response* took longer than the 300s read-timeout to come back —
|
|
131
|
+
# i.e. the model is trying to produce a huge output in one shot
|
|
132
|
+
# (e.g. "write me a 2000-line snake game"). Blindly retrying the same
|
|
133
|
+
# request with the same prompt reproduces the same timeout.
|
|
134
|
+
#
|
|
135
|
+
# Strategy:
|
|
136
|
+
# 1. On the FIRST timeout in a task, inject a `[SYSTEM]` user message
|
|
137
|
+
# telling the model to break the work into smaller steps, then
|
|
138
|
+
# retry. The history edit changes the prompt, so the retry is
|
|
139
|
+
# materially different from the failed attempt.
|
|
140
|
+
# 2. On subsequent timeouts in the same task, fall back to the
|
|
141
|
+
# generic "just retry" behaviour (the model may have ignored
|
|
142
|
+
# the hint; don't pile on duplicate hints).
|
|
143
|
+
# 3. Probing-mode timeouts still go through handle_probe_failure.
|
|
144
|
+
retries += 1
|
|
145
|
+
|
|
146
|
+
if @config.probing?
|
|
147
|
+
handle_probe_failure
|
|
148
|
+
retry
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
if retries <= max_retries
|
|
152
|
+
inject_large_output_hint_if_first_timeout(e)
|
|
153
|
+
@ui&.show_progress(
|
|
154
|
+
"Response too slow (likely generating too much at once): #{e.message}",
|
|
155
|
+
progress_type: "retrying",
|
|
156
|
+
phase: "active",
|
|
157
|
+
metadata: { attempt: retries, total: max_retries }
|
|
158
|
+
)
|
|
159
|
+
retrying_progress_opened = true
|
|
160
|
+
sleep retry_delay
|
|
161
|
+
retry
|
|
162
|
+
else
|
|
163
|
+
raise AgentError, "[LLM] Request timed out after #{max_retries} retries: #{e.message}"
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
rescue Faraday::ConnectionFailed, Faraday::SSLError, Errno::ECONNREFUSED, Errno::ETIMEDOUT => e
|
|
167
|
+
retries += 1
|
|
168
|
+
|
|
169
|
+
# Probing failure: primary still down — renew cooling-off and retry with fallback.
|
|
170
|
+
if @config.probing?
|
|
171
|
+
handle_probe_failure
|
|
172
|
+
retry
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# Connection-level errors (DNS, TCP refused, open-timeout, TLS) are
|
|
176
|
+
# transient infrastructure blips — do NOT trigger fallback, and do
|
|
177
|
+
# NOT inject the "break into steps" hint (the model did nothing wrong).
|
|
178
|
+
# Just retry on the current model up to max_retries.
|
|
179
|
+
if retries <= max_retries
|
|
180
|
+
@ui&.show_progress(
|
|
181
|
+
"Network failed: #{e.message}",
|
|
182
|
+
progress_type: "retrying",
|
|
183
|
+
phase: "active",
|
|
184
|
+
metadata: { attempt: retries, total: max_retries }
|
|
185
|
+
)
|
|
186
|
+
retrying_progress_opened = true
|
|
187
|
+
sleep retry_delay
|
|
188
|
+
retry
|
|
189
|
+
else
|
|
190
|
+
# Don't show_error here — let the outer rescue block handle it to avoid duplicates.
|
|
191
|
+
# Cleanup of the caller-owned progress indicator is the caller's responsibility (via its own ensure block); the "retrying" slot is closed by call_llm's outer ensure.
|
|
192
|
+
raise AgentError, "[LLM] Network connection failed after #{max_retries} retries: #{e.message}"
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
rescue RetryableError => e
|
|
196
|
+
retries += 1
|
|
197
|
+
|
|
198
|
+
# Probing failure: primary still down — renew cooling-off and retry with fallback.
|
|
199
|
+
if @config.probing?
|
|
200
|
+
handle_probe_failure
|
|
201
|
+
retry
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# RetryableError (503/429/5xx/ThrottlingException) signals a service-level outage.
|
|
205
|
+
# After RETRIES_BEFORE_FALLBACK attempts, switch to the fallback model and reset the
|
|
206
|
+
# retry counter — but cap fallback retries at MAX_RETRIES_ON_FALLBACK (< max_retries)
|
|
207
|
+
# since we have already confirmed the primary is struggling.
|
|
208
|
+
current_max = @config.fallback_active? ? MAX_RETRIES_ON_FALLBACK : max_retries
|
|
209
|
+
|
|
210
|
+
if retries <= current_max
|
|
211
|
+
if retries == RETRIES_BEFORE_FALLBACK && !@config.fallback_active?
|
|
212
|
+
if try_activate_fallback(current_model)
|
|
213
|
+
retries = 0
|
|
214
|
+
retry
|
|
215
|
+
end
|
|
216
|
+
end
|
|
217
|
+
@ui&.show_progress(
|
|
218
|
+
e.message,
|
|
219
|
+
progress_type: "retrying",
|
|
220
|
+
phase: "active",
|
|
221
|
+
metadata: { attempt: retries, total: current_max }
|
|
222
|
+
)
|
|
223
|
+
retrying_progress_opened = true
|
|
224
|
+
sleep retry_delay
|
|
225
|
+
retry
|
|
226
|
+
else
|
|
227
|
+
# Don't show_error here — let the outer rescue block handle it to avoid duplicates.
|
|
228
|
+
# Progress cleanup is the caller's responsibility (via its own ensure block).
|
|
229
|
+
raise AgentError, "[LLM] Service unavailable after #{current_max} retries"
|
|
230
|
+
end
|
|
231
|
+
|
|
232
|
+
rescue Octo::BadRequestError => e
|
|
233
|
+
# One-shot recovery for "context too long" errors. The model's
|
|
234
|
+
# context window is exceeded by the current history+tools+system
|
|
235
|
+
# prompt. We run a forced compression with pull_back_from_tail: 1
|
|
236
|
+
# (preserves the two-checkpoint prompt cache so the compression
|
|
237
|
+
# call itself still hits cache#A on the second-to-last position),
|
|
238
|
+
# then retry the original request once.
|
|
239
|
+
if !context_overflow_retry_attempted &&
|
|
240
|
+
!@compressing_for_overflow &&
|
|
241
|
+
context_too_long_error?(e) &&
|
|
242
|
+
respond_to?(:compress_messages_if_needed, true)
|
|
243
|
+
context_overflow_retry_attempted = true
|
|
244
|
+
Octo::Logger.info(
|
|
245
|
+
"[context-overflow] caught BadRequestError, attempting forced compression with pull-back",
|
|
246
|
+
error_message: e.message[0, 200],
|
|
247
|
+
history_size: @history.size,
|
|
248
|
+
previous_total_tokens: @previous_total_tokens
|
|
249
|
+
)
|
|
250
|
+
# Layer 1: standard cache-preserving compression (pull_back: 1).
|
|
251
|
+
# Handles 99% of real overflow cases (newest message tipped the
|
|
252
|
+
# request just past the window).
|
|
253
|
+
if perform_context_overflow_compression(mode: :standard)
|
|
254
|
+
retry
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
# Layer 2: aggressive fallback. The Layer 1 compression call
|
|
258
|
+
# itself overflowed — happens when a single newly-appended
|
|
259
|
+
# message is enormous (huge tool_result, pasted file, etc.) so
|
|
260
|
+
# popping just K=1 didn't bring the request below the window.
|
|
261
|
+
# Pop ~half the history this time; sacrifices prompt cache to
|
|
262
|
+
# guarantee the compression call fits.
|
|
263
|
+
Octo::Logger.warn(
|
|
264
|
+
"[context-overflow] standard compression failed, escalating to aggressive mode"
|
|
265
|
+
)
|
|
266
|
+
if perform_context_overflow_compression(mode: :aggressive)
|
|
267
|
+
retry
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
# Both layers exhausted. Let the original error propagate so the
|
|
271
|
+
# user sees the underlying provider message. This should be
|
|
272
|
+
# extremely rare — would require both halves of the history to
|
|
273
|
+
# individually exceed the window, which is essentially impossible
|
|
274
|
+
# under the "previous turn succeeded" invariant.
|
|
275
|
+
Octo::Logger.error(
|
|
276
|
+
"[context-overflow] both standard and aggressive compression failed; " \
|
|
277
|
+
"propagating original error"
|
|
278
|
+
)
|
|
279
|
+
raise
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
# One-shot recovery for thinking-mode providers (DeepSeek V4, Kimi K2)
|
|
283
|
+
# that require every assistant message in the history to carry a
|
|
284
|
+
# reasoning_content field. The history-evidence heuristic in
|
|
285
|
+
# MessageHistory#to_api can miss this when the preceding turns came
|
|
286
|
+
# from a different thinking style (e.g. MiniMax keeps <think>...</think>
|
|
287
|
+
# inline in content and never emits reasoning_content) — so we detect
|
|
288
|
+
# the error here and retry once with forced padding.
|
|
289
|
+
if !thinking_retry_attempted && reasoning_content_missing_error?(e)
|
|
290
|
+
thinking_retry_attempted = true
|
|
291
|
+
force_reasoning_content_pad = true
|
|
292
|
+
Octo::Logger.info(
|
|
293
|
+
"[thinking-mode] retrying with forced reasoning_content padding " \
|
|
294
|
+
"(model=#{@config.model_name.inspect} base_url=#{@config.base_url.inspect})"
|
|
295
|
+
)
|
|
296
|
+
retry
|
|
297
|
+
end
|
|
298
|
+
raise
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
# Collect token usage data from API response (no cost tracking)
|
|
302
|
+
token_data = collect_iteration_tokens(response[:usage])
|
|
303
|
+
response[:token_usage] = token_data
|
|
304
|
+
|
|
305
|
+
# [DIAG] Log raw client response shape. Only emit when we see the
|
|
306
|
+
# "finish_reason=stop + non-empty tool_calls" combo, or when any
|
|
307
|
+
# tool_call's arguments look empty/unparseable — both indicate the
|
|
308
|
+
# upstream (Bedrock/relay/model) cut the tool_use stream short.
|
|
309
|
+
# Normal responses produce no log line (too noisy).
|
|
310
|
+
begin
|
|
311
|
+
tool_calls = response[:tool_calls] || []
|
|
312
|
+
if !tool_calls.empty?
|
|
313
|
+
raw_tcs = tool_calls.map do |c|
|
|
314
|
+
args_str = c[:arguments].is_a?(String) ? c[:arguments] : c[:arguments].to_s
|
|
315
|
+
parseable = begin
|
|
316
|
+
JSON.parse(args_str)
|
|
317
|
+
true
|
|
318
|
+
rescue StandardError
|
|
319
|
+
false
|
|
320
|
+
end
|
|
321
|
+
{
|
|
322
|
+
name: c[:name].to_s,
|
|
323
|
+
args_len: args_str.length,
|
|
324
|
+
args_parseable: parseable,
|
|
325
|
+
args_head: args_str[0, 120]
|
|
326
|
+
}
|
|
327
|
+
end
|
|
328
|
+
truncated_call = raw_tcs.any? { |t| t[:args_len] == 0 || t[:args_len] == 2 || !t[:args_parseable] }
|
|
329
|
+
suspicious = response[:finish_reason] == "stop"
|
|
330
|
+
|
|
331
|
+
if suspicious || truncated_call
|
|
332
|
+
Octo::Logger.warn("llm.response_suspicious",
|
|
333
|
+
model: current_model,
|
|
334
|
+
finish_reason: response[:finish_reason].to_s,
|
|
335
|
+
tool_calls_count: raw_tcs.size,
|
|
336
|
+
tool_calls: raw_tcs,
|
|
337
|
+
completion_tokens: token_data[:completion_tokens],
|
|
338
|
+
ttft_ms: response.dig(:latency, :ttft_ms),
|
|
339
|
+
combo_stop_with_toolcalls: suspicious,
|
|
340
|
+
has_truncated_args: truncated_call
|
|
341
|
+
)
|
|
342
|
+
end
|
|
343
|
+
end
|
|
344
|
+
rescue StandardError => e
|
|
345
|
+
Octo::Logger.warn("llm.response_log_failed", error: e.message)
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
response
|
|
349
|
+
ensure
|
|
350
|
+
# Close any "retrying" progress slot that was opened during the
|
|
351
|
+
# retry/fallback loop above. The legacy UI shim allocates a
|
|
352
|
+
# separate :quiet ProgressHandle under the "retrying" key; if it
|
|
353
|
+
# is never finished its ticker thread keeps running and the user
|
|
354
|
+
# sees a stale "Network failed: ... (NNN s)" line long after the
|
|
355
|
+
# task has completed. This ensure runs on:
|
|
356
|
+
# - successful retry → close the slot, message is "Recovered"
|
|
357
|
+
# so the final frame is informative rather than blank
|
|
358
|
+
# - unrecoverable failure that raises out → close the slot so
|
|
359
|
+
# the spinner doesn't linger while the error bubbles up
|
|
360
|
+
if retrying_progress_opened
|
|
361
|
+
@ui&.show_progress(progress_type: "retrying", phase: "done")
|
|
362
|
+
end
|
|
363
|
+
end
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
# Try to move the agent onto the fallback model configured for +failed_model+.
#
# Asks @config for a fallback; when one exists it is activated on @config and,
# if a UI is attached, a warning naming both models is shown.
#
# @param failed_model [String] name of the model that keeps failing
# @return [Boolean] true when a fallback was found and activated,
#   false when @config has no fallback for +failed_model+
private def try_activate_fallback(failed_model)
  replacement = @config.fallback_model_for(failed_model)

  if replacement
    @config.activate_fallback!(replacement)
    # Safe navigation: headless runs (no UI) simply skip the notice.
    @ui&.show_warning(
      "Model #{failed_model} appears unavailable. " \
      "Automatically switching to fallback model: #{replacement}"
    )
    true
  else
    false
  end
end
|
|
382
|
+
|
|
383
|
+
# Bookkeeping for a successful probe of the primary model (a probe is the
# first request sent back to the primary after a fallback period).
#
# Captures the model name *before* calling @config.confirm_fallback_ok!, then
# tells the user (when a UI is attached) that the primary is back in use.
# NOTE(review): confirm_fallback_ok! presumably returns the config to its
# "primary healthy" state — confirm against the config class.
#
# @return [Object, nil] whatever the UI's show_warning returns; nil without a UI
private def handle_probe_success
  recovered_model = @config.model_name
  @config.confirm_fallback_ok!
  return if @ui.nil?

  @ui.show_warning("Primary model #{recovered_model} is healthy again. Switched back automatically.")
end
|
|
390
|
+
|
|
391
|
+
# Bookkeeping for a failed probe of the primary model.
#
# Re-activates the fallback model that @config already holds, so the caller
# can immediately retry the same request on the fallback, and warns the user
# when a UI is attached.
# NOTE(review): re-activating is expected to restart the cooling-off clock
# (@fallback_since) — confirm in the config class.
#
# @return [Object, nil] whatever the UI's show_warning returns; nil without a UI
private def handle_probe_failure
  # NOTE(review): reads the config's private ivar directly; a public reader,
  # if one exists on the config class, would be preferable.
  active_fallback = @config.instance_variable_get(:@fallback_model)
  primary_model = @config.model_name

  @config.activate_fallback!(active_fallback)

  @ui&.show_warning(
    "Primary model #{primary_model} still unavailable. " \
    "Continuing with fallback model: #{active_fallback}"
  )
end
|
|
403
|
+
|
|
404
|
+
# Forced history compression used to recover from a context-overflow error.
# Invoked from the BadRequestError rescue in call_llm once
# context_too_long_error? has matched.
#
# Two escalation levels, selected by +mode+:
#
#   :standard (default)
#     Pulls exactly 1 message back from the tail of @history before
#     compressing. Intent: free just enough budget for the compression request
#     itself while keeping the provider's prompt cache warm. This is expected
#     to cover the common case where the newest message tipped the request
#     over the window.
#
#   :aggressive
#     Pulls back roughly half of the history: max(size / 2, 4), capped at
#     min(size - 2, 64). Intent: make the compression request fit even when a
#     single recent message is enormous, at the cost of the prompt cache.
#
# In both modes the pulled-back messages are meant to be re-attached after
# compression (by handle_compression_response on success, by the rescue branch
# below on failure), so no user content is silently dropped.
#
# While this method runs, @compressing_for_overflow is true; the flag is
# always cleared on exit (ensure). call_llm checks it to avoid re-entering
# overflow recovery from the recursive compression call.
#
# @param mode [Symbol] :standard or :aggressive
# @return [Boolean] true when compression completed (the caller should retry
#   the original request); false when compression is unavailable, declined to
#   run (compress_messages_if_needed returned nil) or raised
private def perform_context_overflow_compression(mode: :standard)
  return false unless respond_to?(:compress_messages_if_needed, true)

  # How many tail messages to set aside before compressing.
  # NOTE(review): for histories of 2 or fewer entries the aggressive value is
  # 0 or negative; behaviour then depends on compress_messages_if_needed.
  tail_count = 1
  if mode == :aggressive
    preferred = [@history.size / 2, 4].max
    ceiling = [@history.size - 2, 64].min
    tail_count = [preferred, ceiling].min
  end

  @compressing_for_overflow = true
  context = nil

  begin
    context = compress_messages_if_needed(force: true, pull_back_from_tail: tail_count)
    return false if context.nil?

    @history.append(context[:compression_message])

    # Recursive call — overflow recovery inside it is suppressed by
    # @compressing_for_overflow.
    llm_reply = call_llm
    handle_compression_response(llm_reply, context)

    Octo::Logger.info(
      "[context-overflow] compression succeeded",
      mode: mode,
      pull_back: tail_count
    )
    true
  rescue => failure
    # Compression died part-way. Undo the compression instruction we appended
    # and put the pulled-back messages back on the tail of the history.
    if context
      instruction = context[:compression_message]
      @history.rollback_before(instruction) if instruction
      (context[:pulled_back_messages] || []).each { |message| @history.append(message) }
    end

    Octo::Logger.warn(
      "[context-overflow] compression failed during overflow recovery",
      mode: mode,
      pull_back: tail_count,
      error_class: failure.class.name,
      error_message: failure.message[0, 200]
    )
    false
  ensure
    @compressing_for_overflow = false
  end
end
|
|
498
|
+
|
|
499
|
+
# Does this 400 complain that assistant messages lack a reasoning_content
# field (the thinking-mode requirement of providers such as DeepSeek V4 or
# Kimi K2 thinking)?
#
# Two independent markers are required — the literal "reasoning_content" plus
# one of a few companion phrases — so that an unrelated 400 that merely
# mentions the field does not trigger the padding retry, which would add an
# empty reasoning_content to every assistant message in the history.
#
# @param err [Exception] error raised by the LLM client
# @return [Boolean]
private def reasoning_content_missing_error?(err)
  return false unless err.is_a?(Octo::BadRequestError)

  text = err.message.to_s.downcase
  return false unless text.include?("reasoning_content")

  ["thinking", "must be passed back", "must be provided"].any? { |marker| text.include?(marker) }
end
|
|
514
|
+
|
|
515
|
+
# Does this 400 mean the request no longer fits the model's context window?
#
# Detection is intentionally broad:
#   - a false positive costs one extra (no-op) compression cycle;
#   - a false negative leaves the user stuck, because every retry hits the
#     same limit.
#
# Sample provider messages the matcher was written against:
#
#   OpenAI
#     "This model's maximum context length is 128000 tokens. However you
#      requested ... Please reduce the length of the messages."
#     (error.code "context_length_exceeded")
#   Anthropic
#     "prompt is too long: 218849 tokens > 200000 maximum"
#   Qwen / Alibaba DashScope
#     "You passed 117345 input tokens and requested 8192 output tokens.
#      However the model's context length is only 125536 tokens, resulting in
#      a maximum input length of 117344 tokens. Please reduce the length of
#      the input prompt. (parameter=input_tokens, value=117345)"
#   Qwen / Alibaba DashScope, terse format (qwen3.6 series)
#     "InternalError.Algo.InvalidParameter: Range of input length should be [1, 229376]"
#   DeepSeek / Kimi / MiniMax / OpenAI-compatible relays
#     variants of the OpenAI "context length" / "tokens exceeds" wording
#   Generic gateways (Portkey, OpenRouter)
#     "The total number of tokens exceeds the model's maximum context length"
#
# @param err [Exception] error raised by the LLM client
# @return [Boolean]
private def context_too_long_error?(err)
  return false unless err.is_a?(Octo::BadRequestError)

  text = err.message.to_s.downcase

  # Phrases that are conclusive on their own. Each is at least two meaningful
  # words so a single common word cannot trigger a match.
  conclusive = [
    "context length",                    # OpenAI / Qwen / many compat APIs
    "context_length_exceeded",           # OpenAI error.code
    "maximum context",                   # OpenAI variant
    "maximum input length",              # Qwen
    "prompt is too long",                # Anthropic
    "input is too long",                 # Anthropic-compat relays
    "exceeds the maximum context",       # Portkey & generic gateways
    "exceeds the model's context",       # Generic
    "exceeds the model's maximum",       # Generic
    "reduce the length of the input",    # Qwen action hint
    "reduce the length of the messages", # OpenAI action hint
    "reduce the length of your",         # Generic action hint
    "reduce the length of the prompt",   # Generic action hint
    "range of input length"              # Qwen DashScope qwen3.6+ terse format
  ]

  conclusive.any? { |phrase| text.include?(phrase) } ||
    # Anthropic-style "<N> tokens > <N> maximum"
    text.match?(/\d+\s*tokens?\s*>\s*\d+/) ||
    # Qwen-style structured field
    text.include?("parameter=input_tokens")
end
|
|
580
|
+
|
|
581
|
+
# Guard against tool calls whose arguments were cut off upstream.
#
# When any tool_call in +response+ has arguments that
# tool_call_args_truncated? considers empty, this logs the incident, injects
# a one-shot "use smaller tool_call args" hint into the history, and raises
# Octo::UpstreamTruncatedError. The intent is for that error to go through
# the same RetryableError rescue (including model fallback) as 5xx/429.
#
# Background: a router such as OpenRouter relays whatever the upstream
# provider streams. If the upstream closes the SSE stream in the middle of a
# tool_use, no error is surfaced — the client just sees a tool call with
# missing or placeholder arguments. Left alone, the agent would run the tool
# with empty args or stop as if the task were finished.
#
# Only the "nothing was streamed" case is intercepted (nil, empty string, or
# "{}"). Partially written / invalid JSON is left to the regular
# ArgumentsParser → BadArgumentsError path, where the parse error is fed back
# to the model so it can correct itself in one round-trip.
#
# @param response [Hash] client response; reads :tool_calls, :finish_reason,
#   :token_usage and :latency
# @return [nil] when no tool call looks truncated
# @raise [Octo::UpstreamTruncatedError] when a truncated tool call is found
private def detect_upstream_truncation!(response)
  calls = response[:tool_calls]
  return if calls.nil? || calls.empty?

  culprit = calls.find { |call| tool_call_args_truncated?(call[:arguments]) }
  return unless culprit

  raw_args = culprit[:arguments]
  printable_args = raw_args.is_a?(String) ? raw_args : raw_args.to_s

  Octo::Logger.warn(
    "llm.upstream_truncation_detected",
    model: current_model,
    tool_name: culprit[:name].to_s,
    args_len: printable_args.length,
    args_head: printable_args[0, 80],
    finish_reason: response[:finish_reason].to_s,
    completion_tokens: response.dig(:token_usage, :completion_tokens),
    ttft_ms: response.dig(:latency, :ttft_ms)
  )

  # A plain retry may hit the same wall when the cut correlates with a very
  # large tool_call payload, so nudge the model once per task. For pure
  # infrastructure blips the hint is harmless.
  inject_upstream_truncation_hint_if_first(culprit)

  raise Octo::UpstreamTruncatedError,
        "[LLM] Upstream truncated tool_call `#{culprit[:name]}` " \
        "(args=#{printable_args[0, 40].inspect}). Retrying..."
end
|
|
630
|
+
|
|
631
|
+
# Does a tool_call's +arguments+ value look completely empty, i.e. the stream
# was cut before the model wrote anything real into the arguments JSON?
#
#   nil, any non-String, ""          → true  (nothing at all)
#   valid JSON that parses to {}     → true  (placeholder only)
#   everything else                  → false
#
# "Everything else" deliberately includes partial or invalid JSON such as
# `{"path": "/tmp/x"`: those cases go to the ArgumentsParser →
# BadArgumentsError path, which reports the parse error back to the model as
# a tool_result so it can fix its own output rather than restart blind.
#
# @param args [Object] the raw arguments value of a tool call
# @return [Boolean]
private def tool_call_args_truncated?(args)
  return true unless args.is_a?(String) && !args.empty?

  begin
    decoded = JSON.parse(args)
  rescue JSON::ParserError
    # Not parseable: the model had started writing; not our case.
    return false
  end

  decoded.is_a?(Hash) && decoded.empty?
end
|
|
661
|
+
|
|
662
|
+
# One-shot "break the work into smaller steps" nudge after a read timeout.
#
# The first time this is called within a task it appends a [SYSTEM] user
# message to @history, logs the event and (if a UI is attached) shows a
# warning. Later calls in the same task return immediately, so the history is
# not filled with duplicate hints; the caller just retries.
#
# The appended message is tagged `system_injected: true`. Per the project's
# conventions that tag is meant to hide it from UI replay and to exclude it
# from prompt-cache marker placement and from the compressor's "recent user
# turn" protection — NOTE(review): those consumers live in other files.
#
# The guard flag @task_timeout_hint_injected is expected to be reset per task
# by Agent#run.
#
# @param err [Exception] the timeout error; only used for the log line
# @return [Object, nil] result of the UI warning call, or nil
private def inject_large_output_hint_if_first_timeout(err)
  return if @task_timeout_hint_injected

  @task_timeout_hint_injected = true

  guidance = "[SYSTEM] The previous LLM response timed out (read timeout after ~300s). " \
             "This usually means the model was trying to produce too much output in a single response. " \
             "Please change your approach:\n" \
             "- Break the task into multiple smaller steps, each producing a short response.\n" \
             "- For long files: first create a skeleton with `write` (structure + placeholder comments only), " \
             "then fill in each section with separate `edit` calls.\n" \
             "- Keep each single tool-call argument (especially file content) well under ~500 lines.\n" \
             "- Do NOT attempt to output the entire deliverable in one response."

  injected_message = {
    role: "user",
    content: guidance,
    system_injected: true,
    task_id: @current_task_id
  }
  @history.append(injected_message)

  log_line = "[llm_caller] Read-timeout detected — injected 'break into smaller steps' hint " \
             "(error=#{err.class}: #{err.message})"
  Octo::Logger.info(log_line)

  @ui&.show_warning(
    "LLM response timed out — asking model to break the task into smaller steps and retrying..."
  )
end
|
|
705
|
+
|
|
706
|
+
# One-shot "use smaller tool_call arguments" nudge after an upstream
# truncation.
#
# The first time this is called within a task it appends a [SYSTEM] user
# message (naming the affected tool) to @history, logs the event and, if a UI
# is attached, shows a warning. Later calls in the same task do nothing.
#
# Rationale: when the upstream cut correlates with a very large tool_call
# payload, retrying the identical payload would keep failing and burn the
# retry budget. When the cut was a pure infrastructure blip, the extra
# message is harmless.
#
# Works like inject_large_output_hint_if_first_timeout: the message carries
# `system_injected: true`, and the guard flag
# @task_upstream_truncation_hint_injected is expected to be reset per task by
# Agent#run.
#
# @param truncated_call [Hash] the offending tool call; only :name is read
# @return [Object, nil] result of the UI warning call, or nil
private def inject_upstream_truncation_hint_if_first(truncated_call)
  return if @task_upstream_truncation_hint_injected

  @task_upstream_truncation_hint_injected = true

  tool_name = truncated_call[:name].to_s
  guidance = "[SYSTEM] The previous response was cut short by the upstream provider " \
             "before the `#{tool_name}` tool_call finished streaming. " \
             "The partial tool_call has been discarded. To avoid the same problem on retry, " \
             "please adapt your approach:\n" \
             "- Prefer smaller tool_call arguments — large single-shot payloads are more likely to be truncated.\n" \
             "- For long file content: create the file first with a minimal skeleton via `write`, " \
             "then append sections one at a time with `edit`.\n" \
             "- Break large tasks into multiple smaller tool calls instead of one big one.\n" \
             "- Keep each tool-call argument comfortably under ~2000 characters when possible."

  injected_message = {
    role: "user",
    content: guidance,
    system_injected: true,
    task_id: @current_task_id
  }
  @history.append(injected_message)

  log_line = "[llm_caller] Upstream truncation — injected 'smaller tool_call args' hint " \
             "(tool=#{tool_name.inspect})"
  Octo::Logger.info(log_line)

  @ui&.show_warning(
    "Upstream response was truncated mid tool-call — asking model to use smaller steps and retrying..."
  )
end
|
|
753
|
+
|
|
754
|
+
# Build the streaming-progress callback handed to
# Client#send_messages_with_tools.
#
# Without a UI there is nothing to update, so nil is returned and the client
# can skip its streaming plumbing. Otherwise the result is a lambda taking
# `input_tokens:` and `output_tokens:` that forwards them to
# @ui.stream_thinking_progress, at most once per 0.25 s (monotonic clock) so
# fast streams do not flood the progress handle. Calls with
# `output_tokens` of 0 are never throttled.
#
# @return [Proc, nil] throttled lambda, or nil when no UI is attached
private def build_progress_on_chunk
  return nil unless @ui

  throttle_seconds = 0.25
  previous_emit = 0.0

  lambda do |input_tokens:, output_tokens:|
    tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    # `next` leaves only the lambda (yielding nil), never the builder method.
    next if tick - previous_emit < throttle_seconds && output_tokens > 0

    previous_emit = tick
    @ui.stream_thinking_progress(input_tokens: input_tokens, output_tokens: output_tokens)
  end
end
|
|
769
|
+
|
|
770
|
+
# Normalise the provider's usage figures for the current iteration.
# No cost is calculated here — only token counts are reported.
#
# Missing counters default to 0, and a missing :total_tokens is derived from
# prompt + completion. The per-iteration delta is either the total itself
# (when the provider flags `total_is_per_turn`) or the difference from the
# total recorded on the previous call. @previous_total_tokens is updated to
# the new total on every call.
#
# A nil +usage+ (provider or relay returned no usage block) is treated like
# an empty hash, and an uninitialised @previous_total_tokens like 0, so a
# missing usage block cannot abort an otherwise successful LLM call.
#
# @param usage [Hash, nil] usage data from the API response
# @return [Hash] token data with keys :delta_tokens, :prompt_tokens,
#   :completion_tokens, :total_tokens, :cache_write, :cache_read
def collect_iteration_tokens(usage)
  usage ||= {}

  prompt_tokens = usage[:prompt_tokens] || 0
  completion_tokens = usage[:completion_tokens] || 0
  total_tokens = usage[:total_tokens] || (prompt_tokens + completion_tokens)
  cache_write = usage[:cache_creation_input_tokens] || 0
  cache_read = usage[:cache_read_input_tokens] || 0

  delta_tokens =
    if usage[:total_is_per_turn]
      total_tokens
    else
      total_tokens - (@previous_total_tokens || 0)
    end
  @previous_total_tokens = total_tokens

  {
    delta_tokens: delta_tokens,
    prompt_tokens: prompt_tokens,
    completion_tokens: completion_tokens,
    total_tokens: total_tokens,
    cache_write: cache_write,
    cache_read: cache_read
  }
end
|
|
798
|
+
end
|
|
799
|
+
end
|
|
800
|
+
end
|