RubyGems - octo-agent - Versions diffs - 0.11.2 - Mend

octo-agent 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (319) hide show

checksums.yaml +7 -0
data/.clacky/skills/commit/SKILL.md +423 -0
data/.clacky/skills/gem-release/SKILL.md +199 -0
data/.clacky/skills/gem-release/scripts/release.sh +304 -0
data/.clacky/skills/oss-upload/SKILL.md +47 -0
data/.octorules +106 -0
data/.rspec +3 -0
data/.rubocop.yml +8 -0
data/CHANGELOG.md +76 -0
data/CODE_OF_CONDUCT.md +132 -0
data/CONTRIBUTING.md +92 -0
data/Dockerfile +28 -0
data/LICENSE.txt +22 -0
data/POSITIONING.md +46 -0
data/README.md +134 -0
data/README_CN.md +134 -0
data/Rakefile +34 -0
data/benchmark/fixtures/sample_project/Gemfile +3 -0
data/benchmark/fixtures/sample_project/lib/api_handler.rb +32 -0
data/benchmark/fixtures/sample_project/lib/order_calculator.rb +23 -0
data/benchmark/fixtures/sample_project/lib/user_renderer.rb +20 -0
data/benchmark/fixtures/sample_project/spec/order_calculator_spec.rb +20 -0
data/benchmark/results/EVALUATION_REPORT.md +165 -0
data/benchmark/results/baseline_20260511_174424.json +128 -0
data/benchmark/results/report_20260511_175256.json +271 -0
data/benchmark/results/report_20260511_175444.json +271 -0
data/benchmark/results/treatment_20260511_175103.json +130 -0
data/benchmark/runner.rb +441 -0
data/bin/octo +7 -0
data/docs/agent-first-ui-design.md +77 -0
data/docs/billing-system.md +318 -0
data/docs/channel-architecture.md +235 -0
data/docs/engineering-article.md +343 -0
data/docs/session-skill-invocation.md +69 -0
data/docs/time_machine_design.md +247 -0
data/docs/ui2-architecture.md +124 -0
data/homebrew/README.md +96 -0
data/homebrew/openocto.rb +24 -0
data/lib/octo/agent/hook_manager.rb +61 -0
data/lib/octo/agent/llm_caller.rb +800 -0
data/lib/octo/agent/memory_updater.rb +246 -0
data/lib/octo/agent/message_compressor.rb +225 -0
data/lib/octo/agent/message_compressor_helper.rb +869 -0
data/lib/octo/agent/next_message_suggester.rb +215 -0
data/lib/octo/agent/session_serializer.rb +685 -0
data/lib/octo/agent/skill_auto_creator.rb +114 -0
data/lib/octo/agent/skill_evolution.rb +61 -0
data/lib/octo/agent/skill_manager.rb +466 -0
data/lib/octo/agent/skill_reflector.rb +89 -0
data/lib/octo/agent/system_prompt_builder.rb +101 -0
data/lib/octo/agent/time_machine.rb +214 -0
data/lib/octo/agent/tool_executor.rb +454 -0
data/lib/octo/agent/tool_registry.rb +150 -0
data/lib/octo/agent.rb +2180 -0
data/lib/octo/agent_config.rb +989 -0
data/lib/octo/agent_profile.rb +112 -0
data/lib/octo/anthropic_stream_aggregator.rb +137 -0
data/lib/octo/background_task_registry.rb +324 -0
data/lib/octo/banner.rb +34 -0
data/lib/octo/bedrock_stream_aggregator.rb +137 -0
data/lib/octo/block_font.rb +331 -0
data/lib/octo/cli.rb +968 -0
data/lib/octo/client.rb +623 -0
data/lib/octo/default_agents/SOUL.md +3 -0
data/lib/octo/default_agents/USER.md +1 -0
data/lib/octo/default_agents/base_prompt.md +66 -0
data/lib/octo/default_agents/coding/profile.yml +2 -0
data/lib/octo/default_agents/coding/system_prompt.md +67 -0
data/lib/octo/default_agents/general/profile.yml +2 -0
data/lib/octo/default_agents/general/system_prompt.md +16 -0
data/lib/octo/default_parsers/doc_parser.rb +69 -0
data/lib/octo/default_parsers/docx_parser.rb +188 -0
data/lib/octo/default_parsers/pdf_parser.rb +120 -0
data/lib/octo/default_parsers/pdf_parser_ocr.py +103 -0
data/lib/octo/default_parsers/pdf_parser_plumber.py +62 -0
data/lib/octo/default_parsers/pptx_parser.rb +140 -0
data/lib/octo/default_parsers/xlsx_parser.rb +121 -0
data/lib/octo/default_skills/browser-setup/SKILL.md +426 -0
data/lib/octo/default_skills/channel-manager/SKILL.md +623 -0
data/lib/octo/default_skills/channel-manager/dingtalk_setup.rb +191 -0
data/lib/octo/default_skills/channel-manager/discord_setup.rb +199 -0
data/lib/octo/default_skills/channel-manager/feishu_setup.rb +574 -0
data/lib/octo/default_skills/channel-manager/import_lark_skills.rb +97 -0
data/lib/octo/default_skills/channel-manager/install_feishu_skills.rb +105 -0
data/lib/octo/default_skills/channel-manager/weixin_setup.rb +274 -0
data/lib/octo/default_skills/code-explorer/SKILL.md +36 -0
data/lib/octo/default_skills/cron-task-creator/SKILL.md +257 -0
data/lib/octo/default_skills/cron-task-creator/evals/evals.json +38 -0
data/lib/octo/default_skills/onboard/SKILL.md +578 -0
data/lib/octo/default_skills/onboard/scripts/import_external_skills.rb +413 -0
data/lib/octo/default_skills/onboard/scripts/install_builtin_skills.rb +97 -0
data/lib/octo/default_skills/persist-memory/SKILL.md +59 -0
data/lib/octo/default_skills/personal-website/SKILL.md +113 -0
data/lib/octo/default_skills/personal-website/publish.rb +235 -0
data/lib/octo/default_skills/product-help/SKILL.md +123 -0
data/lib/octo/default_skills/product-help/docs/agent-config.md +74 -0
data/lib/octo/default_skills/product-help/docs/best-practices.md +49 -0
data/lib/octo/default_skills/product-help/docs/browser-tool.md +53 -0
data/lib/octo/default_skills/product-help/docs/built-in-skills.md +43 -0
data/lib/octo/default_skills/product-help/docs/cli-reference.md +82 -0
data/lib/octo/default_skills/product-help/docs/create-your-first-skill.md +47 -0
data/lib/octo/default_skills/product-help/docs/faq.md +98 -0
data/lib/octo/default_skills/product-help/docs/how-to-use-a-skill.md +58 -0
data/lib/octo/default_skills/product-help/docs/installation.md +59 -0
data/lib/octo/default_skills/product-help/docs/memory-system.md +61 -0
data/lib/octo/default_skills/product-help/docs/octorules.md +62 -0
data/lib/octo/default_skills/product-help/docs/session-management.md +63 -0
data/lib/octo/default_skills/product-help/docs/skill-basics.md +55 -0
data/lib/octo/default_skills/product-help/docs/skill-frontmatter.md +61 -0
data/lib/octo/default_skills/product-help/docs/web-server.md +49 -0
data/lib/octo/default_skills/product-help/docs/what-is-octo.md +37 -0
data/lib/octo/default_skills/product-help/docs/windows-installation.md +36 -0
data/lib/octo/default_skills/product-help/docs/writing-tips.md +53 -0
data/lib/octo/default_skills/recall-memory/SKILL.md +65 -0
data/lib/octo/default_skills/skill-add/SKILL.md +59 -0
data/lib/octo/default_skills/skill-add/scripts/install_from_zip.rb +295 -0
data/lib/octo/default_skills/skill-creator/SKILL.md +602 -0
data/lib/octo/default_skills/skill-creator/agents/analyzer.md +274 -0
data/lib/octo/default_skills/skill-creator/agents/comparator.md +202 -0
data/lib/octo/default_skills/skill-creator/agents/grader.md +223 -0
data/lib/octo/default_skills/skill-creator/eval-viewer/generate_review.py +471 -0
data/lib/octo/default_skills/skill-creator/eval-viewer/viewer.html +1325 -0
data/lib/octo/default_skills/skill-creator/references/schemas.md +430 -0
data/lib/octo/default_skills/skill-creator/scripts/__init__.py +0 -0
data/lib/octo/default_skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
data/lib/octo/default_skills/skill-creator/scripts/generate_report.py +326 -0
data/lib/octo/default_skills/skill-creator/scripts/improve_description.py +310 -0
data/lib/octo/default_skills/skill-creator/scripts/quick_validate.py +103 -0
data/lib/octo/default_skills/skill-creator/scripts/run_eval.py +317 -0
data/lib/octo/default_skills/skill-creator/scripts/run_loop.py +331 -0
data/lib/octo/default_skills/skill-creator/scripts/utils.py +47 -0
data/lib/octo/default_skills/skill-creator/scripts/validate_skill_frontmatter.rb +143 -0
data/lib/octo/idle_compression_timer.rb +115 -0
data/lib/octo/json_ui_controller.rb +204 -0
data/lib/octo/message_format/anthropic.rb +409 -0
data/lib/octo/message_format/bedrock.rb +361 -0
data/lib/octo/message_format/open_ai.rb +222 -0
data/lib/octo/message_history.rb +373 -0
data/lib/octo/openai_stream_aggregator.rb +130 -0
data/lib/octo/plain_ui_controller.rb +166 -0
data/lib/octo/providers.rb +534 -0
data/lib/octo/server/browser_manager.rb +397 -0
data/lib/octo/server/channel/adapters/base.rb +82 -0
data/lib/octo/server/channel/adapters/dingtalk/adapter.rb +314 -0
data/lib/octo/server/channel/adapters/dingtalk/api_client.rb +391 -0
data/lib/octo/server/channel/adapters/dingtalk/stream_client.rb +203 -0
data/lib/octo/server/channel/adapters/discord/adapter.rb +229 -0
data/lib/octo/server/channel/adapters/discord/api_client.rb +107 -0
data/lib/octo/server/channel/adapters/discord/gateway_client.rb +270 -0
data/lib/octo/server/channel/adapters/feishu/adapter.rb +320 -0
data/lib/octo/server/channel/adapters/feishu/bot.rb +478 -0
data/lib/octo/server/channel/adapters/feishu/file_processor.rb +36 -0
data/lib/octo/server/channel/adapters/feishu/message_parser.rb +129 -0
data/lib/octo/server/channel/adapters/feishu/ws_client.rb +423 -0
data/lib/octo/server/channel/adapters/telegram/adapter.rb +375 -0
data/lib/octo/server/channel/adapters/telegram/api_client.rb +205 -0
data/lib/octo/server/channel/adapters/wecom/adapter.rb +148 -0
data/lib/octo/server/channel/adapters/wecom/media_downloader.rb +115 -0
data/lib/octo/server/channel/adapters/wecom/ws_client.rb +395 -0
data/lib/octo/server/channel/adapters/weixin/adapter.rb +692 -0
data/lib/octo/server/channel/adapters/weixin/api_client.rb +402 -0
data/lib/octo/server/channel/channel_config.rb +178 -0
data/lib/octo/server/channel/channel_manager.rb +468 -0
data/lib/octo/server/channel/channel_ui_controller.rb +224 -0
data/lib/octo/server/channel.rb +33 -0
data/lib/octo/server/discover.rb +77 -0
data/lib/octo/server/epipe_safe_io.rb +105 -0
data/lib/octo/server/http_server.rb +3554 -0
data/lib/octo/server/scheduler.rb +317 -0
data/lib/octo/server/server_master.rb +325 -0
data/lib/octo/server/session_registry.rb +431 -0
data/lib/octo/server/web_ui_controller.rb +487 -0
data/lib/octo/session_manager.rb +385 -0
data/lib/octo/skill.rb +466 -0
data/lib/octo/skill_loader.rb +328 -0
data/lib/octo/tools/base.rb +118 -0
data/lib/octo/tools/browser.rb +625 -0
data/lib/octo/tools/edit.rb +165 -0
data/lib/octo/tools/file_reader.rb +549 -0
data/lib/octo/tools/glob.rb +162 -0
data/lib/octo/tools/grep.rb +356 -0
data/lib/octo/tools/invoke_skill.rb +96 -0
data/lib/octo/tools/list_tasks.rb +54 -0
data/lib/octo/tools/redo_task.rb +41 -0
data/lib/octo/tools/request_user_feedback.rb +84 -0
data/lib/octo/tools/security.rb +333 -0
data/lib/octo/tools/terminal/output_cleaner.rb +63 -0
data/lib/octo/tools/terminal/persistent_session.rb +268 -0
data/lib/octo/tools/terminal/safe_rm.sh +106 -0
data/lib/octo/tools/terminal/session_manager.rb +213 -0
data/lib/octo/tools/terminal.rb +1828 -0
data/lib/octo/tools/todo_manager.rb +374 -0
data/lib/octo/tools/trash_manager.rb +388 -0
data/lib/octo/tools/undo_task.rb +35 -0
data/lib/octo/tools/web_fetch.rb +242 -0
data/lib/octo/tools/web_search.rb +260 -0
data/lib/octo/tools/write.rb +77 -0
data/lib/octo/ui2/block_font.rb +10 -0
data/lib/octo/ui2/components/base_component.rb +163 -0
data/lib/octo/ui2/components/command_suggestions.rb +290 -0
data/lib/octo/ui2/components/common_component.rb +96 -0
data/lib/octo/ui2/components/inline_input.rb +226 -0
data/lib/octo/ui2/components/input_area.rb +1338 -0
data/lib/octo/ui2/components/message_component.rb +99 -0
data/lib/octo/ui2/components/modal_component.rb +419 -0
data/lib/octo/ui2/components/todo_area.rb +149 -0
data/lib/octo/ui2/components/tool_component.rb +107 -0
data/lib/octo/ui2/components/welcome_banner.rb +139 -0
data/lib/octo/ui2/layout_manager.rb +807 -0
data/lib/octo/ui2/line_editor.rb +363 -0
data/lib/octo/ui2/markdown_renderer.rb +100 -0
data/lib/octo/ui2/output_buffer.rb +370 -0
data/lib/octo/ui2/progress_handle.rb +362 -0
data/lib/octo/ui2/progress_indicator.rb +55 -0
data/lib/octo/ui2/screen_buffer.rb +273 -0
data/lib/octo/ui2/terminal_detector.rb +119 -0
data/lib/octo/ui2/theme_manager.rb +85 -0
data/lib/octo/ui2/themes/base_theme.rb +105 -0
data/lib/octo/ui2/themes/hacker_theme.rb +62 -0
data/lib/octo/ui2/themes/minimal_theme.rb +56 -0
data/lib/octo/ui2/thinking_verbs.rb +26 -0
data/lib/octo/ui2/ui_controller.rb +1625 -0
data/lib/octo/ui2/view_renderer.rb +177 -0
data/lib/octo/ui2.rb +40 -0
data/lib/octo/ui_interface.rb +154 -0
data/lib/octo/utils/arguments_parser.rb +191 -0
data/lib/octo/utils/browser_detector.rb +195 -0
data/lib/octo/utils/encoding.rb +92 -0
data/lib/octo/utils/environment_detector.rb +140 -0
data/lib/octo/utils/file_ignore_helper.rb +170 -0
data/lib/octo/utils/file_processor.rb +601 -0
data/lib/octo/utils/gitignore_parser.rb +154 -0
data/lib/octo/utils/limit_stack.rb +152 -0
data/lib/octo/utils/logger.rb +124 -0
data/lib/octo/utils/login_shell.rb +72 -0
data/lib/octo/utils/model_pricing.rb +646 -0
data/lib/octo/utils/parser_manager.rb +165 -0
data/lib/octo/utils/path_helper.rb +15 -0
data/lib/octo/utils/scripts_manager.rb +59 -0
data/lib/octo/utils/string_matcher.rb +158 -0
data/lib/octo/utils/trash_directory.rb +112 -0
data/lib/octo/utils/workspace_rules.rb +46 -0
data/lib/octo/version.rb +5 -0
data/lib/octo/web/app.css +7141 -0
data/lib/octo/web/app.js +543 -0
data/lib/octo/web/apple-touch-icon.png +0 -0
data/lib/octo/web/auth.js +150 -0
data/lib/octo/web/channels.js +276 -0
data/lib/octo/web/datepicker.js +205 -0
data/lib/octo/web/favicon.png +0 -0
data/lib/octo/web/i18n.js +1073 -0
data/lib/octo/web/icon-512.png +0 -0
data/lib/octo/web/icon-dark.svg +25 -0
data/lib/octo/web/icon.svg +29 -0
data/lib/octo/web/index.html +871 -0
data/lib/octo/web/marked.min.js +69 -0
data/lib/octo/web/onboard.js +491 -0
data/lib/octo/web/profile.js +442 -0
data/lib/octo/web/sessions.js +4421 -0
data/lib/octo/web/settings.js +913 -0
data/lib/octo/web/sidebar.js +32 -0
data/lib/octo/web/skills.js +885 -0
data/lib/octo/web/tasks.js +297 -0
data/lib/octo/web/theme.js +105 -0
data/lib/octo/web/trash.js +343 -0
data/lib/octo/web/vendor/hljs/highlight.min.js +1244 -0
data/lib/octo/web/vendor/hljs/hljs-theme.css +95 -0
data/lib/octo/web/vendor/katex/auto-render.min.js +1 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_AMS-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Bold.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Caligraphic-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Bold.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Fraktur-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Bold.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-BoldItalic.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Italic.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Main-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-BoldItalic.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Math-Italic.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Bold.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Italic.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_SansSerif-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Script-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Size1-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Size2-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Size3-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Size4-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/fonts/KaTeX_Typewriter-Regular.woff2 +0 -0
data/lib/octo/web/vendor/katex/katex.min.css +1 -0
data/lib/octo/web/vendor/katex/katex.min.js +1 -0
data/lib/octo/web/version.js +449 -0
data/lib/octo/web/weixin-qr.html +209 -0
data/lib/octo/web/ws-dispatcher.js +357 -0
data/lib/octo/web/ws.js +128 -0
data/lib/octo.rb +145 -0
data/scripts/build/build.sh +329 -0
data/scripts/build/lib/apt.sh +56 -0
data/scripts/build/lib/brew.sh +89 -0
data/scripts/build/lib/colors.sh +17 -0
data/scripts/build/lib/gem.sh +95 -0
data/scripts/build/lib/mise.sh +125 -0
data/scripts/build/lib/network.sh +157 -0
data/scripts/build/lib/os.sh +57 -0
data/scripts/build/lib/shell.sh +37 -0
data/scripts/build/src/install.sh.cc +174 -0
data/scripts/build/src/install_browser.sh.cc +101 -0
data/scripts/build/src/install_full.sh.cc +290 -0
data/scripts/build/src/install_rails_deps.sh.cc +145 -0
data/scripts/build/src/install_system_deps.sh.cc +123 -0
data/scripts/build/src/uninstall.sh.cc +101 -0
data/scripts/install.ps1 +532 -0
data/scripts/install.sh +567 -0
data/scripts/install_browser.sh +479 -0
data/scripts/install_full.sh +838 -0
data/scripts/install_rails_deps.sh +746 -0
data/scripts/install_system_deps.sh +518 -0
data/scripts/uninstall.sh +287 -0
data/sig/octo.rbs +4 -0
metadata +614 -0

data/lib/octo/default_skills/skill-creator/scripts/run_loop.py ADDED Viewed

@@ -0,0 +1,331 @@
+#!/usr/bin/env python3
+"""Run the eval + improve loop until all pass or max iterations reached.
+Combines run_eval.py and improve_description.py in a loop, tracking history
+and returning the best description found. Supports train/test split to prevent
+overfitting.
+Octo adaptation:
+- Queries execute serially (no parallel workers; --num-workers ignored)
+- Model comes from ~/.octo/config.yml (--model is kept for compat but ignored)
+- Skill dir: ~/.octo/skills/
+"""
+import argparse
+import json
+import random
+import sys
+import tempfile
+import time
+import webbrowser
+from pathlib import Path
+from scripts.generate_report import generate_html
+from scripts.improve_description import improve_description
+from scripts.run_eval import find_project_root, run_eval
+from scripts.utils import parse_skill_md
+def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
+    """Split eval set into train and test sets, stratified by should_trigger."""
+    random.seed(seed)
+    trigger = [e for e in eval_set if e["should_trigger"]]
+    no_trigger = [e for e in eval_set if not e["should_trigger"]]
+    random.shuffle(trigger)
+    random.shuffle(no_trigger)
+    n_trigger_test = max(1, int(len(trigger) * holdout))
+    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))
+    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
+    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
+    return train_set, test_set
+def run_loop(
+    eval_set: list[dict],
+    skill_path: Path,
+    description_override: str | None,
+    timeout: int,
+    max_iterations: int,
+    runs_per_query: int,
+    trigger_threshold: float,
+    holdout: float,
+    verbose: bool,
+    live_report_path: Path | None = None,
+    log_dir: Path | None = None,
+    # Legacy params (kept for API compat, ignored in Octo)
+    num_workers: int = 1,
+    model: str = "",
+) -> dict:
+    """Run the eval + improvement loop."""
+    project_root = find_project_root()
+    name, original_description, content = parse_skill_md(skill_path)
+    current_description = description_override or original_description
+    # Split into train/test if holdout > 0
+    if holdout > 0:
+        train_set, test_set = split_eval_set(eval_set, holdout)
+        if verbose:
+            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
+    else:
+        train_set = eval_set
+        test_set = []
+    history = []
+    exit_reason = "unknown"
+    for iteration in range(1, max_iterations + 1):
+        if verbose:
+            print(f"\n{'='*60}", file=sys.stderr)
+            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
+            print(f"Description: {current_description}", file=sys.stderr)
+            print(f"{'='*60}", file=sys.stderr)
+        # Run eval on all queries (train + test) serially
+        all_queries = train_set + test_set
+        t0 = time.time()
+        all_results = run_eval(
+            eval_set=all_queries,
+            skill_name=name,
+            description=current_description,
+            timeout=timeout,
+            project_root=project_root,
+            runs_per_query=runs_per_query,
+            trigger_threshold=trigger_threshold,
+        )
+        eval_elapsed = time.time() - t0
+        # Split results back into train/test by matching queries
+        train_queries_set = {q["query"] for q in train_set}
+        train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
+        test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
+        train_passed = sum(1 for r in train_result_list if r["pass"])
+        train_total = len(train_result_list)
+        train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
+        train_results = {"results": train_result_list, "summary": train_summary}
+        if test_set:
+            test_passed = sum(1 for r in test_result_list if r["pass"])
+            test_total = len(test_result_list)
+            test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
+            test_results = {"results": test_result_list, "summary": test_summary}
+        else:
+            test_results = None
+            test_summary = None
+        history.append({
+            "iteration": iteration,
+            "description": current_description,
+            "train_passed": train_summary["passed"],
+            "train_failed": train_summary["failed"],
+            "train_total": train_summary["total"],
+            "train_results": train_results["results"],
+            "test_passed": test_summary["passed"] if test_summary else None,
+            "test_failed": test_summary["failed"] if test_summary else None,
+            "test_total": test_summary["total"] if test_summary else None,
+            "test_results": test_results["results"] if test_results else None,
+            # Backward compat with report generator
+            "passed": train_summary["passed"],
+            "failed": train_summary["failed"],
+            "total": train_summary["total"],
+            "results": train_results["results"],
+        })
+        # Write live report if path provided
+        if live_report_path:
+            partial_output = {
+                "original_description": original_description,
+                "best_description": current_description,
+                "best_score": "in progress",
+                "iterations_run": len(history),
+                "holdout": holdout,
+                "train_size": len(train_set),
+                "test_size": len(test_set),
+                "history": history,
+            }
+            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
+        if verbose:
+            def print_eval_stats(label, results, elapsed):
+                pos = [r for r in results if r["should_trigger"]]
+                neg = [r for r in results if not r["should_trigger"]]
+                tp = sum(r["triggers"] for r in pos)
+                pos_runs = sum(r["runs"] for r in pos)
+                fn = pos_runs - tp
+                fp = sum(r["triggers"] for r in neg)
+                neg_runs = sum(r["runs"] for r in neg)
+                tn = neg_runs - fp
+                total = tp + tn + fp + fn
+                precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
+                recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
+                accuracy = (tp + tn) / total if total > 0 else 0.0
+                print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
+                for r in results:
+                    status = "PASS" if r["pass"] else "FAIL"
+                    rate_str = f"{r['triggers']}/{r['runs']}"
+                    print(f"  [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)
+            print_eval_stats("Train", train_results["results"], eval_elapsed)
+            if test_summary:
+                print_eval_stats("Test ", test_results["results"], 0)
+        if train_summary["failed"] == 0:
+            exit_reason = f"all_passed (iteration {iteration})"
+            if verbose:
+                print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr)
+            break
+        if iteration == max_iterations:
+            exit_reason = f"max_iterations ({max_iterations})"
+            if verbose:
+                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
+            break
+        # Improve description based on train results
+        if verbose:
+            print(f"\nImproving description...", file=sys.stderr)
+        t0 = time.time()
+        # Blind history to test scores so improvement model can't overfit to them
+        blinded_history = [
+            {k: v for k, v in h.items() if not k.startswith("test_")}
+            for h in history
+        ]
+        new_description = improve_description(
+            skill_name=name,
+            skill_content=content,
+            current_description=current_description,
+            eval_results=train_results,
+            history=blinded_history,
+            model=model,  # ignored internally; model comes from config.yml
+            log_dir=log_dir,
+            iteration=iteration,
+        )
+        improve_elapsed = time.time() - t0
+        if verbose:
+            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
+        current_description = new_description
+    # Find the best iteration by TEST score (or train if no test set)
+    if test_set:
+        best = max(history, key=lambda h: h["test_passed"] or 0)
+        best_score = f"{best['test_passed']}/{best['test_total']}"
+    else:
+        best = max(history, key=lambda h: h["train_passed"])
+        best_score = f"{best['train_passed']}/{best['train_total']}"
+    if verbose:
+        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
+        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)
+    return {
+        "exit_reason": exit_reason,
+        "original_description": original_description,
+        "best_description": best["description"],
+        "best_score": best_score,
+        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
+        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
+        "final_description": current_description,
+        "iterations_run": len(history),
+        "holdout": holdout,
+        "train_size": len(train_set),
+        "test_size": len(test_set),
+        "history": history,
+    }
+def main():
+    parser = argparse.ArgumentParser(description="Run eval + improve loop (Octo)")
+    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
+    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
+    parser.add_argument("--description", default=None, help="Override starting description")
+    parser.add_argument("--timeout", type=int, default=45, help="Timeout per query in seconds")
+    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
+    parser.add_argument("--runs-per-query", type=int, default=1, help="Number of runs per query (serially)")
+    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
+    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction to hold out for testing (0 to disable)")
+    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
+    parser.add_argument("--report", default="auto", help="HTML report path ('auto'=temp file, 'none'=disable)")
+    parser.add_argument("--results-dir", default=None, help="Save results.json + report.html to a timestamped subdir here")
+    # Ignored legacy args (kept for CLI compat)
+    parser.add_argument("--num-workers", type=int, default=1, help="Ignored — Octo runs serially")
+    parser.add_argument("--model", default="", help="Ignored — model comes from ~/.octo/config.yml")
+    args = parser.parse_args()
+    eval_set = json.loads(Path(args.eval_set).read_text())
+    skill_path = Path(args.skill_path)
+    if not (skill_path / "SKILL.md").exists():
+        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
+        sys.exit(1)
+    name, _, _ = parse_skill_md(skill_path)
+    # Set up live report path
+    if args.report != "none":
+        if args.report == "auto":
+            timestamp = time.strftime("%Y%m%d_%H%M%S")
+            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
+        else:
+            live_report_path = Path(args.report)
+        live_report_path.write_text(
+            "<html><body><h1>Starting optimization loop...</h1>"
+            "<meta http-equiv='refresh' content='5'></body></html>"
+        )
+        webbrowser.open(str(live_report_path))
+    else:
+        live_report_path = None
+    # Determine output directory
+    if args.results_dir:
+        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
+        results_dir = Path(args.results_dir) / timestamp
+        results_dir.mkdir(parents=True, exist_ok=True)
+    else:
+        results_dir = None
+    log_dir = results_dir / "logs" if results_dir else None
+    output = run_loop(
+        eval_set=eval_set,
+        skill_path=skill_path,
+        description_override=args.description,
+        timeout=args.timeout,
+        max_iterations=args.max_iterations,
+        runs_per_query=args.runs_per_query,
+        trigger_threshold=args.trigger_threshold,
+        holdout=args.holdout,
+        verbose=args.verbose,
+        live_report_path=live_report_path,
+        log_dir=log_dir,
+        num_workers=args.num_workers,
+        model=args.model,
+    )
+    # Output JSON
+    json_output = json.dumps(output, indent=2)
+    print(json_output)
+    if results_dir:
+        (results_dir / "results.json").write_text(json_output)
+    # Write final HTML report
+    if live_report_path:
+        live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
+        print(f"\nReport: {live_report_path}", file=sys.stderr)
+    if results_dir and live_report_path:
+        (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))
+    if results_dir:
+        print(f"Results saved to: {results_dir}", file=sys.stderr)
+if __name__ == "__main__":
+    main()

data/lib/octo/default_skills/skill-creator/scripts/utils.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Shared utilities for skill-creator scripts."""
+from pathlib import Path
+def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
+    """Parse a SKILL.md file, returning (name, description, full_content)."""
+    content = (skill_path / "SKILL.md").read_text()
+    lines = content.split("\n")
+    if lines[0].strip() != "---":
+        raise ValueError("SKILL.md missing frontmatter (no opening ---)")
+    end_idx = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line.strip() == "---":
+            end_idx = i
+            break
+    if end_idx is None:
+        raise ValueError("SKILL.md missing frontmatter (no closing ---)")
+    name = ""
+    description = ""
+    frontmatter_lines = lines[1:end_idx]
+    i = 0
+    while i < len(frontmatter_lines):
+        line = frontmatter_lines[i]
+        if line.startswith("name:"):
+            name = line[len("name:"):].strip().strip('"').strip("'")
+        elif line.startswith("description:"):
+            value = line[len("description:"):].strip()
+            # Handle YAML multiline indicators (>, |, >-, |-)
+            if value in (">", "|", ">-", "|-"):
+                continuation_lines: list[str] = []
+                i += 1
+                while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith("  ") or frontmatter_lines[i].startswith("\t")):
+                    continuation_lines.append(frontmatter_lines[i].strip())
+                    i += 1
+                description = " ".join(continuation_lines)
+                continue
+            else:
+                description = value.strip('"').strip("'")
+        i += 1
+    return name, description, content

data/lib/octo/default_skills/skill-creator/scripts/validate_skill_frontmatter.rb ADDED Viewed

@@ -0,0 +1,143 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+# validate_skill_frontmatter.rb
+#
+# Validates and auto-fixes the YAML frontmatter of a SKILL.md file.
+#
+# Usage:
+#   ruby validate_skill_frontmatter.rb <path/to/SKILL.md>
+#
+# What it does:
+#   1. Parses the frontmatter between --- delimiters
+#   2. If YAML is invalid OR description is not a plain String:
+#      - Extracts name/description via regex fallback
+#      - Re-wraps description in single quotes (collapsed to one line)
+#      - Rewrites the frontmatter in the file
+#   3. Exits 0 on success (with or without auto-fix), 1 on unrecoverable error
+require "yaml"
+path = ARGV[0]
+if path.nil? || path.strip.empty?
+  warn "Usage: ruby validate_skill_frontmatter.rb <path/to/SKILL.md>"
+  exit 1
+end
+unless File.exist?(path)
+  warn "File not found: #{path}"
+  exit 1
+end
+content = File.read(path)
+# Extract frontmatter block
+fm_match = content.match(/\A(---\n)(.*?)(\n---[ \t]*\n?)/m)
+unless fm_match
+  warn "ERROR: No frontmatter block found in #{path}"
+  exit 1
+end
+prefix      = fm_match[1]          # "---\n"
+yaml_raw    = fm_match[2]          # raw YAML text
+suffix      = fm_match[3]          # "\n---\n"
+body        = content[fm_match.end(0)..]  # rest of file after frontmatter
+# Attempt normal YAML parse
+parse_ok = false
+data = nil
+begin
+  data = YAML.safe_load(yaml_raw) || {}
+  parse_ok = data["description"].is_a?(String)
+rescue Psych::Exception => e
+  warn "YAML parse error: #{e.message}"
+end
+if parse_ok
+  puts "OK: name=#{data['name'].inspect} description_length=#{data['description'].length}"
+  exit 0
+end
+# --- Auto-fix ---
+puts "Frontmatter invalid or description broken — attempting auto-fix..."
+# Regex fallback: extract name and description lines
+name_match = yaml_raw.match(/^name:\s*(.+)$/)
+unless name_match
+  warn "ERROR: Cannot extract 'name' field from frontmatter. Manual fix required."
+  exit 1
+end
+name_value = name_match[1].strip.gsub(/\A['"]|['"]\z/, "")
+# description may be:
+#   description: some text           (unquoted)
+#   description: 'some text'         (single-quoted)
+#   description: "some text"         (double-quoted)
+#   description: first line\n  continuation  (multi-line block scalar)
+desc_match = yaml_raw.match(/^description:\s*(.+?)(?=\n[a-z]|\z)/m)
+unless desc_match
+  warn "ERROR: Cannot extract 'description' field from frontmatter. Manual fix required."
+  exit 1
+end
+raw_desc = desc_match[1].strip
+# Strip existing outer quotes if present (simple single-line quoted values)
+if raw_desc.start_with?("'") && raw_desc.end_with?("'")
+  raw_desc = raw_desc[1..-2]
+elsif raw_desc.start_with?('"') && raw_desc.end_with?('"')
+  raw_desc = raw_desc[1..-2]
+end
+# Collapse multi-line: strip leading whitespace from continuation lines
+description_value = raw_desc.gsub(/\n\s+/, " ").strip
+# Escape any single quotes inside the description value
+description_value_escaped = description_value.gsub("'", "''")
+# Extract all other frontmatter lines (everything except name: and description:)
+other_lines = yaml_raw.each_line.reject do |line|
+  line.match?(/^(name|description):/) || line.match?(/^\s+\S/) && yaml_raw.match?(/^description:.*\n(\s+.+\n)*/m)
+end
+# More precise: collect lines that are not part of the name/description block
+remaining = []
+skip_continuation = false
+yaml_raw.each_line do |line|
+  if line.match?(/^(name|description):/)
+    skip_continuation = true
+    next
+  end
+  if skip_continuation && line.match?(/^\s+\S/)
+    next  # continuation of a multi-line block value
+  end
+  skip_continuation = false
+  remaining << line unless line.strip.empty? && remaining.empty?
+end
+# Rebuild frontmatter
+fixed_fm_lines = []
+fixed_fm_lines << "name: #{name_value}"
+fixed_fm_lines << "description: '#{description_value_escaped}'"
+remaining.each { |l| fixed_fm_lines << l.chomp }
+# Remove trailing blank lines from remaining
+fixed_fm = fixed_fm_lines.join("\n").strip
+new_content = "#{prefix}#{fixed_fm}#{suffix}#{body}"
+File.write(path, new_content)
+puts "Auto-fixed and saved: #{path}"
+# Final verification
+begin
+  verify_content = File.read(path)
+  verify_match = verify_content.match(/\A---\n(.*?)\n---/m)
+  verify_data = YAML.safe_load(verify_match[1])
+  raise "description not a String" unless verify_data["description"].is_a?(String)
+  puts "OK: name=#{verify_data['name'].inspect} description_length=#{verify_data['description'].length}"
+rescue => e
+  warn "ERROR: Auto-fix failed, manual intervention required: #{e.message}"
+  exit 1
+end

data/lib/octo/idle_compression_timer.rb ADDED Viewed

@@ -0,0 +1,115 @@
+# frozen_string_literal: true
+module Octo
+  # IdleCompressionTimer triggers memory compression after a period of inactivity.
+  #
+  # Both CLI and WebUI use the same agent-level compression logic; this class
+  # abstracts the "wait N seconds, then compress" pattern so it can be shared.
+  #
+  # Usage:
+  #   timer = IdleCompressionTimer.new(agent: agent, session_manager: sm) do |success|
+  #     # called on the compression thread after compression finishes
+  #     broadcast_update if success
+  #   end
+  #   timer.start   # call after each agent run completes
+  #   timer.cancel  # call when new user input arrives
+  class IdleCompressionTimer
+    # Seconds of inactivity before idle compression is triggered
+    IDLE_DELAY = 180
+    # Maximum seconds #cancel waits for an interrupted compression thread to exit
+    CANCEL_JOIN_TIMEOUT = 5
+    # @param agent [Octo::Agent] the agent whose messages will be compressed
+    # @param session_manager [Octo::SessionManager, nil] used to persist session after compression
+    # @param logger [#call, nil] optional logger lambda: ->(msg, level:) { ... }
+    # @param on_compress [Proc, nil] block called after compression attempt with success (bool)
+    def initialize(agent:, session_manager: nil, logger: nil, &on_compress)
+      @agent           = agent
+      @session_manager = session_manager
+      @logger          = logger
+      @on_compress     = on_compress
+      @timer_thread    = nil
+      @compress_thread = nil
+      @mutex           = Mutex.new
+    end
+    # Start (or restart) the idle timer.
+    # Cancels any existing timer first, then waits IDLE_DELAY seconds before compressing.
+    #
+    # NOTE(review): @timer_thread is assigned outside @mutex, so two threads
+    # calling #start at the same time could each leave a timer running —
+    # confirm callers only invoke this from a single thread.
+    def start
+      cancel # reset any existing timer
+      @timer_thread = Thread.new do
+        Thread.current.name = "idle-compression-timer"
+        sleep IDLE_DELAY
+        # Register @compress_thread inside the mutex BEFORE the thread starts running,
+        # so cancel() can always find and interrupt it even if it fires immediately.
+        compress_thread = nil
+        @mutex.synchronize do
+          compress_thread = Thread.new do
+            Thread.current.name = "idle-compression-work"
+            run_compression
+          end
+          @compress_thread = compress_thread
+        end
+        compress_thread.join
+        # Compression finished on its own: clear both thread references.
+        @mutex.synchronize { @compress_thread = nil; @timer_thread = nil }
+      end
+    end
+    # Cancel the timer and any in-progress compression.
+    # Kills the timer thread, raises AgentInterrupted on a live compress thread
+    # and then waits up to CANCEL_JOIN_TIMEOUT seconds for that thread to exit,
+    # giving history rollback a chance to complete before the caller starts a
+    # new agent.run.
+    def cancel
+      compress_thread_to_join = nil
+      @mutex.synchronize do
+        @timer_thread&.kill
+        if @compress_thread&.alive?
+          @compress_thread.raise(Octo::AgentInterrupted, "Idle timer cancelled")
+          compress_thread_to_join = @compress_thread
+        end
+        @timer_thread    = nil
+        @compress_thread = nil
+      end
+      # Join outside the mutex to avoid deadlock.
+      # The wait is bounded: if the compress thread has not exited within
+      # CANCEL_JOIN_TIMEOUT seconds, cancel returns anyway.
+      compress_thread_to_join&.join(CANCEL_JOIN_TIMEOUT)
+    end
+    # True if the timer or compression is currently active.
+    # Always returns a strict boolean, matching #compressing?.
+    def active?
+      @mutex.synchronize { @timer_thread&.alive? || @compress_thread&.alive? || false }
+    end
+    # True only when compression work is actually in flight (not during the
+    # pre-compression idle countdown). Used by callers that want to treat
+    # Ctrl+C during active compression as "stop compressing" rather than
+    # "exit the program".
+    def compressing?
+      @mutex.synchronize { @compress_thread&.alive? || false }
+    end
+    # Runs one compression attempt on the current thread: triggers the agent's
+    # idle compression, saves the session when it succeeded and a session
+    # manager was given, then reports the outcome to the on_compress block.
+    # AgentInterrupted and any other StandardError are logged and reported to
+    # the block as false rather than propagated.
+    private def run_compression
+      success = @agent.trigger_idle_compression
+      if success && @session_manager
+        @session_manager.save(@agent.to_session_data(status: :success))
+      end
+      @on_compress&.call(success)
+    rescue Octo::AgentInterrupted
+      log("Idle compression cancelled", level: :info)
+      @on_compress&.call(false)
+    rescue => e
+      log("Idle compression error: #{e.message}", level: :error)
+      @on_compress&.call(false)
+    end
+    # Forwards a message to the optional logger; does nothing when none was given.
+    private def log(message, level: :info)
+      @logger&.call(message, level: level)
+    end
+  end
+end