RubyGems - ruby-claw - Versions diffs - 0.1.2 → 0.2.0 - Mend

ruby-claw 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +94 -0
data/README.md +214 -10
data/exe/claw +42 -1
data/lib/claw/auto_forge.rb +66 -0
data/lib/claw/benchmark/benchmark.rb +79 -0
data/lib/claw/benchmark/diff.rb +69 -0
data/lib/claw/benchmark/report.rb +87 -0
data/lib/claw/benchmark/runner.rb +91 -0
data/lib/claw/benchmark/scorer.rb +69 -0
data/lib/claw/benchmark/task.rb +63 -0
data/lib/claw/benchmark/tasks/claw_remember.rb +20 -0
data/lib/claw/benchmark/tasks/claw_session.rb +18 -0
data/lib/claw/benchmark/tasks/evolution_trace.rb +18 -0
data/lib/claw/benchmark/tasks/mana_call_func.rb +21 -0
data/lib/claw/benchmark/tasks/mana_eval.rb +18 -0
data/lib/claw/benchmark/tasks/mana_knowledge.rb +19 -0
data/lib/claw/benchmark/tasks/mana_var_readwrite.rb +18 -0
data/lib/claw/benchmark/tasks/runtime_fork.rb +18 -0
data/lib/claw/benchmark/tasks/runtime_snapshot.rb +18 -0
data/lib/claw/benchmark/trigger.rb +68 -0
data/lib/claw/chat.rb +119 -6
data/lib/claw/child_runtime.rb +196 -0
data/lib/claw/cli.rb +177 -0
data/lib/claw/commands.rb +131 -0
data/lib/claw/config.rb +5 -1
data/lib/claw/console/event_logger.rb +69 -0
data/lib/claw/console/public/app.js +264 -0
data/lib/claw/console/public/style.css +330 -0
data/lib/claw/console/server.rb +253 -0
data/lib/claw/console/sse.rb +28 -0
data/lib/claw/console/views/experiments.erb +8 -0
data/lib/claw/console/views/index.erb +27 -0
data/lib/claw/console/views/layout.erb +29 -0
data/lib/claw/console/views/memory.erb +13 -0
data/lib/claw/console/views/monitor.erb +15 -0
data/lib/claw/console/views/prompt.erb +15 -0
data/lib/claw/console/views/snapshots.erb +12 -0
data/lib/claw/console/views/tools.erb +13 -0
data/lib/claw/console/views/traces.erb +9 -0
data/lib/claw/console.rb +5 -0
data/lib/claw/evolution.rb +227 -0
data/lib/claw/forge.rb +144 -0
data/lib/claw/hub.rb +67 -0
data/lib/claw/init.rb +199 -0
data/lib/claw/knowledge.rb +36 -2
data/lib/claw/memory_store.rb +2 -2
data/lib/claw/plan_mode.rb +110 -0
data/lib/claw/resource.rb +35 -0
data/lib/claw/resources/binding_resource.rb +128 -0
data/lib/claw/resources/context_resource.rb +73 -0
data/lib/claw/resources/filesystem_resource.rb +107 -0
data/lib/claw/resources/memory_resource.rb +74 -0
data/lib/claw/resources/worktree_resource.rb +133 -0
data/lib/claw/roles.rb +56 -0
data/lib/claw/runtime.rb +189 -0
data/lib/claw/serializer.rb +10 -7
data/lib/claw/tool.rb +99 -0
data/lib/claw/tool_index.rb +84 -0
data/lib/claw/tool_registry.rb +100 -0
data/lib/claw/trace.rb +86 -0
data/lib/claw/tui/agent_executor.rb +92 -0
data/lib/claw/tui/chat_panel.rb +81 -0
data/lib/claw/tui/command_bar.rb +22 -0
data/lib/claw/tui/file_card.rb +88 -0
data/lib/claw/tui/folding.rb +80 -0
data/lib/claw/tui/input_handler.rb +73 -0
data/lib/claw/tui/layout.rb +34 -0
data/lib/claw/tui/messages.rb +31 -0
data/lib/claw/tui/model.rb +411 -0
data/lib/claw/tui/object_explorer.rb +136 -0
data/lib/claw/tui/status_bar.rb +30 -0
data/lib/claw/tui/status_panel.rb +133 -0
data/lib/claw/tui/styles.rb +58 -0
data/lib/claw/tui/tui.rb +54 -0
data/lib/claw/version.rb +1 -1
data/lib/claw.rb +99 -1
metadata +223 -7

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 14400b4f156bbcd289918bf982f233f7c1055a246df1efac571b2e4852ef5752
-  data.tar.gz: 6f9cb9cee99ae272c0605c80f2256686ac256d7fa8c2cf4a3e19bdb330beee92
+  metadata.gz: a656bdeb580e8d10ca17f5e6f74a7406027da83488c4b1d256a4dd5e4e93de01
+  data.tar.gz: 6c4474049cad5f10148c9dd9b5960e6b1b521ecaef73720a326e796861a58a92
 SHA512:
-  metadata.gz: 6e7401c9af8cbe84fff2d677d34b4aab9d939e2e89e6355815a67fee4a5fcbf293c55f892bf144c096a78c8979d2d1d2eb52308a2ce4a2264f3d8456cb42a53b
-  data.tar.gz: 80ddb09282da9a9ec07ea6eb6a2ca4aa2282c61e1c3dd4338545873eb8fbb2a145955422600ea1eaa4829508818743adb2bd86be50ad1a6a8e70211b858919b8
+  metadata.gz: 68810da4b3c804ff7bd61fa9d4662ce622ad20380dc91222c979ae6ab5b82fd311f5c56ec3e96702df4bcec8790015f4b4ac5b81aeb8f5187cc3d41c7eba3f9c
+  data.tar.gz: 559a0c8d419cc42215a2d96d78575d3b38e14b9f4915e14859da25dbdf3f05fed591b895cf77261b8793555b8fb6faa61b7f67b56a9a5e95c5dc24a35e11a294

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,99 @@
 # Changelog
+## [0.2.0] - 2026-04-06
+### Added
+- **Three-layer tool system** (V9): core (always loaded), project (on-demand), hub (remote)
+  - `Claw::Tool` mixin with declarative DSL: `tool_name`, `description`, `parameter`
+  - `Claw::ToolIndex` — regex-based file scanning of `.ruby-claw/tools/*.rb` without require
+  - `Claw::ToolRegistry` — manages tool lifecycle: search, load, unload, register with Mana
+  - `search_tools` and `load_tool` agent-facing Mana tools for dynamic discovery
+- `Claw::Forge` — `/forge <method_name>` promotes eval-defined methods to formal tool classes
+- `Claw::AutoForge` — detects repeated eval patterns in traces, suggests tool promotion
+- `Claw::Hub` — HTTP client for community tool hub (search + download)
+- **Web Console** (V10): local Sinatra-based observability UI at localhost:4567
+  - `Claw::Console::Server` — Sinatra app with 8 page routes + full REST API
+  - `Claw::Console::EventLogger` — structured JSONL append-only event log with Mutex
+  - `Claw::Console::SSE` — Server-Sent Events streaming for real-time monitoring
+  - Pages: Dashboard, Prompt Inspector, LLM Monitor, Trace Explorer, Memory, Tools, Snapshots, Experiments
+  - API endpoints: GET/POST for status, events, traces, memory, prompt, tools, snapshots
+- `claw console [--port N]` CLI subcommand
+- Proactive `remember` tool guidance in system prompt
+### Fixed
+- Path traversal vulnerability in `/api/traces/:id` — now validates IDs
+- Hub download path sanitization — prevents directory traversal via tool names
+- Console POST endpoints now validate JSON and required fields
+- CLI `--port` parsing handles missing argument
+- `Forge` filename sanitization handles uppercase method names
+- `Claw.reset!` now clears `Tool.tool_classes` to prevent test leaks
+## [0.1.8] - 2026-04-05
+### Added
+- `Claw::ChildRuntime` — multi-agent parent-child architecture with isolated threads
+- `Claw::Resources::WorktreeResource` — git worktree isolation for child agents
+- `Runtime#fork_async` spawns child agents with deep-copied variables and optional role/model override
+- Child lifecycle: `start!` / `join` / `cancel!` / `diff` / `merge!` with Mutex-based thread safety
+- Resource `merge_from!` interface for merging child changes back to parent
+## [0.1.7] - 2026-04-05
+### Added
+- `Claw::Benchmark` framework — automated task-based evaluation of agent capabilities
+- 9 built-in benchmark tasks across mana, claw, runtime, and evolution layers
+- `Claw::Benchmark::Scorer` — scoring formula: correctness, rounds, tokens, tool path (Levenshtein)
+- `Claw::Benchmark::Report` — Markdown report generation with per-task and per-layer breakdown
+- `Claw::Benchmark::Diff` — compare two benchmark reports
+- `Claw::Benchmark::Trigger` — auto-triggers evolution on score regression or 3 consecutive failures
+- CLI: `claw benchmark run`, `claw benchmark diff <a> <b>`
+## [0.1.6] - 2026-04-05
+### Added
+- Full-screen TUI built on Charm Ruby (bubbletea, lipgloss, bubbles, glamour)
+- MVU architecture: Model/Update/View with 4-zone layout (status bar, chat panel, status panel, command bar)
+- `Claw::PlanMode` — two-phase plan-then-execute workflow with fork safety
+- `Claw::Roles` — agent identity management via `.ruby-claw/roles/*.md`
+- `Claw::Commands` — extracted pure-function slash command module
+- `Claw::CLI` — headless CLI for non-interactive subcommands
+- TUI modules: syntax highlighting, tab completion, object explorer, file cards, text folding
+- CLI subcommands: `claw status`, `claw history`, `claw rollback`, `claw trace`, `claw evolve`, `claw benchmark`
+### Changed
+- Default `claw` entry point now launches TUI instead of legacy REPL
+- `Chat.start` delegates to `TUI.start` for backward compatibility
+- `claw init` now creates `roles/` directory with default role
+## [0.1.5] - 2026-04-05
+### Added
+- `Claw::Evolution` — self-evolution loop: reads traces, LLM diagnosis, fork/apply/test/keep-or-rollback
+- `/evolve` REPL command to trigger an evolution cycle
+- Evolution logs written to `.ruby-claw/evolution/`
+## [0.1.4] - 2026-04-05
+### Added
+- `Claw::Init` — `claw init` scaffolds a new project with editable gem source
+- Clones ruby-claw and ruby-mana to `.ruby-claw/gems/`
+- Generates Gemfile with `path:` references, `system_prompt.md`, empty `MEMORY.md`
+- Initializes git repo in `.ruby-claw/` with initial commit
+- CLI subcommands: `claw init`, `claw version`, `claw help`
+## [0.1.3] - 2026-04-05
+### Added
+- `Claw::Trace` — writes per-task Markdown trace files to `.ruby-claw/traces/`
+- Traces capture timing, token usage, and tool call details per LLM iteration
+- Auto-writes traces after each chat execution
+### Changed
+- Serializer `encode_value` now uses `MarshalMd.dump` instead of `Marshal.dump`
+- Backward compatibility: old `"marshal"` type entries still decoded via `Marshal.load`
+- `BindingResource` and all resources use MarshalMd for deep copy
+- Added `marshal-md` gem dependency
 ## [0.1.2] - 2026-04-04
 ### Changed

data/README.md CHANGED Viewed

@@ -14,12 +14,14 @@ gem install ruby-claw
 ## Features
-### Interactive Chat REPL
+### Interactive TUI
+Running `claw` launches a full-screen terminal UI (built on Charm Ruby's bubbletea) with 4 zones: top status bar, left chat panel, right status panel, and bottom command bar.
+`Claw.chat` still works for the legacy REPL mode:
 ```ruby
 require "claw"
 Claw.chat
 ```
-Or from command line: `claw`
 - Auto-detects Ruby code vs natural language
 - Streaming output with markdown rendering
@@ -27,16 +29,22 @@ Or from command line: `claw`
 - Session persists across restarts
 ### Persistent Memory
-Claw stores memories as human-readable Markdown in `.mana/`:
+Claw stores memories as human-readable Markdown in `.ruby-claw/`:
 ```
-.mana/
-  MEMORY.md       # Long-term facts (editable!)
-  session.md      # Conversation summary
-  values.json     # Variable snapshots
-  definitions.rb  # Method definitions
+.ruby-claw/
+  MEMORY.md          # Long-term facts (editable!)
+  session.md         # Conversation summary
+  system_prompt.md   # Custom agent personality
+  values.json        # Variable snapshots
+  definitions.rb     # Method definitions
   log/
-    2026-03-29.md  # Daily interaction log
+    2026-03-29.md    # Daily interaction log
+  traces/
+    20260405_103000.md  # Execution traces
+  evolution/
+    20260405_accept.md  # Evolution logs
+  gems/              # Editable gem source (after claw init)
 ```
 The LLM can `remember` facts that persist across sessions:
@@ -62,9 +70,192 @@ claw> greet("world")  # => "Hello world"
 ### Memory Compaction
 When conversation grows large, old messages are automatically summarized in the background.
+### Incognito Mode
+Temporarily disable memory loading and saving:
+```ruby
+Claw.incognito do
+  ~"translate <text> to French, store in <french>"
+  # No memories loaded, nothing remembered
+end
+Claw::Memory.incognito?  # => true inside the block
+```
 ### Keyword Memory Search
 With many memories (>20), only the most relevant are injected into prompts.
+### Reversible Runtime
+Snapshot and roll back the entire agent state (context, memory, variables, filesystem):
+```
+claw> /snapshot before-refactor
+  ✓ snapshot #2 created (before-refactor)
+claw> # ... make changes ...
+claw> /rollback 2
+  ✓ rolled back to snapshot #2
+```
+**REPL commands:**
+| Command | Description |
+|---------|-------------|
+| `/snapshot [label]` | Snapshot all resources |
+| `/rollback <id>` | Rollback to a snapshot |
+| `/diff [id_a id_b]` | Show diff between snapshots |
+| `/history` | List all snapshots |
+| `/status` | Show current resource state |
+| `/evolve` | Run a self-evolution cycle |
+| `/role <name>` | Switch agent role/identity |
+| `/forge <method>` | Promote a method to a formal tool |
+### Plan Mode
+`/plan` toggles plan mode. When active, the LLM generates a step-by-step plan without executing any tools. The user reviews the proposed steps, then confirms execution -- which runs in a safe fork so the original state is preserved if anything goes wrong.
+### Roles
+Role files are Markdown documents stored in `.ruby-claw/roles/`. Each role defines an agent identity (system prompt, constraints, tool permissions).
+- `/role <name>` switches the active agent identity at runtime
+- `claw init` creates a default role
+### Benchmark
+`claw benchmark run` executes the benchmark suite -- 9 built-in tasks spanning the mana, claw, runtime, and evolution layers. Each task runs 3 times, and scoring covers:
+- **Correctness** -- did the agent produce the right result?
+- **Rounds efficiency** -- how many LLM round-trips were needed?
+- **Token efficiency** -- total token usage
+- **Tool path accuracy** -- did the agent call the expected tools in the expected order?
+`claw benchmark diff <a> <b>` compares two benchmark reports side by side. Separately, `Claw::Benchmark::Trigger` auto-triggers an evolution cycle on a score regression or after 3 consecutive failures.
+### Multi-Agent
+`runtime.fork_async(prompt:, vars:, role:)` spawns a child agent that runs in an isolated thread with deep-copied variables and an optional git worktree for filesystem isolation.
+Child lifecycle methods:
+- `child.join` -- block until the child finishes
+- `child.cancel!` -- abort the child
+- `child.diff` -- inspect changes made by the child
+- `child.merge!` -- merge the child's results back into the parent
+All operations are thread-safe with Mutex protection.
+### Execution Traces
+Every LLM interaction is logged as a Markdown file in `.ruby-claw/traces/`:
+```markdown
+# Task: compute average of numbers
+- Model: claude-sonnet-4-20250514
+- Steps: 2
+- Total tokens: 1100 in / 350 out
+- Total latency: 1400ms
+## Step 1
+- Latency: 800ms
+- Tokens: 500 in / 200 out
+### Tool calls
+- **read_var**(name: "numbers") -> [1, 2, 3]
+```
+### Tool System
+Claw has a three-layer tool architecture:
+1. **Core tools** (always loaded): `read_var`, `write_var`, `call_func`, `eval`, `remember`, `search_tools`, `load_tool`
+2. **Project tools** (on-demand): `.ruby-claw/tools/*.rb` — indexed at startup, loaded via `load_tool`
+3. **Hub tools** (remote): community tools from a ruby-claw-toolhub, downloaded on demand
+Create a project tool:
+```ruby
+# .ruby-claw/tools/format_report.rb
+class FormatReport
+  include Claw::Tool
+  tool_name   "format_report"
+  description "Format raw data into a readable report"
+  parameter   :data,  type: "Hash",   required: true,  desc: "Raw data"
+  parameter   :style, type: "String", required: false, desc: "brief or detailed"
+  def call(data:, style: "brief")
+    # ...
+  end
+end
+```
+The agent discovers tools via `search_tools` and loads them via `load_tool`. Use `/forge <method_name>` to promote an eval-defined method into a formal tool class.
+### Web Console
+`claw console` launches a local web UI at `http://127.0.0.1:4567` for observability and operations:
+- **Dashboard** — version, tool/memory/snapshot counts
+- **Prompt Inspector** — view and edit the assembled system prompt
+- **LLM Monitor** — real-time event stream via Server-Sent Events
+- **Trace Explorer** — browse execution traces
+- **Memory Manager** — add/remove long-term memories
+- **Tool Manager** — view core tools, load/unload project tools
+- **Snapshot Manager** — create snapshots, rollback state
+All data is served via a REST API (`/api/status`, `/api/traces`, `/api/memory`, etc.).
+### Project Scaffolding
+Initialize a project with editable gem source for self-evolution:
+```bash
+claw init
+```
+Creates:
+```
+.ruby-claw/
+  gems/
+    ruby-claw/    # Editable source
+    ruby-mana/
+  tools/            # Project tool classes
+  roles/            # Agent role definitions
+  benchmarks/       # Benchmark reports
+  system_prompt.md  # Customizable agent personality
+  MEMORY.md
+  .git/             # Filesystem snapshots
+```
+### Self-Evolution
+The agent can improve its own code:
+```
+claw> /evolve
+  ⚡ running evolution cycle...
+  ✓ accepted: Improve error message specificity
+```
+Flow: read traces → LLM diagnoses improvement → fork runtime → apply change → run tests → keep or rollback.
+Evolution logs are written to `.ruby-claw/evolution/`.
+### CLI Subcommands
+| Command | Description |
+|---------|-------------|
+| `claw` | Launch the TUI (default) |
+| `claw init` | Scaffold a new project |
+| `claw status` | Show current resource state |
+| `claw history` | List all snapshots |
+| `claw rollback <id>` | Rollback to a snapshot |
+| `claw trace [id]` | View execution traces |
+| `claw evolve` | Run a self-evolution cycle |
+| `claw benchmark run` | Run the benchmark suite |
+| `claw benchmark diff <a> <b>` | Compare two benchmark reports |
+| `claw console` | Launch the web console UI |
+| `claw version` | Print version |
+| `claw help` | Show help |
 ## Configuration
 ```ruby
@@ -75,6 +266,9 @@ Claw.configure do |c|
   c.persist_session = true       # Save/restore session across restarts
   c.memory_top_k = 10           # Max memories to inject when searching
   c.on_compact = ->(summary) { puts summary }
+  c.tools_dir = nil              # Custom tools directory (default: .ruby-claw/tools)
+  c.hub_url = nil                # Remote tool hub URL
+  c.console_port = 4567          # Web console port
 end
 # Mana config (inherited)
@@ -84,7 +278,17 @@ Mana.configure do |c|
 end
 ```
-## Relationship with ruby-mana
+## Architecture
+Claw extends mana via its tool registration interface — no monkey-patching:
+```ruby
+# Claw registers the "remember" tool into mana's engine
+Mana.register_tool(remember_tool_definition) { |input| ... }
+# Claw injects long-term memories into mana's system prompt
+Mana.register_prompt_section { |context| memory_text }
+```
 - **ruby-mana** = Embedded LLM engine (`~"..."` syntax, binding manipulation, tool calling)
 - **ruby-claw** = Agent framework (chat REPL, memory, persistence, knowledge)

data/exe/claw CHANGED Viewed

@@ -4,4 +4,45 @@
 require "dotenv/load" rescue nil # NOTE(review): a `rescue` modifier only catches StandardError; LoadError is a ScriptError, so a missing dotenv gem would still abort here — confirm intent
 require "claw"
-Claw.chat
+HELP_TEXT = <<~HELP # Usage text printed for `claw help`, `--help` and `-h`
+  Usage: claw [command]
+  Commands:
+    (none)         Start TUI (default)
+    init           Initialize a new Claw project
+    status         Print runtime state
+    history        List snapshots
+    rollback ID    Rollback to snapshot
+    trace [ID]     View execution trace (list if no ID)
+    evolve         Run evolution cycle
+    benchmark run  Run benchmark suite
+    benchmark diff A B  Compare two benchmark reports
+    console        Launch web console UI
+    version        Show version
+    help           Show this message
+HELP
+case ARGV.first # the first CLI argument selects the subcommand; nil when none was given
+when "init"
+  Claw::Init.run
+when "status"
+  Claw::CLI.run(:status)
+when "history"
+  Claw::CLI.run(:history)
+when "rollback"
+  Claw::CLI.run(:rollback, ARGV[1]) # ARGV[1] is nil when the ID is omitted; any validation is presumably done in CLI.run — not shown here
+when "trace"
+  Claw::CLI.run(:trace, ARGV[1]) # ARGV[1] is the optional trace ID (nil when omitted)
+when "evolve"
+  Claw::CLI.run(:evolve)
+when "benchmark"
+  Claw::CLI.run(:benchmark, *ARGV[1..]) # forward everything after "benchmark" (e.g. "run" or "diff A B")
+when "console"
+  Claw::CLI.run(:console, *ARGV[1..]) # forward everything after "console" as extra arguments
+when "version", "--version", "-v"
+  puts "claw #{Claw::VERSION}"
+when "help", "--help", "-h"
+  puts HELP_TEXT
+else # no argument, or any unrecognized command, starts the TUI
+  Claw::TUI.start(TOPLEVEL_BINDING)
+end

data/lib/claw/auto_forge.rb ADDED Viewed

@@ -0,0 +1,66 @@
+# frozen_string_literal: true
+module Claw
+  # Scans Markdown trace files for methods defined inside eval'd Ruby code and suggests promoting the recurring ones to tools via /forge.
+  module AutoForge
+    THRESHOLD = 3 # Minimum number of `def <name>` hits across the analyzed traces before <name> is suggested
+    class << self
+      # Count method definitions found in ```ruby fences that follow an "eval" mention in the most recent trace files.
+      #
+      # @param traces_dir [String, nil] path to .ruby-claw/traces/; nil or a non-existent directory yields []
+      # @param limit [Integer] number of recent traces to analyze (the last `limit` *.md paths in sorted order)
+      # @return [Array<Hash>] suggestions [{method_name:, occurrences:, sample_code:}], most frequent first; only names seen at least THRESHOLD times
+      def analyze(traces_dir, limit: 10)
+        return [] unless traces_dir && Dir.exist?(traces_dir)
+        files = Dir.glob(File.join(traces_dir, "*.md")).sort.last(limit) # "recent" = last by filename sort; assumes timestamp-style names — TODO confirm
+        return [] if files.empty?
+        # Per method name: how often it was defined, plus the first code snippet that defined it
+        method_counts = Hash.new { |h, k| h[k] = { count: 0, sample: nil } }
+        files.each do |file|
+          content = File.read(file)
+          # Look for eval tool calls containing method definitions; with /m the lazy `.*?` can span lines, so any ```ruby fence after an "eval" mention matches
+          content.scan(/eval.*?```ruby\s*\n(.*?)```/m).each do |match|
+            code = match[0]
+            # Extract method name from `def method_name`; \w+ stops at the first non-word char, so `def self.foo` is counted as "self" and `def ok?` as "ok" — NOTE(review): confirm this is acceptable
+            code.scan(/\bdef\s+(\w+)/).each do |name_match|
+              name = name_match[0]
+              method_counts[name][:count] += 1
+              method_counts[name][:sample] ||= code.strip # keep only the first snippet seen for this name
+            end
+          end
+        end
+        method_counts
+          .select { |_, v| v[:count] >= THRESHOLD }
+          .map { |name, v| { method_name: name, occurrences: v[:count], sample_code: v[:sample] } }
+          .sort_by { |s| -s[:occurrences] } # descending by occurrence count
+      end
+      # Quick check: are there any suggestions? Runs a full analyze with the default limit.
+      #
+      # @param traces_dir [String, nil] same as for analyze
+      # @return [Boolean] true when analyze returns at least one suggestion
+      def suggest?(traces_dir)
+        !analyze(traces_dir).empty?
+      end
+      # Format suggestions for display, one line per method with the matching /forge command.
+      #
+      # @param suggestions [Array<Hash>] hashes shaped like analyze's result (reads :method_name and :occurrences)
+      # @return [String] newline-joined text, or "" when suggestions is empty
+      def format_suggestions(suggestions)
+        return "" if suggestions.empty?
+        lines = ["Detected repeated method patterns — consider promoting to tools:"]
+        suggestions.each do |s|
+          lines << "  · #{s[:method_name]} (#{s[:occurrences]}x) — /forge #{s[:method_name]}"
+        end
+        lines.join("\n")
+      end
+    end
+  end
+end

data/lib/claw/benchmark/benchmark.rb ADDED Viewed

@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+require_relative "task"
+require_relative "scorer"
+require_relative "runner"
+require_relative "report"
+require_relative "diff"
+require_relative "trigger"
+module Claw
+  module Benchmark
+    # Run the full benchmark suite with progress output, then print the generated report and hand it to Report.save together with claw_dir.
+    #
+    # @param claw_dir [String] path to .ruby-claw directory
+    # @return [SuiteResult, nil] suite returned by Runner#run_all; nil when no benchmark tasks were found
+    def self.run!(claw_dir: ".ruby-claw")
+      tasks = load_builtin_tasks
+      if tasks.empty?
+        $stderr.puts "No benchmark tasks found."
+        return # bare return: the caller gets nil instead of a suite
+      end
+      puts "Running #{tasks.size} benchmark tasks (#{Runner::RUNS_PER_TASK} runs each)...\n\n"
+      runner = Runner.new
+      suite = runner.run_all(tasks) do |task_id, run_idx, total, completed|
+        pct = (completed.to_f / total * 100).round(0) # overall progress as a whole-number percentage
+        print "\r  [#{pct}%] #{task_id} run #{run_idx}/#{Runner::RUNS_PER_TASK}" # "\r" rewrites the same terminal line
+      end
+      puts "\n\n"
+      report_text = Report.generate(suite)
+      path = Report.save(report_text, claw_dir)
+      puts report_text
+      puts "\nReport saved to #{path}"
+      suite
+    end
+    # Compare two benchmark reports and print the comparison to stdout; prints a usage line to stderr instead when either path is nil.
+    #
+    # @param path_a [String, nil] path to the first report
+    # @param path_b [String, nil] path to the second report
+    def self.diff!(path_a, path_b)
+      unless path_a && path_b
+        $stderr.puts "Usage: claw benchmark diff <report_a> <report_b>"
+        return
+      end
+      puts Diff.compare(path_a, path_b)
+    end
+    # Load all built-in task definitions by requiring every tasks/*.rb next to this file, in sorted order.
+    #
+    # @return [Array<Task>] copy of the Tasks registry after loading; [] when the tasks directory does not exist
+    def self.load_builtin_tasks
+      tasks_dir = File.join(__dir__, "tasks")
+      return [] unless Dir.exist?(tasks_dir)
+      Dir.glob(File.join(tasks_dir, "*.rb")).sort.each { |f| require f } # presumably each file calls Tasks.register on load — task files not shown here
+      Tasks.all
+    end
+    private_class_method :load_builtin_tasks
+    # Registry for built-in tasks, held in the module's own @registry array.
+    module Tasks
+      @registry = []
+      def self.register(task)
+        @registry << task
+      end
+      def self.all
+        @registry.dup # shallow copy, so callers cannot add to or remove from the registry itself
+      end
+    end
+  end
+end

data/lib/claw/benchmark/diff.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+module Claw
+  module Benchmark
+    # Compare two benchmark reports (Markdown files) by the per-task scores parsed from them.
+    module Diff
+      # Compare two report files and build a Markdown summary: suite averages plus a per-task table.
+      #
+      # @param path_a [String] path to report A
+      # @param path_b [String] path to report B
+      # @return [String] comparison output; raises RuntimeError when either file does not exist
+      def self.compare(path_a, path_b)
+        raise "Report not found: #{path_a}" unless File.exist?(path_a)
+        raise "Report not found: #{path_b}" unless File.exist?(path_b)
+        scores_a = extract_scores(File.read(path_a))
+        scores_b = extract_scores(File.read(path_b))
+        all_tasks = (scores_a.keys + scores_b.keys).uniq.sort
+        lines = ["# Benchmark Diff\n"]
+        lines << "**A:** #{File.basename(path_a)}"
+        lines << "**B:** #{File.basename(path_b)}"
+        lines << ""
+        # Suite score comparison: mean of the task scores found in each report
+        suite_a = scores_a.values.sum / [scores_a.size, 1].max.to_f # max(size, 1) avoids dividing by zero when no scores were parsed
+        suite_b = scores_b.values.sum / [scores_b.size, 1].max.to_f
+        delta = suite_b - suite_a
+        indicator = delta > 0 ? "↑" : delta < 0 ? "↓" : "="
+        lines << "**Suite score:** #{suite_a.round(1)} → #{suite_b.round(1)} (#{indicator} #{delta.abs.round(1)})"
+        lines << ""
+        # Per-task changes, over the union of task ids from both reports
+        lines << "| Task | A | B | Delta |"
+        lines << "|------|---|---|-------|"
+        all_tasks.each do |task|
+          sa = scores_a[task] || 0 # a task missing from one report counts as 0
+          sb = scores_b[task] || 0
+          d = sb - sa
+          sign = d > 0 ? "+" : "" # negative deltas already carry their own "-"
+          lines << "| #{task} | #{sa.round(1)} | #{sb.round(1)} | #{sign}#{d.round(1)} |"
+        end
+        lines.join("\n")
+      end
+      # Extract task scores from a report Markdown file into a Hash of task id (String) => score (Float).
+      # Looks for lines like "### task_id (...)" followed by "- **Score:** N"; the first Score line after a heading wins.
+      def self.extract_scores(text)
+        scores = {}
+        current_task = nil
+        text.each_line do |line|
+          if line.match?(/^### (\S+)/) # any level-3 heading starts a new task; the id is its first whitespace-free token
+            current_task = line.match(/^### (\S+)/)[1]
+          elsif current_task && line.match?(/\*\*Score:\*\*\s*([\d.]+)/)
+            scores[current_task] = line.match(/\*\*Score:\*\*\s*([\d.]+)/)[1].to_f
+            current_task = nil # ignore further Score lines until the next heading
+          end
+        end
+        scores
+      end
+      private_class_method :extract_scores
+    end
+  end
+end