ruby-claw 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +94 -0
  3. data/README.md +214 -10
  4. data/exe/claw +42 -1
  5. data/lib/claw/auto_forge.rb +66 -0
  6. data/lib/claw/benchmark/benchmark.rb +79 -0
  7. data/lib/claw/benchmark/diff.rb +69 -0
  8. data/lib/claw/benchmark/report.rb +87 -0
  9. data/lib/claw/benchmark/runner.rb +91 -0
  10. data/lib/claw/benchmark/scorer.rb +69 -0
  11. data/lib/claw/benchmark/task.rb +63 -0
  12. data/lib/claw/benchmark/tasks/claw_remember.rb +20 -0
  13. data/lib/claw/benchmark/tasks/claw_session.rb +18 -0
  14. data/lib/claw/benchmark/tasks/evolution_trace.rb +18 -0
  15. data/lib/claw/benchmark/tasks/mana_call_func.rb +21 -0
  16. data/lib/claw/benchmark/tasks/mana_eval.rb +18 -0
  17. data/lib/claw/benchmark/tasks/mana_knowledge.rb +19 -0
  18. data/lib/claw/benchmark/tasks/mana_var_readwrite.rb +18 -0
  19. data/lib/claw/benchmark/tasks/runtime_fork.rb +18 -0
  20. data/lib/claw/benchmark/tasks/runtime_snapshot.rb +18 -0
  21. data/lib/claw/benchmark/trigger.rb +68 -0
  22. data/lib/claw/chat.rb +119 -6
  23. data/lib/claw/child_runtime.rb +196 -0
  24. data/lib/claw/cli.rb +177 -0
  25. data/lib/claw/commands.rb +131 -0
  26. data/lib/claw/config.rb +5 -1
  27. data/lib/claw/console/event_logger.rb +69 -0
  28. data/lib/claw/console/public/app.js +264 -0
  29. data/lib/claw/console/public/style.css +330 -0
  30. data/lib/claw/console/server.rb +253 -0
  31. data/lib/claw/console/sse.rb +28 -0
  32. data/lib/claw/console/views/experiments.erb +8 -0
  33. data/lib/claw/console/views/index.erb +27 -0
  34. data/lib/claw/console/views/layout.erb +29 -0
  35. data/lib/claw/console/views/memory.erb +13 -0
  36. data/lib/claw/console/views/monitor.erb +15 -0
  37. data/lib/claw/console/views/prompt.erb +15 -0
  38. data/lib/claw/console/views/snapshots.erb +12 -0
  39. data/lib/claw/console/views/tools.erb +13 -0
  40. data/lib/claw/console/views/traces.erb +9 -0
  41. data/lib/claw/console.rb +5 -0
  42. data/lib/claw/evolution.rb +227 -0
  43. data/lib/claw/forge.rb +144 -0
  44. data/lib/claw/hub.rb +67 -0
  45. data/lib/claw/init.rb +199 -0
  46. data/lib/claw/knowledge.rb +36 -2
  47. data/lib/claw/memory_store.rb +2 -2
  48. data/lib/claw/plan_mode.rb +110 -0
  49. data/lib/claw/resource.rb +35 -0
  50. data/lib/claw/resources/binding_resource.rb +128 -0
  51. data/lib/claw/resources/context_resource.rb +73 -0
  52. data/lib/claw/resources/filesystem_resource.rb +107 -0
  53. data/lib/claw/resources/memory_resource.rb +74 -0
  54. data/lib/claw/resources/worktree_resource.rb +133 -0
  55. data/lib/claw/roles.rb +56 -0
  56. data/lib/claw/runtime.rb +189 -0
  57. data/lib/claw/serializer.rb +10 -7
  58. data/lib/claw/tool.rb +99 -0
  59. data/lib/claw/tool_index.rb +84 -0
  60. data/lib/claw/tool_registry.rb +100 -0
  61. data/lib/claw/trace.rb +86 -0
  62. data/lib/claw/tui/agent_executor.rb +92 -0
  63. data/lib/claw/tui/chat_panel.rb +81 -0
  64. data/lib/claw/tui/command_bar.rb +22 -0
  65. data/lib/claw/tui/file_card.rb +88 -0
  66. data/lib/claw/tui/folding.rb +80 -0
  67. data/lib/claw/tui/input_handler.rb +73 -0
  68. data/lib/claw/tui/layout.rb +34 -0
  69. data/lib/claw/tui/messages.rb +31 -0
  70. data/lib/claw/tui/model.rb +411 -0
  71. data/lib/claw/tui/object_explorer.rb +136 -0
  72. data/lib/claw/tui/status_bar.rb +30 -0
  73. data/lib/claw/tui/status_panel.rb +133 -0
  74. data/lib/claw/tui/styles.rb +58 -0
  75. data/lib/claw/tui/tui.rb +54 -0
  76. data/lib/claw/version.rb +1 -1
  77. data/lib/claw.rb +99 -1
  78. metadata +223 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 14400b4f156bbcd289918bf982f233f7c1055a246df1efac571b2e4852ef5752
4
- data.tar.gz: 6f9cb9cee99ae272c0605c80f2256686ac256d7fa8c2cf4a3e19bdb330beee92
3
+ metadata.gz: a656bdeb580e8d10ca17f5e6f74a7406027da83488c4b1d256a4dd5e4e93de01
4
+ data.tar.gz: 6c4474049cad5f10148c9dd9b5960e6b1b521ecaef73720a326e796861a58a92
5
5
  SHA512:
6
- metadata.gz: 6e7401c9af8cbe84fff2d677d34b4aab9d939e2e89e6355815a67fee4a5fcbf293c55f892bf144c096a78c8979d2d1d2eb52308a2ce4a2264f3d8456cb42a53b
7
- data.tar.gz: 80ddb09282da9a9ec07ea6eb6a2ca4aa2282c61e1c3dd4338545873eb8fbb2a145955422600ea1eaa4829508818743adb2bd86be50ad1a6a8e70211b858919b8
6
+ metadata.gz: 68810da4b3c804ff7bd61fa9d4662ce622ad20380dc91222c979ae6ab5b82fd311f5c56ec3e96702df4bcec8790015f4b4ac5b81aeb8f5187cc3d41c7eba3f9c
7
+ data.tar.gz: 559a0c8d419cc42215a2d96d78575d3b38e14b9f4915e14859da25dbdf3f05fed591b895cf77261b8793555b8fb6faa61b7f67b56a9a5e95c5dc24a35e11a294
data/CHANGELOG.md CHANGED
@@ -1,5 +1,99 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.2.0] - 2026-04-06
4
+
5
+ ### Added
6
+ - **Three-layer tool system** (V9): core (always loaded), project (on-demand), hub (remote)
7
+ - `Claw::Tool` mixin with declarative DSL: `tool_name`, `description`, `parameter`
8
+ - `Claw::ToolIndex` — regex-based file scanning of `.ruby-claw/tools/*.rb` without require
9
+ - `Claw::ToolRegistry` — manages tool lifecycle: search, load, unload, register with Mana
10
+ - `search_tools` and `load_tool` agent-facing Mana tools for dynamic discovery
11
+ - `Claw::Forge` — `/forge <method_name>` promotes eval-defined methods to formal tool classes
12
+ - `Claw::AutoForge` — detects repeated eval patterns in traces, suggests tool promotion
13
+ - `Claw::Hub` — HTTP client for community tool hub (search + download)
14
+ - **Web Console** (V10): local Sinatra-based observability UI at localhost:4567
15
+ - `Claw::Console::Server` — Sinatra app with 8 page routes + full REST API
16
+ - `Claw::Console::EventLogger` — structured JSONL append-only event log with Mutex
17
+ - `Claw::Console::SSE` — Server-Sent Events streaming for real-time monitoring
18
+ - Pages: Dashboard, Prompt Inspector, LLM Monitor, Trace Explorer, Memory, Tools, Snapshots, Experiments
19
+ - API endpoints: GET/POST for status, events, traces, memory, prompt, tools, snapshots
20
+ - `claw console [--port N]` CLI subcommand
21
+ - Proactive `remember` tool guidance in system prompt
22
+
23
+ ### Fixed
24
+ - Path traversal vulnerability in `/api/traces/:id` — now validates IDs
25
+ - Hub download path sanitization — prevents directory traversal via tool names
26
+ - Console POST endpoints now validate JSON and required fields
27
+ - CLI `--port` parsing handles missing argument
28
+ - `Forge` filename sanitization handles uppercase method names
29
+ - `Claw.reset!` now clears `Tool.tool_classes` to prevent test leaks
30
+
31
+ ## [0.1.8] - 2026-04-05
32
+
33
+ ### Added
34
+ - `Claw::ChildRuntime` — multi-agent parent-child architecture with isolated threads
35
+ - `Claw::Resources::WorktreeResource` — git worktree isolation for child agents
36
+ - `Runtime#fork_async` spawns child agents with deep-copied variables and optional role/model override
37
+ - Child lifecycle: `start!` / `join` / `cancel!` / `diff` / `merge!` with Mutex-based thread safety
38
+ - Resource `merge_from!` interface for merging child changes back to parent
39
+
40
+ ## [0.1.7] - 2026-04-05
41
+
42
+ ### Added
43
+ - `Claw::Benchmark` framework — automated task-based evaluation of agent capabilities
44
+ - 9 built-in benchmark tasks across mana, claw, runtime, and evolution layers
45
+ - `Claw::Benchmark::Scorer` — scoring formula: correctness, rounds, tokens, tool path (Levenshtein)
46
+ - `Claw::Benchmark::Report` — Markdown report generation with per-task and per-layer breakdown
47
+ - `Claw::Benchmark::Diff` — compare two benchmark reports
48
+ - `Claw::Benchmark::Trigger` — auto-triggers evolution on score regression or 3 consecutive failures
49
+ - CLI: `claw benchmark run`, `claw benchmark diff <a> <b>`
50
+
51
+ ## [0.1.6] - 2026-04-05
52
+
53
+ ### Added
54
+ - Full-screen TUI built on Charm Ruby (bubbletea, lipgloss, bubbles, glamour)
55
+ - MVU architecture: Model/Update/View with 4-zone layout (status bar, chat panel, status panel, command bar)
56
+ - `Claw::PlanMode` — two-phase plan-then-execute workflow with fork safety
57
+ - `Claw::Roles` — agent identity management via `.ruby-claw/roles/*.md`
58
+ - `Claw::Commands` — extracted pure-function slash command module
59
+ - `Claw::CLI` — headless CLI for non-interactive subcommands
60
+ - TUI modules: syntax highlighting, tab completion, object explorer, file cards, text folding
61
+ - CLI subcommands: `claw status`, `claw history`, `claw rollback`, `claw trace`, `claw evolve`, `claw benchmark`
62
+
63
+ ### Changed
64
+ - Default `claw` entry point now launches TUI instead of legacy REPL
65
+ - `Chat.start` delegates to `TUI.start` for backward compatibility
66
+ - `claw init` now creates `roles/` directory with default role
67
+
68
+ ## [0.1.5] - 2026-04-05
69
+
70
+ ### Added
71
+ - `Claw::Evolution` — self-evolution loop: reads traces, LLM diagnosis, fork/apply/test/keep-or-rollback
72
+ - `/evolve` REPL command to trigger an evolution cycle
73
+ - Evolution logs written to `.ruby-claw/evolution/`
74
+
75
+ ## [0.1.4] - 2026-04-05
76
+
77
+ ### Added
78
+ - `Claw::Init` — `claw init` scaffolds a new project with editable gem source
79
+ - Clones ruby-claw and ruby-mana to `.ruby-claw/gems/`
80
+ - Generates Gemfile with `path:` references, `system_prompt.md`, empty `MEMORY.md`
81
+ - Initializes git repo in `.ruby-claw/` with initial commit
82
+ - CLI subcommands: `claw init`, `claw version`, `claw help`
83
+
84
+ ## [0.1.3] - 2026-04-05
85
+
86
+ ### Added
87
+ - `Claw::Trace` — writes per-task Markdown trace files to `.ruby-claw/traces/`
88
+ - Traces capture timing, token usage, and tool call details per LLM iteration
89
+ - Auto-writes traces after each chat execution
90
+
91
+ ### Changed
92
+ - Serializer `encode_value` now uses `MarshalMd.dump` instead of `Marshal.dump`
93
+ - Backward compatibility: old `"marshal"` type entries still decoded via `Marshal.load`
94
+ - `BindingResource` and all resources use MarshalMd for deep copy
95
+ - Added `marshal-md` gem dependency
96
+
3
97
  ## [0.1.2] - 2026-04-04
4
98
 
5
99
  ### Changed
data/README.md CHANGED
@@ -14,12 +14,14 @@ gem install ruby-claw
14
14
 
15
15
  ## Features
16
16
 
17
- ### Interactive Chat REPL
17
+ ### Interactive TUI
18
+ Running `claw` launches a full-screen terminal UI (built on Charm Ruby's bubbletea) with 4 zones: top status bar, left chat panel, right status panel, and bottom command bar.
19
+
20
+ `Claw.chat` still works for the legacy REPL mode:
18
21
  ```ruby
19
22
  require "claw"
20
23
  Claw.chat
21
24
  ```
22
- Or from command line: `claw`
23
25
 
24
26
  - Auto-detects Ruby code vs natural language
25
27
  - Streaming output with markdown rendering
@@ -27,16 +29,22 @@ Or from command line: `claw`
27
29
  - Session persists across restarts
28
30
 
29
31
  ### Persistent Memory
30
- Claw stores memories as human-readable Markdown in `.mana/`:
32
+ Claw stores memories as human-readable Markdown in `.ruby-claw/`:
31
33
 
32
34
  ```
33
- .mana/
34
- MEMORY.md # Long-term facts (editable!)
35
- session.md # Conversation summary
36
- values.json # Variable snapshots
37
- definitions.rb # Method definitions
35
+ .ruby-claw/
36
+ MEMORY.md # Long-term facts (editable!)
37
+ session.md # Conversation summary
38
+ system_prompt.md # Custom agent personality
39
+ values.json # Variable snapshots
40
+ definitions.rb # Method definitions
38
41
  log/
39
- 2026-03-29.md # Daily interaction log
42
+ 2026-03-29.md # Daily interaction log
43
+ traces/
44
+ 20260405_103000.md # Execution traces
45
+ evolution/
46
+ 20260405_accept.md # Evolution logs
47
+ gems/ # Editable gem source (after claw init)
40
48
  ```
41
49
 
42
50
  The LLM can `remember` facts that persist across sessions:
@@ -62,9 +70,192 @@ claw> greet("world") # => "Hello world"
62
70
  ### Memory Compaction
63
71
  When the conversation grows large, old messages are automatically summarized in the background.
64
72
 
73
+ ### Incognito Mode
74
+ Temporarily disable memory loading and saving:
75
+ ```ruby
76
+ Claw.incognito do
77
+ ~"translate <text> to French, store in <french>"
78
+ # No memories loaded, nothing remembered
79
+ end
80
+
81
+ Claw::Memory.incognito? # => true inside the block
82
+ ```
83
+
65
84
  ### Keyword Memory Search
66
85
  With many memories (>20), only the most relevant are injected into prompts.
67
86
 
87
+ ### Reversible Runtime
88
+
89
+ Snapshot and rollback the entire agent state (context, memory, variables, filesystem):
90
+
91
+ ```
92
+ claw> /snapshot before-refactor
93
+ ✓ snapshot #2 created (before-refactor)
94
+
95
+ claw> # ... make changes ...
96
+
97
+ claw> /rollback 2
98
+ ✓ rolled back to snapshot #2
99
+ ```
100
+
101
+ **REPL commands:**
102
+ | Command | Description |
103
+ |---------|-------------|
104
+ | `/snapshot [label]` | Snapshot all resources |
105
+ | `/rollback <id>` | Rollback to a snapshot |
106
+ | `/diff [id_a id_b]` | Show diff between snapshots |
107
+ | `/history` | List all snapshots |
108
+ | `/status` | Show current resource state |
109
+ | `/evolve` | Run a self-evolution cycle |
110
+ | `/role <name>` | Switch agent role/identity |
111
+ | `/forge <method>` | Promote a method to a formal tool |
112
+
113
+ ### Plan Mode
114
+
115
+ `/plan` toggles plan mode. When active, the LLM generates a step-by-step plan without executing any tools. The user reviews the proposed steps, then confirms execution -- which runs in a safe fork so the original state is preserved if anything goes wrong.
116
+
117
+ ### Roles
118
+
119
+ Role files are Markdown documents stored in `.ruby-claw/roles/`. Each role defines an agent identity (system prompt, constraints, tool permissions).
120
+
121
+ - `/role <name>` switches the active agent identity at runtime
122
+ - `claw init` creates a default role
123
+
124
+ ### Benchmark
125
+
126
+ `claw benchmark run` executes the benchmark suite -- 9 built-in tasks spanning the mana, claw, runtime, and evolution layers. Each task runs 3 times, and scoring covers:
127
+
128
+ - **Correctness** -- did the agent produce the right result?
129
+ - **Rounds efficiency** -- how many LLM round-trips were needed?
130
+ - **Token efficiency** -- total token usage
131
+ - **Tool path accuracy** -- did the agent call the expected tools in the expected order?
132
+
133
+ `claw benchmark diff <a> <b>` compares two benchmark reports side by side. In addition, `Claw::Benchmark::Trigger` auto-triggers an evolution cycle on score regression or 3 consecutive failures.
134
+
135
+ ### Multi-Agent
136
+
137
+ `runtime.fork_async(prompt:, vars:, role:)` spawns a child agent that runs in an isolated thread with deep-copied variables and an optional git worktree for filesystem isolation.
138
+
139
+ Child lifecycle methods:
140
+
141
+ - `child.join` -- block until the child finishes
142
+ - `child.cancel!` -- abort the child
143
+ - `child.diff` -- inspect changes made by the child
144
+ - `child.merge!` -- merge the child's results back into the parent
145
+
146
+ All operations are thread-safe with Mutex protection.
147
+
148
+ ### Execution Traces
149
+
150
+ Every LLM interaction is logged as a Markdown file in `.ruby-claw/traces/`:
151
+
152
+ ```markdown
153
+ # Task: compute average of numbers
154
+ - Model: claude-sonnet-4-20250514
155
+ - Steps: 2
156
+ - Total tokens: 1100 in / 350 out
157
+ - Total latency: 1400ms
158
+
159
+ ## Step 1
160
+ - Latency: 800ms
161
+ - Tokens: 500 in / 200 out
162
+ ### Tool calls
163
+ - **read_var**(name: "numbers") -> [1, 2, 3]
164
+ ```
165
+
166
+ ### Tool System
167
+
168
+ Claw has a three-layer tool architecture:
169
+
170
+ 1. **Core tools** (always loaded): `read_var`, `write_var`, `call_func`, `eval`, `remember`, `search_tools`, `load_tool`
171
+ 2. **Project tools** (on-demand): `.ruby-claw/tools/*.rb` — indexed at startup, loaded via `load_tool`
172
+ 3. **Hub tools** (remote): community tools from a ruby-claw-toolhub, downloaded on demand
173
+
174
+ Create a project tool:
175
+ ```ruby
176
+ # .ruby-claw/tools/format_report.rb
177
+ class FormatReport
178
+ include Claw::Tool
179
+ tool_name "format_report"
180
+ description "Format raw data into a readable report"
181
+ parameter :data, type: "Hash", required: true, desc: "Raw data"
182
+ parameter :style, type: "String", required: false, desc: "brief or detailed"
183
+
184
+ def call(data:, style: "brief")
185
+ # ...
186
+ end
187
+ end
188
+ ```
189
+
190
+ The agent discovers tools via `search_tools` and loads them via `load_tool`. Use `/forge <method_name>` to promote an eval-defined method into a formal tool class.
191
+
192
+ ### Web Console
193
+
194
+ `claw console` launches a local web UI at `http://127.0.0.1:4567` for observability and operations:
195
+
196
+ - **Dashboard** — version, tool/memory/snapshot counts
197
+ - **Prompt Inspector** — view and edit the assembled system prompt
198
+ - **LLM Monitor** — real-time event stream via Server-Sent Events
199
+ - **Trace Explorer** — browse execution traces
200
+ - **Memory Manager** — add/remove long-term memories
201
+ - **Tool Manager** — view core tools, load/unload project tools
202
+ - **Snapshot Manager** — create snapshots, rollback state
203
+
204
+ All data is served via a REST API (`/api/status`, `/api/traces`, `/api/memory`, etc.).
205
+
206
+ ### Project Scaffolding
207
+
208
+ Initialize a project with editable gem source for self-evolution:
209
+
210
+ ```bash
211
+ claw init
212
+ ```
213
+
214
+ Creates:
215
+ ```
216
+ .ruby-claw/
217
+ gems/
218
+ ruby-claw/ # Editable source
219
+ ruby-mana/
220
+ tools/ # Project tool classes
221
+ roles/ # Agent role definitions
222
+ benchmarks/ # Benchmark reports
223
+ system_prompt.md # Customizable agent personality
224
+ MEMORY.md
225
+ .git/ # Filesystem snapshots
226
+ ```
227
+
228
+ ### Self-Evolution
229
+
230
+ The agent can improve its own code:
231
+
232
+ ```
233
+ claw> /evolve
234
+ ⚡ running evolution cycle...
235
+ ✓ accepted: Improve error message specificity
236
+ ```
237
+
238
+ Flow: read traces → LLM diagnoses improvement → fork runtime → apply change → run tests → keep or rollback.
239
+
240
+ Evolution logs are written to `.ruby-claw/evolution/`.
241
+
242
+ ### CLI Subcommands
243
+
244
+ | Command | Description |
245
+ |---------|-------------|
246
+ | `claw` | Launch the TUI (default) |
247
+ | `claw init` | Scaffold a new project |
248
+ | `claw status` | Show current resource state |
249
+ | `claw history` | List all snapshots |
250
+ | `claw rollback <id>` | Rollback to a snapshot |
251
+ | `claw trace [id]` | View execution traces |
252
+ | `claw evolve` | Run a self-evolution cycle |
253
+ | `claw benchmark run` | Run the benchmark suite |
254
+ | `claw benchmark diff <a> <b>` | Compare two benchmark reports |
255
+ | `claw console` | Launch the web console UI |
256
+ | `claw version` | Print version |
257
+ | `claw help` | Show help |
258
+
68
259
  ## Configuration
69
260
 
70
261
  ```ruby
@@ -75,6 +266,9 @@ Claw.configure do |c|
75
266
  c.persist_session = true # Save/restore session across restarts
76
267
  c.memory_top_k = 10 # Max memories to inject when searching
77
268
  c.on_compact = ->(summary) { puts summary }
269
+ c.tools_dir = nil # Custom tools directory (default: .ruby-claw/tools)
270
+ c.hub_url = nil # Remote tool hub URL
271
+ c.console_port = 4567 # Web console port
78
272
  end
79
273
 
80
274
  # Mana config (inherited)
@@ -84,7 +278,17 @@ Mana.configure do |c|
84
278
  end
85
279
  ```
86
280
 
87
- ## Relationship with ruby-mana
281
+ ## Architecture
282
+
283
+ Claw extends mana via its tool registration interface — no monkey-patching:
284
+
285
+ ```ruby
286
+ # Claw registers the "remember" tool into mana's engine
287
+ Mana.register_tool(remember_tool_definition) { |input| ... }
288
+
289
+ # Claw injects long-term memories into mana's system prompt
290
+ Mana.register_prompt_section { |context| memory_text }
291
+ ```
88
292
 
89
293
  - **ruby-mana** = Embedded LLM engine (`~"..."` syntax, binding manipulation, tool calling)
90
294
  - **ruby-claw** = Agent framework (chat REPL, memory, persistence, knowledge)
data/exe/claw CHANGED
@@ -4,4 +4,45 @@
4
4
  require "dotenv/load" rescue nil
5
5
  require "claw"
6
6
 
7
- Claw.chat
7
+ HELP_TEXT = <<~HELP
8
+ Usage: claw [command]
9
+
10
+ Commands:
11
+ (none) Start TUI (default)
12
+ init Initialize a new Claw project
13
+ status Print runtime state
14
+ history List snapshots
15
+ rollback ID Rollback to snapshot
16
+ trace [ID] View execution trace (list if no ID)
17
+ evolve Run evolution cycle
18
+ benchmark run Run benchmark suite
19
+ benchmark diff A B Compare two benchmark reports
20
+ console Launch web console UI
21
+ version Show version
22
+ help Show this message
23
+ HELP
24
+
25
+ case ARGV.first
26
+ when "init"
27
+ Claw::Init.run
28
+ when "status"
29
+ Claw::CLI.run(:status)
30
+ when "history"
31
+ Claw::CLI.run(:history)
32
+ when "rollback"
33
+ Claw::CLI.run(:rollback, ARGV[1])
34
+ when "trace"
35
+ Claw::CLI.run(:trace, ARGV[1])
36
+ when "evolve"
37
+ Claw::CLI.run(:evolve)
38
+ when "benchmark"
39
+ Claw::CLI.run(:benchmark, *ARGV[1..])
40
+ when "console"
41
+ Claw::CLI.run(:console, *ARGV[1..])
42
+ when "version", "--version", "-v"
43
+ puts "claw #{Claw::VERSION}"
44
+ when "help", "--help", "-h"
45
+ puts HELP_TEXT
46
+ else
47
+ Claw::TUI.start(TOPLEVEL_BINDING)
48
+ end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Claw
4
+ # Detects repeated eval patterns in traces and suggests tool promotion.
5
+ module AutoForge
6
+ THRESHOLD = 3 # Minimum occurrences before suggesting
7
+
8
+ class << self
9
+ # Analyze recent traces for repeated eval patterns.
10
+ #
11
+ # @param traces_dir [String, nil] path to .ruby-claw/traces/ (nil or a missing directory yields no suggestions)
12
+ # @param limit [Integer] number of recent traces to analyze
13
+ # @return [Array<Hash>] suggestions [{method_name:, occurrences:, sample_code:}]
14
+ def analyze(traces_dir, limit: 10)
15
+ return [] unless traces_dir && Dir.exist?(traces_dir)
16
+
17
+ files = Dir.glob(File.join(traces_dir, "*.md")).sort.last(limit)
18
+ return [] if files.empty?
19
+
20
+ # Collect all eval tool calls that define methods
21
+ method_counts = Hash.new { |h, k| h[k] = { count: 0, sample: nil } }
22
+
23
+ files.each do |file|
24
+ content = File.read(file)
25
+ # Look for eval tool calls containing method definitions
26
+ content.scan(/eval.*?```ruby\s*\n(.*?)```/m).each do |match|
27
+ code = match[0]
28
+ # Extract method name from `def method_name`
29
+ code.scan(/\bdef\s+(\w+)/).each do |name_match|
30
+ name = name_match[0]
31
+ method_counts[name][:count] += 1
32
+ method_counts[name][:sample] ||= code.strip
33
+ end
34
+ end
35
+ end
36
+
37
+ method_counts
38
+ .select { |_, v| v[:count] >= THRESHOLD }
39
+ .map { |name, v| { method_name: name, occurrences: v[:count], sample_code: v[:sample] } }
40
+ .sort_by { |s| -s[:occurrences] }
41
+ end
42
+
43
+ # Quick check: are there any suggestions?
44
+ #
45
+ # @param traces_dir [String]
46
+ # @return [Boolean]
47
+ def suggest?(traces_dir)
48
+ !analyze(traces_dir).empty?
49
+ end
50
+
51
+ # Format suggestions for display.
52
+ #
53
+ # @param suggestions [Array<Hash>]
54
+ # @return [String]
55
+ def format_suggestions(suggestions)
56
+ return "" if suggestions.empty?
57
+
58
+ lines = ["Detected repeated method patterns — consider promoting to tools:"]
59
+ suggestions.each do |s|
60
+ lines << " · #{s[:method_name]} (#{s[:occurrences]}x) — /forge #{s[:method_name]}"
61
+ end
62
+ lines.join("\n")
63
+ end
64
+ end
65
+ end
66
+ end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "task"
4
+ require_relative "scorer"
5
+ require_relative "runner"
6
+ require_relative "report"
7
+ require_relative "diff"
8
+ require_relative "trigger"
9
+
10
+ module Claw
11
+ module Benchmark
12
+ # Run the full benchmark suite with progress output.
13
+ #
14
+ # @param claw_dir [String] path to .ruby-claw directory
15
+ # @return [SuiteResult]
16
+ def self.run!(claw_dir: ".ruby-claw")
17
+ tasks = load_builtin_tasks
18
+ if tasks.empty?
19
+ $stderr.puts "No benchmark tasks found."
20
+ return
21
+ end
22
+
23
+ puts "Running #{tasks.size} benchmark tasks (#{Runner::RUNS_PER_TASK} runs each)...\n\n"
24
+
25
+ runner = Runner.new
26
+ suite = runner.run_all(tasks) do |task_id, run_idx, total, completed|
27
+ pct = (completed.to_f / total * 100).round(0)
28
+ print "\r [#{pct}%] #{task_id} run #{run_idx}/#{Runner::RUNS_PER_TASK}"
29
+ end
30
+ puts "\n\n"
31
+
32
+ report_text = Report.generate(suite)
33
+ path = Report.save(report_text, claw_dir)
34
+ puts report_text
35
+ puts "\nReport saved to #{path}"
36
+
37
+ suite
38
+ end
39
+
40
+ # Compare two benchmark reports.
41
+ #
42
+ # @param path_a [String]
43
+ # @param path_b [String]
44
+ def self.diff!(path_a, path_b)
45
+ unless path_a && path_b
46
+ $stderr.puts "Usage: claw benchmark diff <report_a> <report_b>"
47
+ return
48
+ end
49
+
50
+ puts Diff.compare(path_a, path_b)
51
+ end
52
+
53
+ # Load all built-in task definitions.
54
+ #
55
+ # @return [Array<Task>]
56
+ def self.load_builtin_tasks
57
+ tasks_dir = File.join(__dir__, "tasks")
58
+ return [] unless Dir.exist?(tasks_dir)
59
+
60
+ Dir.glob(File.join(tasks_dir, "*.rb")).sort.each { |f| require f }
61
+
62
+ Tasks.all
63
+ end
64
+ private_class_method :load_builtin_tasks
65
+
66
+ # Registry for built-in tasks.
67
+ module Tasks
68
+ @registry = []
69
+
70
+ def self.register(task)
71
+ @registry << task
72
+ end
73
+
74
+ def self.all
75
+ @registry.dup
76
+ end
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Claw
4
+ module Benchmark
5
+ # Compare two benchmark reports.
6
+ module Diff
7
+ # Compare two report files.
8
+ #
9
+ # @param path_a [String] path to report A
10
+ # @param path_b [String] path to report B
11
+ # @return [String] comparison output
12
+ def self.compare(path_a, path_b)
13
+ raise "Report not found: #{path_a}" unless File.exist?(path_a)
14
+ raise "Report not found: #{path_b}" unless File.exist?(path_b)
15
+
16
+ scores_a = extract_scores(File.read(path_a))
17
+ scores_b = extract_scores(File.read(path_b))
18
+
19
+ all_tasks = (scores_a.keys + scores_b.keys).uniq.sort
20
+
21
+ lines = ["# Benchmark Diff\n"]
22
+ lines << "**A:** #{File.basename(path_a)}"
23
+ lines << "**B:** #{File.basename(path_b)}"
24
+ lines << ""
25
+
26
+ # Suite score comparison
27
+ suite_a = scores_a.values.sum / [scores_a.size, 1].max.to_f
28
+ suite_b = scores_b.values.sum / [scores_b.size, 1].max.to_f
29
+ delta = suite_b - suite_a
30
+ indicator = delta > 0 ? "↑" : delta < 0 ? "↓" : "="
31
+ lines << "**Suite score:** #{suite_a.round(1)} → #{suite_b.round(1)} (#{indicator} #{delta.abs.round(1)})"
32
+ lines << ""
33
+
34
+ # Per-task changes
35
+ lines << "| Task | A | B | Delta |"
36
+ lines << "|------|---|---|-------|"
37
+ all_tasks.each do |task|
38
+ sa = scores_a[task] || 0
39
+ sb = scores_b[task] || 0
40
+ d = sb - sa
41
+ sign = d > 0 ? "+" : ""
42
+ lines << "| #{task} | #{sa.round(1)} | #{sb.round(1)} | #{sign}#{d.round(1)} |"
43
+ end
44
+
45
+ lines.join("\n")
46
+ end
47
+
48
+ # Extract task scores from a report Markdown file.
49
+ # Looks for lines like "### task_id (...)" followed by "- **Score:** N"
50
+ def self.extract_scores(text)
51
+ scores = {}
52
+ current_task = nil
53
+
54
+ text.each_line do |line|
55
+ if line.match?(/^### (\S+)/)
56
+ current_task = line.match(/^### (\S+)/)[1]
57
+ elsif current_task && line.match?(/\*\*Score:\*\*\s*([\d.]+)/)
58
+ scores[current_task] = line.match(/\*\*Score:\*\*\s*([\d.]+)/)[1].to_f
59
+ current_task = nil
60
+ end
61
+ end
62
+
63
+ scores
64
+ end
65
+
66
+ private_class_method :extract_scores
67
+ end
68
+ end
69
+ end