rubyn-code 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +182 -11
- data/db/migrations/014_multi_agent_upgrade.rb +79 -0
- data/lib/rubyn_code/agent/conversation.rb +89 -3
- data/lib/rubyn_code/agent/llm_caller.rb +2 -2
- data/lib/rubyn_code/agent/loop.rb +49 -9
- data/lib/rubyn_code/agent/system_prompt_builder.rb +37 -2
- data/lib/rubyn_code/agent/tool_processor.rb +3 -1
- data/lib/rubyn_code/auth/oauth.rb +1 -1
- data/lib/rubyn_code/auth/token_store.rb +49 -4
- data/lib/rubyn_code/checkpoint/hook.rb +26 -0
- data/lib/rubyn_code/checkpoint/manager.rb +109 -0
- data/lib/rubyn_code/chisel/debt.rb +65 -0
- data/lib/rubyn_code/chisel/inspection.rb +93 -0
- data/lib/rubyn_code/chisel.rb +127 -0
- data/lib/rubyn_code/cli/app.rb +2 -2
- data/lib/rubyn_code/cli/commands/agents.rb +31 -0
- data/lib/rubyn_code/cli/commands/chisel.rb +52 -0
- data/lib/rubyn_code/cli/commands/chisel_audit.rb +19 -0
- data/lib/rubyn_code/cli/commands/chisel_debt.rb +28 -0
- data/lib/rubyn_code/cli/commands/chisel_gain.rb +30 -0
- data/lib/rubyn_code/cli/commands/chisel_review.rb +19 -0
- data/lib/rubyn_code/cli/commands/command_template.rb +50 -0
- data/lib/rubyn_code/cli/commands/context.rb +3 -1
- data/lib/rubyn_code/cli/commands/custom_command.rb +42 -0
- data/lib/rubyn_code/cli/commands/custom_loader.rb +69 -0
- data/lib/rubyn_code/cli/commands/goal.rb +87 -0
- data/lib/rubyn_code/cli/commands/learning.rb +62 -0
- data/lib/rubyn_code/cli/commands/loop.rb +58 -0
- data/lib/rubyn_code/cli/commands/mcp.rb +18 -5
- data/lib/rubyn_code/cli/commands/megaplan.rb +50 -0
- data/lib/rubyn_code/cli/commands/registry.rb +14 -9
- data/lib/rubyn_code/cli/commands/rewind.rb +65 -0
- data/lib/rubyn_code/cli/first_run.rb +1 -1
- data/lib/rubyn_code/cli/loop_runner.rb +98 -0
- data/lib/rubyn_code/cli/mention_expander.rb +92 -0
- data/lib/rubyn_code/cli/renderer.rb +3 -2
- data/lib/rubyn_code/cli/repl.rb +37 -14
- data/lib/rubyn_code/cli/repl_commands.rb +77 -2
- data/lib/rubyn_code/cli/repl_setup.rb +9 -1
- data/lib/rubyn_code/cli/setup.rb +13 -0
- data/lib/rubyn_code/cli/stream_formatter.rb +3 -2
- data/lib/rubyn_code/cli/version_check.rb +10 -3
- data/lib/rubyn_code/config/defaults.rb +13 -1
- data/lib/rubyn_code/config/schema.json +4 -0
- data/lib/rubyn_code/config/settings.rb +17 -2
- data/lib/rubyn_code/context/manager.rb +29 -12
- data/lib/rubyn_code/debug.rb +11 -5
- data/lib/rubyn_code/goal/evaluator.rb +95 -0
- data/lib/rubyn_code/hooks/event_map.rb +56 -0
- data/lib/rubyn_code/hooks/external_dispatcher.rb +199 -0
- data/lib/rubyn_code/hooks/goal_hook.rb +88 -0
- data/lib/rubyn_code/hooks/response.rb +83 -0
- data/lib/rubyn_code/hooks/runner.rb +61 -3
- data/lib/rubyn_code/hooks/settings_json_loader.rb +109 -0
- data/lib/rubyn_code/hooks/subprocess_executor.rb +116 -0
- data/lib/rubyn_code/ide/handlers/plan_interview_answer_handler.rb +65 -0
- data/lib/rubyn_code/ide/handlers/plan_interview_cancel_handler.rb +22 -0
- data/lib/rubyn_code/ide/handlers/plan_interview_start_handler.rb +53 -0
- data/lib/rubyn_code/ide/handlers/plan_propose_handler.rb +41 -0
- data/lib/rubyn_code/ide/handlers/prompt_handler.rb +9 -1
- data/lib/rubyn_code/ide/handlers/recover_ci_handler.rb +143 -0
- data/lib/rubyn_code/ide/handlers/session_resume_handler.rb +1 -1
- data/lib/rubyn_code/ide/handlers.rb +17 -2
- data/lib/rubyn_code/ide/protocol.rb +15 -0
- data/lib/rubyn_code/ide/server.rb +39 -1
- data/lib/rubyn_code/index/codebase_index.rb +39 -1
- data/lib/rubyn_code/learning/porter.rb +129 -0
- data/lib/rubyn_code/llm/adapters/anthropic.rb +65 -16
- data/lib/rubyn_code/llm/adapters/openai.rb +1 -1
- data/lib/rubyn_code/llm/adapters/prompt_caching.rb +5 -1
- data/lib/rubyn_code/llm/adapters/token_caching.rb +54 -0
- data/lib/rubyn_code/llm/model_router.rb +2 -2
- data/lib/rubyn_code/mcp/client.rb +59 -0
- data/lib/rubyn_code/mcp/server_extras_bridge.rb +110 -0
- data/lib/rubyn_code/mcp/sse_transport.rb +2 -1
- data/lib/rubyn_code/mcp/tool_bridge.rb +16 -14
- data/lib/rubyn_code/megaplan/ci_recovery.rb +104 -0
- data/lib/rubyn_code/megaplan/interview_session.rb +250 -0
- data/lib/rubyn_code/megaplan/plan_proposer.rb +153 -0
- data/lib/rubyn_code/memory/search.rb +9 -5
- data/lib/rubyn_code/memory/session_persistence.rb +159 -21
- data/lib/rubyn_code/observability/cost_calculator.rb +3 -1
- data/lib/rubyn_code/output/diff_renderer.rb +62 -7
- data/lib/rubyn_code/skills/auto_suggest.rb +70 -2
- data/lib/rubyn_code/skills/registry_client.rb +4 -3
- data/lib/rubyn_code/sub_agents/agent_type.rb +17 -0
- data/lib/rubyn_code/sub_agents/catalog.rb +124 -0
- data/lib/rubyn_code/teams/agent_registry.rb +120 -0
- data/lib/rubyn_code/teams/mailbox.rb +99 -10
- data/lib/rubyn_code/teams/manager.rb +83 -5
- data/lib/rubyn_code/teams/teammate.rb +5 -1
- data/lib/rubyn_code/tools/ask_user.rb +15 -1
- data/lib/rubyn_code/tools/executor.rb +5 -3
- data/lib/rubyn_code/tools/spawn_agent.rb +47 -62
- data/lib/rubyn_code/tools/spawn_teammate.rb +7 -2
- data/lib/rubyn_code/tools/web_fetch.rb +1 -1
- data/lib/rubyn_code/tools/web_search.rb +4 -1
- data/lib/rubyn_code/version.rb +1 -1
- data/lib/rubyn_code.rb +53 -2
- data/skills/megaplan/megaplan.md +156 -0
- data/skills/rubyn_self_test.md +322 -14
- data/skills/self_test/chisel_smoke.rb +84 -0
- data/skills/self_test/fixtures/chisel_sample.rb +64 -0
- metadata +49 -4
data/skills/rubyn_self_test.md
CHANGED
|
@@ -63,30 +63,113 @@ Score: 18/22 (82%) — 4 failures
|
|
|
63
63
|
#### File Cache
|
|
64
64
|
- Read `lib/rubyn_code/version.rb` twice. PASS if both reads succeed (cache should serve the second).
|
|
65
65
|
|
|
66
|
-
#### Output Compressor —
|
|
67
|
-
- Run `bash` with `seq 1 5000` (generates 5,000 lines — well over the bash threshold of 4,000 chars). PASS if the result contains "lines omitted" or is significantly shorter than 5,000 lines. This proves the head_tail compressor is working.
|
|
66
|
+
#### Output Compressor — All Strategies (direct)
|
|
68
67
|
|
|
69
|
-
|
|
70
|
-
|
|
68
|
+
> **Why this is a direct call, not a tool observation.** Earlier versions of this
|
|
69
|
+
> test ran `seq 1 5000`, a big `grep`, etc. through the agent's own tools and
|
|
70
|
+
> hoped the compressor would visibly truncate the result. That is unreliable:
|
|
71
|
+
> whether a given tool invocation is routed through the compressor gate (and at
|
|
72
|
+
> what threshold) depends on the execution path, so the agent often received
|
|
73
|
+
> already-handled output and scored a false FAIL even though the compressor was
|
|
74
|
+
> fine. Instead, drive `OutputCompressor#compress(tool_name, raw_output)`
|
|
75
|
+
> **directly** with inputs crafted to exceed each strategy's threshold, and
|
|
76
|
+
> assert on the marker in the returned string. This is deterministic and matches
|
|
77
|
+
> how the unit specs exercise it.
|
|
71
78
|
|
|
72
|
-
|
|
73
|
-
- Run `grep` searching for `def ` across all of `lib/`. This will match hundreds of method definitions. PASS if the result contains "matches omitted" or shows only a subset of results (the compressor limits to top N matches).
|
|
79
|
+
- **All strategies**: `bash` with the script below. PASS for each strategy whose line says `PASS`. Report the final `COMPRESSION: N/5 strategies active` line in the scorecard.
|
|
74
80
|
|
|
75
|
-
|
|
76
|
-
|
|
81
|
+
```bash
|
|
82
|
+
bundle exec ruby -e '
|
|
83
|
+
require_relative "lib/rubyn_code"
|
|
84
|
+
c = RubynCode::Tools::OutputCompressor
|
|
85
|
+
|
|
86
|
+
results = {}
|
|
87
|
+
|
|
88
|
+
# head_tail (bash, 1000-token threshold): >10 lines, well over 4000 chars
|
|
89
|
+
big = (1..5000).map { |i| "line #{i}" }.join("\n")
|
|
90
|
+
results["head_tail"] = c.new.compress("bash", big).include?("lines omitted")
|
|
91
|
+
|
|
92
|
+
# spec_summary (run_specs, 500-token threshold): verbose passing output
|
|
93
|
+
# collapses to just the "N examples, 0 failures" summary line
|
|
94
|
+
spec_out = (Array.new(200) { |i| " passing example #{i} runs and returns ok value" }.join("\n")) +
|
|
95
|
+
"\n\n42 examples, 0 failures\n"
|
|
96
|
+
results["spec_summary"] = (c.new.compress("run_specs", spec_out).strip == "42 examples, 0 failures")
|
|
97
|
+
|
|
98
|
+
# top_matches (grep, 1000-token threshold): keeps top N, marks the rest
|
|
99
|
+
grep_out = (1..500).map { |i| "lib/file#{i}.rb:#{i}: def method_number_#{i}(arg)" }.join("\n")
|
|
100
|
+
results["top_matches"] = c.new.compress("grep", grep_out).include?("matches omitted")
|
|
101
|
+
|
|
102
|
+
# tree (glob, 500-token threshold): collapses paths to "dir/ (N files)"
|
|
103
|
+
glob_out = (1..500).map { |i| "lib/rubyn_code/subdir#{i % 25}/some_file_name_#{i}.rb" }.join("\n")
|
|
104
|
+
results["tree"] = c.new.compress("glob", glob_out).include?("files)")
|
|
105
|
+
|
|
106
|
+
# relevant_hunks (git_diff, 2000-token threshold): keeps headers, truncates bodies
|
|
107
|
+
hunk = ->(f) { "diff --git a/#{f} b/#{f}\nindex 000..111 100644\n--- a/#{f}\n+++ b/#{f}\n" +
|
|
108
|
+
(Array.new(100) { |i| "+ added source line number #{i}" }.join("\n")) + "\n" }
|
|
109
|
+
diff_out = (1..10).map { |i| hunk.call("file#{i}.rb") }.join
|
|
110
|
+
results["relevant_hunks"] = c.new.compress("git_diff", diff_out).include?("lines in this file omitted")
|
|
77
111
|
|
|
78
|
-
|
|
79
|
-
|
|
112
|
+
results.each { |k, v| puts "STRATEGY #{k}: #{v ? "PASS" : "FAIL"}" }
|
|
113
|
+
puts "COMPRESSION: #{results.values.count(true)}/5 strategies active"
|
|
114
|
+
'
|
|
115
|
+
```
|
|
80
116
|
|
|
81
|
-
|
|
82
|
-
- After running the above tests, note whether any output you received contained truncation markers like "lines omitted", "matches omitted", or "files)". Count how many of the 5 compression strategies actually triggered. Report: "N/5 compression strategies verified active".
|
|
117
|
+
Each strategy is scored independently (5 line items). A healthy build prints `COMPRESSION: 5/5 strategies active`.
|
|
83
118
|
|
|
84
119
|
### 7. Skills System
|
|
85
120
|
- **load_skill**: Load any available skill (e.g., `classes`). PASS if content is returned.
|
|
86
121
|
|
|
87
122
|
### 8. Memory System
|
|
88
|
-
|
|
89
|
-
|
|
123
|
+
|
|
124
|
+
> **Use the real `Memory::Store` / `Memory::Search` API.** Both are constructed
|
|
125
|
+
> as `.new(db, project_path:)` — the `project_path:` keyword is **required**, and
|
|
126
|
+
> `db` must respond to `execute` / `query` / `transaction` (a raw
|
|
127
|
+
> `SQLite3::Database` alone does **not** provide `query`, which `Search` needs).
|
|
128
|
+
> The script below wraps an in-memory SQLite DB to satisfy that interface, exactly
|
|
129
|
+
> as the specs' `setup_test_db` helper does. Writing the memory creates its own
|
|
130
|
+
> table via `Store#ensure_tables`, so no migrations are needed.
|
|
131
|
+
|
|
132
|
+
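- **Round-trip**: `bash` with the script below. PASS if the final line starts with `MEMORY: PASS` (the script appends a short explanation after it); `MEMORY: PARTIAL` and `MEMORY: FAIL` both count as FAIL.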
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
bundle exec ruby -e '
|
|
136
|
+
require_relative "lib/rubyn_code"
|
|
137
|
+
require "sqlite3"
|
|
138
|
+
|
|
139
|
+
# Minimal stand-in for RubynCode::DB::Connection (execute/query/transaction).
|
|
140
|
+
class SelfTestDB
|
|
141
|
+
def initialize(raw) = @raw = raw
|
|
142
|
+
def execute(sql, params = []) = @raw.execute(sql, params)
|
|
143
|
+
def query(sql, params = []) = @raw.execute(sql, params)
|
|
144
|
+
def transaction(&b) = @raw.transaction(&b)
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
raw = SQLite3::Database.new(":memory:")
|
|
148
|
+
raw.results_as_hash = true
|
|
149
|
+
db = SelfTestDB.new(raw)
|
|
150
|
+
|
|
151
|
+
project = "/self-test"
|
|
152
|
+
store = RubynCode::Memory::Store.new(db, project_path: project)
|
|
153
|
+
search = RubynCode::Memory::Search.new(db, project_path: project)
|
|
154
|
+
|
|
155
|
+
token = "selftesttoken-marker-xyz"
|
|
156
|
+
store.write(content: "self-test memory #{token}")
|
|
157
|
+
|
|
158
|
+
found = search.search(token).any? { |r| r.content.include?(token) }
|
|
159
|
+
recent_ok = search.recent(limit: 5).any? { |r| r.content.include?(token) }
|
|
160
|
+
|
|
161
|
+
if found && recent_ok
|
|
162
|
+
puts "MEMORY: PASS (write + search + recent all round-trip)"
|
|
163
|
+
elsif found
|
|
164
|
+
puts "MEMORY: PARTIAL (search works, recent did not return it)"
|
|
165
|
+
else
|
|
166
|
+
puts "MEMORY: FAIL (write succeeded but search did not return it)"
|
|
167
|
+
end
|
|
168
|
+
'
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
The script writes a memory with a unique token, then confirms both
|
|
172
|
+
`Search#search` (LIKE query) and `Search#recent` return it.
|
|
90
173
|
|
|
91
174
|
### 9. Configuration
|
|
92
175
|
- **bash**: Run `cat ~/.rubyn-code/config.yml`. PASS if file exists and contains `provider:`.
|
|
@@ -190,6 +273,231 @@ End-to-end exercise of the autoload pipeline against the real registry at `rubyn
|
|
|
190
273
|
```
|
|
191
274
|
before the next response (the `📥` line appears only if the pack wasn't already installed). Do **not** count this as PASS/FAIL — just mention it in the scorecard so the user can verify the renderer side themselves.
|
|
192
275
|
|
|
276
|
+
### 16. Teams System — Multi-Agent
|
|
277
|
+
|
|
278
|
+
Run the following inline Ruby script with `bash`. It exercises the teammate manager, mailbox (including structured messaging), and agent registry in a single SQLite-backed round-trip. PASS if the final line is `ALL PASS`.
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
bundle exec ruby -e '
|
|
282
|
+
require_relative "lib/rubyn_code"
|
|
283
|
+
require "sqlite3"
|
|
284
|
+
|
|
285
|
+
db = SQLite3::Database.new(":memory:")
|
|
286
|
+
db.results_as_hash = true
|
|
287
|
+
|
|
288
|
+
mailbox = RubynCode::Teams::Mailbox.new(db)
|
|
289
|
+
manager = RubynCode::Teams::Manager.new(db, mailbox: mailbox)
|
|
290
|
+
registry = RubynCode::Teams::AgentRegistry.new(manager: manager, mailbox: mailbox)
|
|
291
|
+
|
|
292
|
+
# 1. Spawn root + child teammates
|
|
293
|
+
root = manager.spawn(name: "lead", role: "coordinator")
|
|
294
|
+
child = manager.spawn(name: "coder", role: "developer", parent_agent_id: root.id)
|
|
295
|
+
raise "spawn failed" unless root.root? && !child.root?
|
|
296
|
+
puts "STEP spawn: PASS"
|
|
297
|
+
|
|
298
|
+
# 2. Parent-child tracking
|
|
299
|
+
kids = manager.children_of(root.id)
|
|
300
|
+
raise "children_of broken" unless kids.size == 1 && kids.first.name == "coder"
|
|
301
|
+
raise "roots broken" unless manager.roots.size == 1
|
|
302
|
+
tree = manager.agent_tree(root.id)
|
|
303
|
+
raise "tree broken" unless tree[:children].size == 1
|
|
304
|
+
puts "STEP lineage: PASS"
|
|
305
|
+
|
|
306
|
+
# 3. Structured messaging with correlation_id
|
|
307
|
+
corr_id = mailbox.send_structured(
|
|
308
|
+
from: "lead", to: "coder", type: "task",
|
|
309
|
+
data: { action: "write_tests", files: ["user.rb"] },
|
|
310
|
+
content: "Write tests for user.rb"
|
|
311
|
+
)
|
|
312
|
+
raise "send_structured returned nil" if corr_id.nil?
|
|
313
|
+
|
|
314
|
+
msgs = mailbox.read_inbox("coder")
|
|
315
|
+
raise "inbox empty" if msgs.empty?
|
|
316
|
+
msg = msgs.first
|
|
317
|
+
raise "missing data" unless msg[:data].is_a?(Hash) && msg[:data][:action] == "write_tests"
|
|
318
|
+
raise "missing correlation_id" unless msg[:correlation_id].is_a?(String)
|
|
319
|
+
puts "STEP structured_msg: PASS"
|
|
320
|
+
|
|
321
|
+
# 4. Correlation chain
|
|
322
|
+
mailbox.send(
|
|
323
|
+
from: "coder", to: "lead", content: "Done",
|
|
324
|
+
message_type: "result", correlation_id: msg[:correlation_id],
|
|
325
|
+
data: { status: "ok", tests: 5 }
|
|
326
|
+
)
|
|
327
|
+
chain = mailbox.find_by_correlation_id(msg[:correlation_id])
|
|
328
|
+
raise "correlation chain broken (#{chain.size})" unless chain.size == 2
|
|
329
|
+
puts "STEP correlation: PASS"
|
|
330
|
+
|
|
331
|
+
# 5. Agent discovery
|
|
332
|
+
manager.update_status("coder", "active")
|
|
333
|
+
snap = registry.snapshot
|
|
334
|
+
raise "snapshot broken" unless snap.size == 2
|
|
335
|
+
actives = registry.active
|
|
336
|
+
raise "active filter broken" unless actives.size == 1 && actives.first[:name] == "coder"
|
|
337
|
+
forest = registry.forest
|
|
338
|
+
raise "forest broken" unless forest.size == 1 && forest.first[:children].size == 1
|
|
339
|
+
lineage = registry.lineage(child.id)
|
|
340
|
+
raise "lineage broken" unless lineage.size == 1 && lineage.first.name == "lead"
|
|
341
|
+
report = registry.status_report
|
|
342
|
+
raise "status_report broken" unless report.include?("lead") && report.include?("coder")
|
|
343
|
+
puts "STEP discovery: PASS"
|
|
344
|
+
|
|
345
|
+
# 6. Cleanup + unread_count
|
|
346
|
+
raise "unread wrong" unless mailbox.unread_count("lead") == 1
|
|
347
|
+
mailbox.read_inbox("lead")
|
|
348
|
+
raise "read didnt clear" unless mailbox.unread_count("lead") == 0
|
|
349
|
+
manager.remove("coder")
|
|
350
|
+
manager.remove("lead")
|
|
351
|
+
raise "cleanup failed" unless manager.list.empty?
|
|
352
|
+
puts "STEP cleanup: PASS"
|
|
353
|
+
|
|
354
|
+
puts "ALL PASS"
|
|
355
|
+
'
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
The script tests:
|
|
359
|
+
1. **Spawn** — root and child teammates with parent tracking
|
|
360
|
+
2. **Lineage** — `children_of`, `roots`, `agent_tree`
|
|
361
|
+
3. **Structured messaging** — `send_structured` with typed data payloads
|
|
362
|
+
4. **Correlation chains** — request/response pairing via `correlation_id`
|
|
363
|
+
5. **Agent discovery** — `snapshot`, `active`, `forest`, `lineage`, `status_report`
|
|
364
|
+
6. **Cleanup** — `unread_count`, `read_inbox`, `remove`
|
|
365
|
+
|
|
366
|
+
PASS criteria: all 6 `STEP` lines say PASS and the final line is `ALL PASS`.
|
|
367
|
+
|
|
368
|
+
### 17. Recent Additions — Claude Code / Codex Parity
|
|
369
|
+
|
|
370
|
+
Each feature below ships as its own PR; a check FAILs cleanly if that PR has
|
|
371
|
+
not yet merged into the branch under test. Run the grep/spec checks — they are
|
|
372
|
+
fast and need no API calls.
|
|
373
|
+
|
|
374
|
+
#### 17a. `/goal` — work until a goal is met
|
|
375
|
+
- **grep**: `class GoalHook` in `lib/rubyn_code/hooks/goal_hook.rb`. PASS if found.
|
|
376
|
+
- **grep**: `:stop` in `lib/rubyn_code/hooks/runner.rb`. PASS if found (stop-hook gating wired).
|
|
377
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/hooks/goal_hook_spec.rb spec/rubyn_code/cli/commands/goal_spec.rb --format progress`. PASS if `0 failures`.
|
|
378
|
+
|
|
379
|
+
#### 17b. `/loop` — repeat a prompt/command on an interval
|
|
380
|
+
- **grep**: `class LoopRunner` in `lib/rubyn_code/cli/loop_runner.rb`. PASS if found.
|
|
381
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/cli/loop_runner_spec.rb spec/rubyn_code/cli/commands/loop_spec.rb --format progress`. PASS if `0 failures`.
|
|
382
|
+
- **bash** (behavior): `bundle exec ruby -Ilib -rrubyn_code -e 'puts RubynCode::CLI::LoopRunner.parse_interval("5m")'`. PASS if output is `300`.
|
|
383
|
+
|
|
384
|
+
#### 17c. `AGENTS.md` project instructions
|
|
385
|
+
- **grep**: `AGENTS.md` in `lib/rubyn_code/agent/system_prompt_builder.rb`. PASS if found.
|
|
386
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/agent/system_prompt_builder_spec.rb --format progress`. PASS if `0 failures`.
|
|
387
|
+
|
|
388
|
+
#### 17d. `@`-file mentions
|
|
389
|
+
- **grep**: `class MentionExpander` in `lib/rubyn_code/cli/mention_expander.rb`. PASS if found.
|
|
390
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/cli/mention_expander_spec.rb --format progress`. PASS if `0 failures`.
|
|
391
|
+
|
|
392
|
+
#### 17e. User-defined slash commands
|
|
393
|
+
- **grep**: `module CustomLoader` in `lib/rubyn_code/cli/commands/custom_loader.rb`. PASS if found.
|
|
394
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/cli/commands/custom_loader_spec.rb spec/rubyn_code/cli/commands/command_template_spec.rb --format progress`. PASS if `0 failures`.
|
|
395
|
+
|
|
396
|
+
#### 17f. Custom sub-agents + `/agents`
|
|
397
|
+
- **grep**: `class Catalog` in `lib/rubyn_code/sub_agents/catalog.rb`. PASS if found.
|
|
398
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/sub_agents/catalog_spec.rb spec/rubyn_code/tools/spawn_agent_spec.rb --format progress`. PASS if `0 failures` (spawn_agent must still pass after the refactor).
|
|
399
|
+
|
|
400
|
+
#### 17g. MCP resources & prompts
|
|
401
|
+
- **grep**: `def supports_resources?` in `lib/rubyn_code/mcp/client.rb`. PASS if found.
|
|
402
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/mcp/client_spec.rb spec/rubyn_code/mcp/tool_bridge_spec.rb --format progress`. PASS if `0 failures`.
|
|
403
|
+
|
|
404
|
+
#### 17h. `/rewind` — checkpoint & restore
|
|
405
|
+
- **grep**: `class Manager` in `lib/rubyn_code/checkpoint/manager.rb`. PASS if found.
|
|
406
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/checkpoint --format progress`. PASS if `0 failures`.
|
|
407
|
+
|
|
408
|
+
#### 17i. Learning export/import
|
|
409
|
+
- **grep**: `module Porter` in `lib/rubyn_code/learning/porter.rb`. PASS if found.
|
|
410
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/learning/porter_spec.rb --format progress`. PASS if `0 failures`.
|
|
411
|
+
- **bash** (round-trip): the script below exports instincts to a temp file and re-imports them into a fresh in-memory DB. PASS if the final line is `LEARNING ROUNDTRIP: PASS`.
|
|
412
|
+
|
|
413
|
+
```bash
|
|
414
|
+
bundle exec ruby -Ilib -rrubyn_code -rsqlite3 -rtmpdir -e '
|
|
415
|
+
def db_with_instincts
|
|
416
|
+
raw = SQLite3::Database.new(":memory:"); raw.results_as_hash = true
|
|
417
|
+
raw.execute(File.read("db/migrations/010_create_instincts.sql").split(";").first + ";")
|
|
418
|
+
wrap = Object.new
|
|
419
|
+
wrap.define_singleton_method(:execute) { |sql, p = []| raw.execute(sql, p) }
|
|
420
|
+
wrap.define_singleton_method(:query) { |sql, p = []| raw.execute(sql, p) }
|
|
421
|
+
wrap
|
|
422
|
+
end
|
|
423
|
+
src = db_with_instincts
|
|
424
|
+
src.execute("INSERT INTO instincts (id,project_path,pattern,context_tags,confidence,decay_rate,times_applied,times_helpful,created_at,updated_at) VALUES (?,?,?,?,?,?,?,?,?,?)",
|
|
425
|
+
["x","/p","prefer guard clauses","[]",0.8,0.05,1,1,"2026-01-01T00:00:00Z","2026-01-01T00:00:00Z"])
|
|
426
|
+
Dir.mktmpdir do |d|
|
|
427
|
+
f = File.join(d, "l.json")
|
|
428
|
+
RubynCode::Learning::Porter.export(db: src, path: f)
|
|
429
|
+
dst = db_with_instincts
|
|
430
|
+
res = RubynCode::Learning::Porter.import(db: dst, path: f)
|
|
431
|
+
ok = res[:imported] == 1 && dst.query("SELECT COUNT(*) AS n FROM instincts").first["n"] == 1
|
|
432
|
+
puts(ok ? "LEARNING ROUNDTRIP: PASS" : "LEARNING ROUNDTRIP: FAIL #{res.inspect}")
|
|
433
|
+
end
|
|
434
|
+
'
|
|
435
|
+
```
|
|
436
|
+
|
|
437
|
+
#### 17j. Command registry integrity (all new commands load + register)
|
|
438
|
+
- **bash**: the script below registers the five new command classes in a fresh `Commands::Registry` and asserts that each new slash command name resolves via `known?`. PASS if the final line is `COMMANDS: PASS`.
|
|
439
|
+
|
|
440
|
+
```bash
|
|
441
|
+
bundle exec ruby -Ilib -rrubyn_code -e '
|
|
442
|
+
reg = RubynCode::CLI::Commands::Registry.new
|
|
443
|
+
[RubynCode::CLI::Commands::Goal, RubynCode::CLI::Commands::Loop,
|
|
444
|
+
RubynCode::CLI::Commands::Agents, RubynCode::CLI::Commands::Rewind,
|
|
445
|
+
RubynCode::CLI::Commands::Learning].each { |c| reg.register(c) }
|
|
446
|
+
want = %w[/goal /loop /agents /rewind /learning]
|
|
447
|
+
missing = want.reject { |n| reg.known?(n) }
|
|
448
|
+
puts(missing.empty? ? "COMMANDS: PASS" : "COMMANDS: FAIL missing #{missing.inspect}")
|
|
449
|
+
'
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
### 18. Chisel — Minimal-Code Enforcement (opt-in)
|
|
453
|
+
|
|
454
|
+
Chisel is rubyn-code's "write the minimum that works" layer. It is **off by
|
|
455
|
+
default** and only changes the agent once a user turns it on (`/chisel full` or
|
|
456
|
+
`chisel_mode` in config). These checks prove the engine resolves modes, injects
|
|
457
|
+
its ruleset only when enabled, never chisels away the safety floor, and that the
|
|
458
|
+
debt scanner, inspection prompts, and all five slash commands are wired up — all
|
|
459
|
+
deterministic, no API calls.
|
|
460
|
+
|
|
461
|
+
The deterministic target is a committed, deliberately over-engineered fixture,
|
|
462
|
+
`skills/self_test/fixtures/chisel_sample.rb`. Chisel scans it and must return the
|
|
463
|
+
**same three `chisel:` markers every time** (and ignore the two decoys). That is
|
|
464
|
+
what makes this check repeatable rather than a one-off tmpdir.
|
|
465
|
+
|
|
466
|
+
- **grep** (prompt integration): `append_chisel_ruleset` in `lib/rubyn_code/agent/system_prompt_builder.rb`. PASS if found (confirms the ruleset reaches the system prompt).
|
|
467
|
+
- **run_specs**: `bundle exec rspec spec/rubyn_code/chisel_spec.rb spec/rubyn_code/chisel spec/rubyn_code/cli/commands/chisel_spec.rb spec/rubyn_code/cli/commands/chisel_review_spec.rb spec/rubyn_code/cli/commands/chisel_audit_spec.rb spec/rubyn_code/cli/commands/chisel_debt_spec.rb spec/rubyn_code/cli/commands/chisel_gain_spec.rb --format progress`. PASS if output contains `0 failures`. (Includes `self_test_fixture_spec.rb`, which guards the fixture's exact scan result.)
|
|
468
|
+
- **Smoke run against the fixture**: `bash` runs the committed runner — no inline script to keep in sync:
|
|
469
|
+
|
|
470
|
+
```bash
|
|
471
|
+
bundle exec ruby skills/self_test/chisel_smoke.rb
|
|
472
|
+
```
|
|
473
|
+
|
|
474
|
+
It prints one scored line per area (four areas), then a final summary line, and exits non-zero on any failure:
|
|
475
|
+
|
|
476
|
+
```
|
|
477
|
+
CHISEL debt: PASS
|
|
478
|
+
CHISEL engine: PASS
|
|
479
|
+
CHISEL inspection: PASS
|
|
480
|
+
CHISEL commands: PASS
|
|
481
|
+
CHISEL: PASS
|
|
482
|
+
```
|
|
483
|
+
|
|
484
|
+
- **debt** — scanning the fixture returns exactly its three planted markers
|
|
485
|
+
(file/line/note), with the string-literal and trailing-comment decoys ignored.
|
|
486
|
+
- **engine** — `off` injects nothing; `lite`/`full`/`ultra` layer the right
|
|
487
|
+
addenda and always keep the safety floor; a garbage mode never crashes or
|
|
488
|
+
leaks through. Driven via `RUBYN_CHISEL_MODE`, independent of this machine's
|
|
489
|
+
`chisel_mode` config.
|
|
490
|
+
- **inspection** — `:diff` and `:repo` prompts assemble a String carrying the
|
|
491
|
+
ladder + safety floor; an unknown scope raises instead of emitting junk.
|
|
492
|
+
- **commands** — all five (`/chisel`, `/chisel-review`, `/chisel-audit`,
|
|
493
|
+
`/chisel-debt`, `/chisel-gain`) register and resolve.
|
|
494
|
+
|
|
495
|
+
Score each `CHISEL <area>` line independently (4 line items). PASS criteria:
|
|
496
|
+
all four areas PASS and the final line is `CHISEL: PASS`.
|
|
497
|
+
|
|
498
|
+
You can also point Chisel at the fixture by hand to see the consistent result
|
|
499
|
+
directly: `bundle exec ruby -Ilib -rrubyn_code -e 'RubynCode::Chisel::Debt.scan("skills/self_test/fixtures").each { |i| puts "#{i.file}:#{i.line} — #{i.note}" }'`.
|
|
500
|
+
|
|
193
501
|
## Scoring
|
|
194
502
|
|
|
195
503
|
Count total PASS results out of total tests run. Report the percentage.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Chisel smoke test — runs rubyn-code's Chisel layer against a committed,
|
|
4
|
+
# deliberately over-engineered fixture and asserts a CONSISTENT result every
|
|
5
|
+
# time. Deterministic and offline (no LLM): the debt scanner, mode resolution,
|
|
6
|
+
# inspection-prompt assembly, and command registration are all pure.
|
|
7
|
+
#
|
|
8
|
+
# $ bundle exec ruby skills/self_test/chisel_smoke.rb
|
|
9
|
+
#
|
|
10
|
+
# Prints one `CHISEL <area>: PASS/FAIL` line per area, a final `CHISEL: PASS`
|
|
11
|
+
# (or `FAIL`), and exits non-zero if anything failed — so CI and the
|
|
12
|
+
# /skill self-test scorecard can both consume it.
|
|
13
|
+
|
|
14
|
+
require_relative '../../lib/rubyn_code'
|
|
15
|
+
|
|
16
|
+
C = RubynCode::Chisel
|
|
17
|
+
FIXTURE_DIR = File.expand_path('fixtures', __dir__)
|
|
18
|
+
|
|
19
|
+
# The exact, repeatable output the scanner must produce for the fixture. If you
|
|
20
|
+
# edit skills/self_test/fixtures/chisel_sample.rb, update this table to match.
|
|
21
|
+
EXPECTED_DEBT = [
|
|
22
|
+
{ file: 'chisel_sample.rb', line: 18, note: 'collapse this factory into a single build method' },
|
|
23
|
+
{ file: 'chisel_sample.rb', line: 39, note: 'replace this class with Array#sum at the single call site' },
|
|
24
|
+
{ file: 'chisel_sample.rb', line: 52, note: 'inline DEFAULTS[:retries] since there is only one reader' }
|
|
25
|
+
].freeze
|
|
26
|
+
|
|
27
|
+
results = {}
|
|
28
|
+
|
|
29
|
+
# 1. Debt scanner — the consistent-result core. Scan only the fixture dir so
|
|
30
|
+
# the outcome never depends on the rest of the tree.
|
|
31
|
+
scanned = RubynCode::Chisel::Debt.scan(FIXTURE_DIR)
|
|
32
|
+
actual = scanned.map { |i| { file: i.file, line: i.line, note: i.note } }
|
|
33
|
+
results['debt'] = (actual == EXPECTED_DEBT)
|
|
34
|
+
warn(" debt mismatch — expected #{EXPECTED_DEBT.inspect}, got #{actual.inspect}") unless results['debt']
|
|
35
|
+
|
|
36
|
+
# 2. Engine — off injects nothing; lite/full/ultra layer the right addenda and
|
|
37
|
+
# ALWAYS keep the safety floor; a garbage mode never crashes or leaks through.
|
|
38
|
+
# Driven via RUBYN_CHISEL_MODE so it ignores this machine's chisel_mode config.
|
|
39
|
+
ENV['RUBYN_CHISEL_MODE'] = 'off'
|
|
40
|
+
off_ok = !C.enabled? && C.mode == 'off' && C.prompt_section.empty?
|
|
41
|
+
|
|
42
|
+
ENV['RUBYN_CHISEL_MODE'] = 'lite'
|
|
43
|
+
lite = C.prompt_section
|
|
44
|
+
lite_ok = C.enabled? && lite.include?(C::LADDER) && lite.include?(C::SAFETY_FLOOR) && !lite.include?(C::FULL_ADDENDUM)
|
|
45
|
+
|
|
46
|
+
ENV['RUBYN_CHISEL_MODE'] = 'full'
|
|
47
|
+
full = C.prompt_section
|
|
48
|
+
full_ok = full.include?(C::FULL_ADDENDUM) && full.include?(C::SAFETY_FLOOR) && !full.include?(C::ULTRA_ADDENDUM)
|
|
49
|
+
|
|
50
|
+
ENV['RUBYN_CHISEL_MODE'] = 'ultra'
|
|
51
|
+
ultra = C.prompt_section
|
|
52
|
+
ultra_ok = ultra.include?(C::ULTRA_ADDENDUM) && ultra.include?(C::SAFETY_FLOOR)
|
|
53
|
+
|
|
54
|
+
ENV['RUBYN_CHISEL_MODE'] = 'definitely-not-a-mode'
|
|
55
|
+
typo_ok = C::MODES.include?(C.mode) && C.mode != 'definitely-not-a-mode'
|
|
56
|
+
ENV.delete('RUBYN_CHISEL_MODE')
|
|
57
|
+
results['engine'] = off_ok && lite_ok && full_ok && ultra_ok && typo_ok
|
|
58
|
+
|
|
59
|
+
# 3. Inspection — the :diff prompt is a String carrying the ladder + safety
|
|
60
|
+
# floor; the :repo prompt carries the ladder and names the fixture dir; an unknown scope raises ArgumentError.
|
|
61
|
+
insp = RubynCode::Chisel::Inspection
|
|
62
|
+
diff_p = insp.prompt(scope: :diff, target: 'main')
|
|
63
|
+
repo_p = insp.prompt(scope: :repo, target: FIXTURE_DIR)
|
|
64
|
+
raised = begin
|
|
65
|
+
insp.prompt(scope: :bogus)
|
|
66
|
+
false
|
|
67
|
+
rescue ArgumentError
|
|
68
|
+
true
|
|
69
|
+
end
|
|
70
|
+
results['inspection'] = diff_p.is_a?(String) && diff_p.include?(C::LADDER) &&
|
|
71
|
+
diff_p.include?(C::SAFETY_FLOOR) &&
|
|
72
|
+
repo_p.include?(C::LADDER) && repo_p.include?(FIXTURE_DIR) && raised
|
|
73
|
+
|
|
74
|
+
# 4. Command registry — all five Chisel commands register and resolve by name.
|
|
75
|
+
reg = RubynCode::CLI::Commands::Registry.new
|
|
76
|
+
[RubynCode::CLI::Commands::Chisel, RubynCode::CLI::Commands::ChiselReview,
|
|
77
|
+
RubynCode::CLI::Commands::ChiselAudit, RubynCode::CLI::Commands::ChiselDebt,
|
|
78
|
+
RubynCode::CLI::Commands::ChiselGain].each { |cmd| reg.register(cmd) }
|
|
79
|
+
results['commands'] = %w[/chisel /chisel-review /chisel-audit /chisel-debt /chisel-gain].all? { |n| reg.known?(n) }
|
|
80
|
+
|
|
81
|
+
results.each { |area, ok| puts "CHISEL #{area}: #{ok ? 'PASS' : 'FAIL'}" }
|
|
82
|
+
all_ok = results.values.all?
|
|
83
|
+
puts(all_ok ? 'CHISEL: PASS' : 'CHISEL: FAIL')
|
|
84
|
+
exit(all_ok ? 0 : 1)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Chisel self-test fixture — a DELIBERATELY over-engineered Ruby file.
|
|
4
|
+
#
|
|
5
|
+
# rubyn-code points Chisel at this file to get consistent, repeatable results:
|
|
6
|
+
# - `Chisel::Debt.scan` on this directory must harvest EXACTLY the three
|
|
7
|
+
# own-line `chisel:` markers below — and none of the decoys at the bottom.
|
|
8
|
+
# - `/chisel-review` and `/chisel-audit` have real over-engineering to flag.
|
|
9
|
+
#
|
|
10
|
+
# Do NOT "clean this up" — the smells and the markers are the point. The smoke
|
|
11
|
+
# test that asserts on this file lives in skills/self_test/chisel_smoke.rb and
|
|
12
|
+
# spec/rubyn_code/chisel/self_test_fixture_spec.rb. If you change a marker, the
|
|
13
|
+
# line/note it sits on, or add/remove one, update those two in lock-step.
|
|
14
|
+
module ChiselFixture
|
|
15
|
+
# An abstract factory with exactly one product shape — classic premature
|
|
16
|
+
# abstraction. A plain method (or just calling the class) would do.
|
|
17
|
+
class GreeterFactory
|
|
18
|
+
# chisel: collapse this factory into a single build method
|
|
19
|
+
def self.create(kind)
|
|
20
|
+
case kind
|
|
21
|
+
when :formal then FormalGreeter.new
|
|
22
|
+
when :casual then CasualGreeter.new
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
class FormalGreeter
|
|
28
|
+
def greet(name) = "Good day, #{name}."
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
class CasualGreeter
|
|
32
|
+
def greet(name) = "hey #{name}"
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# A stateful wrapper that adds nothing over Array#sum.
|
|
36
|
+
class Accumulator
|
|
37
|
+
def initialize = (@total = 0)
|
|
38
|
+
|
|
39
|
+
# chisel: replace this class with Array#sum at the single call site
|
|
40
|
+
def add(amount)
|
|
41
|
+
@total += amount
|
|
42
|
+
self
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def total = @total
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Single-reader config indirection.
|
|
49
|
+
DEFAULTS = { retries: 3 }.freeze
|
|
50
|
+
|
|
51
|
+
def self.retries
|
|
52
|
+
# chisel: inline DEFAULTS[:retries] since there is only one reader
|
|
53
|
+
DEFAULTS.fetch(:retries)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# --- decoys: these MUST NOT be harvested as debt markers ---
|
|
57
|
+
|
|
58
|
+
def self.decoy
|
|
59
|
+
# The next line has "chisel:" inside a string AND as a trailing comment;
|
|
60
|
+
# neither is an own-line marker, so the scanner must ignore both.
|
|
61
|
+
label = 'see # chisel: this is data, not a marker' # chisel: trailing, ignored
|
|
62
|
+
label
|
|
63
|
+
end
|
|
64
|
+
end
|