ruby-claw 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +94 -0
- data/README.md +214 -10
- data/exe/claw +42 -1
- data/lib/claw/auto_forge.rb +66 -0
- data/lib/claw/benchmark/benchmark.rb +79 -0
- data/lib/claw/benchmark/diff.rb +69 -0
- data/lib/claw/benchmark/report.rb +87 -0
- data/lib/claw/benchmark/runner.rb +91 -0
- data/lib/claw/benchmark/scorer.rb +69 -0
- data/lib/claw/benchmark/task.rb +63 -0
- data/lib/claw/benchmark/tasks/claw_remember.rb +20 -0
- data/lib/claw/benchmark/tasks/claw_session.rb +18 -0
- data/lib/claw/benchmark/tasks/evolution_trace.rb +18 -0
- data/lib/claw/benchmark/tasks/mana_call_func.rb +21 -0
- data/lib/claw/benchmark/tasks/mana_eval.rb +18 -0
- data/lib/claw/benchmark/tasks/mana_knowledge.rb +19 -0
- data/lib/claw/benchmark/tasks/mana_var_readwrite.rb +18 -0
- data/lib/claw/benchmark/tasks/runtime_fork.rb +18 -0
- data/lib/claw/benchmark/tasks/runtime_snapshot.rb +18 -0
- data/lib/claw/benchmark/trigger.rb +68 -0
- data/lib/claw/chat.rb +119 -6
- data/lib/claw/child_runtime.rb +196 -0
- data/lib/claw/cli.rb +177 -0
- data/lib/claw/commands.rb +131 -0
- data/lib/claw/config.rb +5 -1
- data/lib/claw/console/event_logger.rb +69 -0
- data/lib/claw/console/public/app.js +264 -0
- data/lib/claw/console/public/style.css +330 -0
- data/lib/claw/console/server.rb +253 -0
- data/lib/claw/console/sse.rb +28 -0
- data/lib/claw/console/views/experiments.erb +8 -0
- data/lib/claw/console/views/index.erb +27 -0
- data/lib/claw/console/views/layout.erb +29 -0
- data/lib/claw/console/views/memory.erb +13 -0
- data/lib/claw/console/views/monitor.erb +15 -0
- data/lib/claw/console/views/prompt.erb +15 -0
- data/lib/claw/console/views/snapshots.erb +12 -0
- data/lib/claw/console/views/tools.erb +13 -0
- data/lib/claw/console/views/traces.erb +9 -0
- data/lib/claw/console.rb +5 -0
- data/lib/claw/evolution.rb +227 -0
- data/lib/claw/forge.rb +144 -0
- data/lib/claw/hub.rb +67 -0
- data/lib/claw/init.rb +199 -0
- data/lib/claw/knowledge.rb +36 -2
- data/lib/claw/memory_store.rb +2 -2
- data/lib/claw/plan_mode.rb +110 -0
- data/lib/claw/resource.rb +35 -0
- data/lib/claw/resources/binding_resource.rb +128 -0
- data/lib/claw/resources/context_resource.rb +73 -0
- data/lib/claw/resources/filesystem_resource.rb +107 -0
- data/lib/claw/resources/memory_resource.rb +74 -0
- data/lib/claw/resources/worktree_resource.rb +133 -0
- data/lib/claw/roles.rb +56 -0
- data/lib/claw/runtime.rb +189 -0
- data/lib/claw/serializer.rb +10 -7
- data/lib/claw/tool.rb +99 -0
- data/lib/claw/tool_index.rb +84 -0
- data/lib/claw/tool_registry.rb +100 -0
- data/lib/claw/trace.rb +86 -0
- data/lib/claw/tui/agent_executor.rb +92 -0
- data/lib/claw/tui/chat_panel.rb +81 -0
- data/lib/claw/tui/command_bar.rb +22 -0
- data/lib/claw/tui/file_card.rb +88 -0
- data/lib/claw/tui/folding.rb +80 -0
- data/lib/claw/tui/input_handler.rb +73 -0
- data/lib/claw/tui/layout.rb +34 -0
- data/lib/claw/tui/messages.rb +31 -0
- data/lib/claw/tui/model.rb +411 -0
- data/lib/claw/tui/object_explorer.rb +136 -0
- data/lib/claw/tui/status_bar.rb +30 -0
- data/lib/claw/tui/status_panel.rb +133 -0
- data/lib/claw/tui/styles.rb +58 -0
- data/lib/claw/tui/tui.rb +54 -0
- data/lib/claw/version.rb +1 -1
- data/lib/claw.rb +99 -1
- metadata +223 -7
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "fileutils"

module Claw
  module Benchmark
    # Generate Markdown benchmark reports.
    module Report
      # Build the complete Markdown report for a benchmark suite.
      #
      # @param suite [SuiteResult]
      # @return [String] Markdown report
      def self.generate(suite)
        out = ["# Benchmark Report\n"]
        out << "**Date:** #{suite.timestamp.strftime('%Y-%m-%d %H:%M:%S')}"
        out << ""
        summary_section(out, suite)
        layer_section(out, suite)
        task_section(out, suite)
        out.join("\n")
      end

      # Persist a report under <claw_dir>/benchmarks/ with a timestamped name.
      #
      # @param report_text [String] Markdown content
      # @param claw_dir [String] path to .ruby-claw/
      # @return [String] path of the written file
      def self.save(report_text, claw_dir = ".ruby-claw")
        dir = File.join(claw_dir, "benchmarks")
        FileUtils.mkdir_p(dir)
        path = File.join(dir, "#{Time.now.strftime('%Y-%m-%d_%H%M%S')}.md")
        File.write(path, report_text)
        path
      end

      # Append the "## Summary" table: task totals, suite score, pass rates.
      def self.summary_section(out, suite)
        task_count = suite.results.size
        all_pass = suite.results.count { |r| r.pass_rate == 1.0 }
        out << "## Summary" << ""
        out << "| Metric | Value |"
        out << "|--------|-------|"
        out << "| Total tasks | #{task_count} |"
        out << "| Suite score | #{suite.suite_score.round(1)} |"
        out << "| Pass rate | #{(suite.pass_rate * 100).round(1)}% |"
        out << "| All-pass tasks | #{all_pass}/#{task_count} |"
        out << ""
      end

      # Append the "## By Layer" table: per-layer pass rate and average score.
      def self.layer_section(out, suite)
        out << "## By Layer" << ""
        out << "| Layer | Tasks | Pass Rate | Avg Score |"
        out << "|-------|-------|-----------|-----------|"
        suite.results.group_by { |r| r.task.layer }.each do |layer, group|
          rate = group.sum(&:pass_rate) / group.size * 100
          score = group.sum(&:avg_score) / group.size
          out << "| #{layer} | #{group.size} | #{rate.round(1)}% | #{score.round(1)} |"
        end
        out << ""
      end

      # Append "## Task Details": one "### <id>" block per task with its
      # aggregate score, a per-run table, and any run error messages.
      def self.task_section(out, suite)
        out << "## Task Details" << ""
        suite.results.each do |task_result|
          out << "### #{task_result.task.id} (#{task_result.task.layer})" << ""
          out << "- **Score:** #{task_result.avg_score.round(1)}"
          out << "- **Pass rate:** #{(task_result.pass_rate * 100).round(0)}%"
          out << ""
          out << "| Run | Correct | Rounds | Tokens | Time (ms) | Path |"
          out << "|-----|---------|--------|--------|-----------|------|"
          task_result.runs.each_with_index do |run, idx|
            mark = run.correct ? "✓" : "✗"
            path = (run.tool_path || []).join(" → ")
            out << "| #{idx + 1} | #{mark} | #{run.rounds} | #{run.tokens} | #{run.elapsed_ms} | #{path} |"
          end
          if task_result.runs.any? { |r| r.error }
            out << ""
            task_result.runs.each_with_index do |run, idx|
              out << "- Run #{idx + 1} error: #{run.error}" if run.error
            end
          end
          out << ""
        end
      end

      private_class_method :summary_section, :layer_section, :task_section
    end
  end
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # Executes benchmark tasks. Each task runs 3 times with a clean environment.
    class Runner
      # Fixed number of repetitions per task; results are aggregated per task.
      RUNS_PER_TASK = 3

      # Run the entire benchmark suite.
      #
      # @param tasks [Array<Task>] tasks to run
      # @param on_progress [Proc, nil] called with (task_id, run_index, total, completed)
      #   after each individual run finishes
      # @return [SuiteResult]
      def run_all(tasks, &on_progress)
        total = tasks.size * RUNS_PER_TASK
        completed = 0

        results = tasks.map do |task|
          runs = RUNS_PER_TASK.times.map do |i|
            result = run_once(task)
            completed += 1
            on_progress&.call(task.id, i + 1, total, completed)
            result
          end
          TaskResult.new(task: task, runs: runs)
        end

        SuiteResult.new(results: results, timestamp: Time.now)
      end

      # Execute a single task run with a clean environment.
      #
      # Any exception raised anywhere in the run (setup, engine execution,
      # result collection) is caught by the method-level rescue and converted
      # into a zeroed RunResult carrying the error message.
      #
      # @param task [Task]
      # @return [RunResult]
      def run_once(task)
        # Create isolated binding — a fresh anonymous object's binding, so
        # tasks cannot see each other's (or the runner's) local variables.
        isolated_binding = Object.new.instance_eval { binding }

        # Setup: inject variables
        vars = task.setup.call
        vars.each { |k, v| isolated_binding.local_variable_set(k, v) }

        # Create minimal runtime
        # NOTE(review): `runtime` is not referenced again after setup —
        # presumably registering the binding resource and taking the snapshot
        # wire up state the engine reads globally; confirm against Runtime.
        runtime = Claw::Runtime.new
        runtime.register("binding", Claw::Resources::BindingResource.new(isolated_binding))
        runtime.snapshot!(label: "bench_start")

        # Execute (monotonic clock so wall-clock adjustments don't skew timing)
        t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        engine = Mana::Engine.new(isolated_binding)
        engine.execute(task.prompt)
        elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0) * 1000).round

        # Collect results from the engine's trace. Usage keys are looked up as
        # both symbols and strings since the trace origin is not visible here.
        trace = engine.trace_data || {}
        steps = trace[:steps] || []
        tool_path = steps.flat_map { |s| (s[:tool_calls] || []).map { |tc| tc[:name] } }
        total_tokens = steps.sum { |s|
          u = s[:usage] || {}
          (u[:input_tokens] || u["input_tokens"] || 0).to_i +
          (u[:output_tokens] || u["output_tokens"] || 0).to_i
        }
        rounds = trace[:total_iterations] || steps.size

        # Correctness check: a raising expectation counts as incorrect rather
        # than aborting the run.
        correct = begin
          task.expect.call(isolated_binding)
        rescue => e
          false
        end

        RunResult.new(
          correct: correct,
          rounds: rounds,
          tokens: total_tokens,
          tool_path: tool_path,
          elapsed_ms: elapsed_ms,
          error: nil
        )
      rescue => e
        # Whole-run failure: report a zeroed result with the error attached.
        RunResult.new(
          correct: false,
          rounds: 0,
          tokens: 0,
          tool_path: [],
          elapsed_ms: 0,
          error: "#{e.class}: #{e.message}"
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # Scoring formula for benchmark runs.
    #
    # task_score = correctness * 100
    #              - (actual_rounds / max_rounds) * 20
    #              - (actual_tokens / max_tokens) * 20
    #              - path_penalty
    # The result is clamped to a minimum of 0.
    module Scorer
      # Score a single run against its task definition.
      #
      # @param run [RunResult]
      # @param task [Task]
      # @return [Float] score in 0.0..100.0 (penalties cannot push it below zero)
      def self.score_run(run, task)
        correctness = run.correct ? 100.0 : 0.0

        # Budget penalties scale linearly with usage: a run that exactly hits
        # its budget loses 20 points per dimension. Zero/absent budgets are
        # treated as "unconstrained" to avoid division by zero.
        rounds_penalty = task.max_rounds > 0 ? (run.rounds.to_f / task.max_rounds) * 20.0 : 0.0
        tokens_penalty = task.max_tokens > 0 ? (run.tokens.to_f / task.max_tokens) * 20.0 : 0.0
        path_pen = path_penalty(run.tool_path || [], task.ideal_path || [])

        # Clamp at zero: heavy penalties never produce a negative score.
        [correctness - rounds_penalty - tokens_penalty - path_pen, 0.0].max
      end

      # Calculate path penalty using edit distance between actual and ideal tool sequences.
      #
      # @param actual [Array<String>] actual tool call sequence
      # @param ideal [Array<String>] expected tool call sequence
      # @return [Float] penalty (0 if paths match, higher for more divergence)
      def self.path_penalty(actual, ideal)
        # An empty ideal path means the route is unconstrained.
        return 0.0 if ideal.empty?

        distance = levenshtein(actual, ideal)
        # Normalize: each edit costs 5 points, max penalty 20.
        [distance * 5.0, 20.0].min
      end

      # Levenshtein distance between two arrays of strings
      # (classic dynamic-programming table).
      def self.levenshtein(a, b)
        m = a.size
        n = b.size
        d = Array.new(m + 1) { Array.new(n + 1, 0) }

        (0..m).each { |i| d[i][0] = i }
        (0..n).each { |j| d[0][j] = j }

        (1..m).each do |i|
          (1..n).each do |j|
            cost = a[i - 1] == b[j - 1] ? 0 : 1
            d[i][j] = [
              d[i - 1][j] + 1,       # deletion
              d[i][j - 1] + 1,       # insertion
              d[i - 1][j - 1] + cost # substitution (free on match)
            ].min
          end
        end
        d[m][n]
      end

      private_class_method :levenshtein
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # A single benchmark task definition.
    Task = Struct.new(
      :id,          # unique task identifier (String)
      :layer,       # :mana, :claw, :runtime, :evolution
      :setup,       # Proc -> Hash of variables to inject into binding
      :prompt,      # String prompt to send to the engine
      :expect,      # Proc(binding) -> boolean (correctness check)
      :max_rounds,  # maximum allowed LLM iterations
      :max_tokens,  # maximum allowed token usage
      :ideal_path,  # Array<String> of expected tool call sequence
      keyword_init: true
    )

    # Result of a single run (one of 3 per task).
    RunResult = Struct.new(
      :correct,     # boolean
      :rounds,      # actual LLM iterations
      :tokens,      # actual token usage (input + output)
      :tool_path,   # Array<String> actual tool call sequence
      :elapsed_ms,  # execution time in milliseconds
      :error,       # exception message if failed, nil otherwise
      keyword_init: true
    )

    # Aggregated result for one task across multiple runs.
    TaskResult = Struct.new(
      :task, # Task instance
      :runs, # Array<RunResult>
      keyword_init: true
    ) do
      # Fraction of runs whose correctness check passed (0.0..1.0).
      def pass_rate
        return 0.0 if runs.empty?
        runs.count(&:correct).to_f / runs.size
      end

      # Mean Scorer score over all runs.
      # Guard against empty runs: 0 / 0.0 would otherwise yield Float::NAN.
      def avg_score
        return 0.0 if runs.empty?
        scores = runs.map { |r| Scorer.score_run(r, task) }
        scores.sum / scores.size.to_f
      end
    end

    # Suite-level result across all tasks.
    SuiteResult = Struct.new(
      :results,   # Array<TaskResult>
      :timestamp, # Time
      keyword_init: true
    ) do
      # Mean of the per-task average scores.
      def suite_score
        return 0.0 if results.empty?
        results.sum(&:avg_score) / results.size.to_f
      end

      # Mean of the per-task pass rates (0.0..1.0).
      def pass_rate
        return 0.0 if results.empty?
        results.sum(&:pass_rate) / results.size.to_f
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (claw layer): read a binding variable and persist it to
# long-term memory via the `remember` tool.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "claw_remember",
    layer: :claw,
    # Inject the fact the agent is asked to memorize.
    setup: -> {
      { important_fact: "The project deadline is March 15th" }
    },
    prompt: "Read the `important_fact` variable and remember it using the remember tool.",
    # Pass when any long-term memory entry contains the deadline string.
    # NOTE(review): reads the process-global Claw.memory rather than the run's
    # binding — assumes memory state is shared with (and reset between)
    # benchmark runs; verify isolation against the Runner.
    expect: ->(b) {
      memory = Claw.memory
      return false unless memory
      memory.long_term.any? { |m| m[:content].include?("March 15th") }
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var remember]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (claw layer): combine two variables within one session —
# read `counter` and `step`, then write back their sum.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "claw_session",
    layer: :claw,
    setup: -> {
      { counter: 0, step: 5 }
    },
    prompt: "Read both `counter` and `step`, then set `counter` to `counter + step`.",
    # 0 + 5 => counter must end up 5.
    expect: ->(b) {
      b.local_variable_get(:counter) == 5
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var read_var write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (evolution layer): sum an array and store the result.
# NOTE(review): despite the :evolution layer tag, the check only inspects the
# binding — any evolution/trace behavior is exercised implicitly; confirm intent.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "evolution_trace",
    layer: :evolution,
    setup: -> {
      { items: [1, 2, 3], total: nil }
    },
    prompt: "Calculate the sum of all elements in `items` and store it in `total`.",
    # 1 + 2 + 3 => 6
    expect: ->(b) {
      b.local_variable_get(:total) == 6
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var eval_code write_var]
  )
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): invoke a lambda stored in the binding and
# capture its return value in another variable.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_call_func",
    layer: :mana,
    setup: -> {
      {
        greet: ->(name) { "Hello, #{name}!" },
        result: nil
      }
    },
    prompt: "Call the `greet` function with the argument \"World\" and store the return value in `result`.",
    # greet.call("World") => "Hello, World!"
    expect: ->(b) {
      b.local_variable_get(:result) == "Hello, World!"
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var call_function write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): evaluate code against a binding variable —
# sort an array descending, overwriting the original.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_eval",
    layer: :mana,
    setup: -> {
      { numbers: [3, 7, 2, 9, 1] }
    },
    prompt: "Sort the `numbers` array in descending order and store the result back in `numbers`.",
    expect: ->(b) {
      b.local_variable_get(:numbers) == [9, 7, 3, 2, 1]
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var eval_code]
  )
)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): use the knowledge-lookup tool, then record an
# answer in the binding.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_knowledge",
    layer: :mana,
    setup: -> {
      { answer: nil }
    },
    prompt: "Use knowledge lookup to find what the Array#flatten method does, then set `answer` to the string \"recursive flatten\".",
    # Deliberately looser than the prompt: any String mentioning "flatten"
    # passes, not only the exact "recursive flatten".
    expect: ->(b) {
      val = b.local_variable_get(:answer)
      val.is_a?(String) && val.downcase.include?("flatten")
    },
    max_rounds: 4,
    max_tokens: 3000,
    ideal_path: %w[knowledge_lookup write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): sequential variable writes where the second
# depends on the first (`y` must use the NEW value of `x`).
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_var_readwrite",
    layer: :mana,
    setup: -> {
      { x: 10, y: 20 }
    },
    prompt: "Set the variable `x` to 42 and `y` to the current value of `x` plus 8.",
    # x => 42, then y => 42 + 8 = 50 (updated x, not the original 10).
    expect: ->(b) {
      b.local_variable_get(:x) == 42 && b.local_variable_get(:y) == 50
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var write_var read_var write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (runtime layer): read / compute / write round-trip.
# NOTE(review): the id suggests runtime forking, but the check only verifies
# the doubled value — fork behavior itself is not asserted here; confirm coverage.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "runtime_fork",
    layer: :runtime,
    setup: -> {
      { value: 100, doubled: nil }
    },
    prompt: "Read `value`, compute its double, and store the result in `doubled`.",
    # 100 * 2 => 200
    expect: ->(b) {
      b.local_variable_get(:doubled) == 200
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var eval_code write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (runtime layer): mutate a tracked variable.
# NOTE(review): the prompt promises "the runtime will track the change", but
# only the final variable value is asserted — snapshot/diff tracking itself
# is not checked here; confirm that is covered elsewhere.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "runtime_snapshot",
    layer: :runtime,
    setup: -> {
      { data: "original" }
    },
    prompt: "Read `data`, then change `data` to \"modified\". The runtime will track the change.",
    expect: ->(b) {
      b.local_variable_get(:data) == "modified"
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var write_var]
  )
)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # Automatic evolution triggers based on benchmark results.
    # Event-driven: checks after each benchmark run or trace write.
    class Trigger
      # @param runtime [Object, nil] runtime used for event logging (may be nil)
      # @param claw_dir [String] path to the .ruby-claw directory
      def initialize(runtime:, claw_dir: ".ruby-claw")
        @runtime = runtime
        @claw_dir = claw_dir
        @mutex = Mutex.new
        @evolution_running = false
      end

      # Check after a benchmark run completes.
      # Triggers evolution if the suite score regressed.
      #
      # @param current_score [Float] latest suite score
      # @param previous_score [Float, nil] previous suite score (nil on first run)
      def check_after_benchmark!(current_score, previous_score)
        return unless previous_score
        return if current_score >= previous_score

        trigger!(
          reason: "score_regression",
          detail: "#{previous_score.round(1)} → #{current_score.round(1)}"
        )
      end

      # Check after a trace is written.
      # Triggers evolution if the same task failed 3 consecutive times.
      #
      # @param task_id [String]
      # @param recent_results [Array<Boolean>] last N correctness results, oldest first
      def check_after_trace!(task_id, recent_results)
        return if recent_results.size < 3
        return unless recent_results.last(3).none?

        trigger!(
          reason: "consecutive_failures",
          detail: "#{task_id} failed 3 times in a row"
        )
      end

      private

      # Run one evolution cycle, guarded so only one runs at a time.
      #
      # Fix: the running flag is now atomically tested-and-set in a single
      # synchronize block. The previous check-then-act (read the flag in one
      # critical section, set it in a later one) let two threads both observe
      # `false` and start evolution concurrently.
      def trigger!(reason:, detail:)
        already_running = @mutex.synchronize do
          was_running = @evolution_running
          @evolution_running = true
          was_running
        end
        return if already_running

        @runtime&.record_event(
          action: "evolution_triggered",
          target: reason,
          detail: detail
        )

        begin
          evo = Claw::Evolution.new(runtime: @runtime, claw_dir: @claw_dir)
          evo.evolve
        ensure
          # Always clear the flag, even if evolution raises.
          @mutex.synchronize { @evolution_running = false }
        end
      end
    end
  end
end
|