ruby-claw 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +94 -0
  3. data/README.md +214 -10
  4. data/exe/claw +42 -1
  5. data/lib/claw/auto_forge.rb +66 -0
  6. data/lib/claw/benchmark/benchmark.rb +79 -0
  7. data/lib/claw/benchmark/diff.rb +69 -0
  8. data/lib/claw/benchmark/report.rb +87 -0
  9. data/lib/claw/benchmark/runner.rb +91 -0
  10. data/lib/claw/benchmark/scorer.rb +69 -0
  11. data/lib/claw/benchmark/task.rb +63 -0
  12. data/lib/claw/benchmark/tasks/claw_remember.rb +20 -0
  13. data/lib/claw/benchmark/tasks/claw_session.rb +18 -0
  14. data/lib/claw/benchmark/tasks/evolution_trace.rb +18 -0
  15. data/lib/claw/benchmark/tasks/mana_call_func.rb +21 -0
  16. data/lib/claw/benchmark/tasks/mana_eval.rb +18 -0
  17. data/lib/claw/benchmark/tasks/mana_knowledge.rb +19 -0
  18. data/lib/claw/benchmark/tasks/mana_var_readwrite.rb +18 -0
  19. data/lib/claw/benchmark/tasks/runtime_fork.rb +18 -0
  20. data/lib/claw/benchmark/tasks/runtime_snapshot.rb +18 -0
  21. data/lib/claw/benchmark/trigger.rb +68 -0
  22. data/lib/claw/chat.rb +119 -6
  23. data/lib/claw/child_runtime.rb +196 -0
  24. data/lib/claw/cli.rb +177 -0
  25. data/lib/claw/commands.rb +131 -0
  26. data/lib/claw/config.rb +5 -1
  27. data/lib/claw/console/event_logger.rb +69 -0
  28. data/lib/claw/console/public/app.js +264 -0
  29. data/lib/claw/console/public/style.css +330 -0
  30. data/lib/claw/console/server.rb +253 -0
  31. data/lib/claw/console/sse.rb +28 -0
  32. data/lib/claw/console/views/experiments.erb +8 -0
  33. data/lib/claw/console/views/index.erb +27 -0
  34. data/lib/claw/console/views/layout.erb +29 -0
  35. data/lib/claw/console/views/memory.erb +13 -0
  36. data/lib/claw/console/views/monitor.erb +15 -0
  37. data/lib/claw/console/views/prompt.erb +15 -0
  38. data/lib/claw/console/views/snapshots.erb +12 -0
  39. data/lib/claw/console/views/tools.erb +13 -0
  40. data/lib/claw/console/views/traces.erb +9 -0
  41. data/lib/claw/console.rb +5 -0
  42. data/lib/claw/evolution.rb +227 -0
  43. data/lib/claw/forge.rb +144 -0
  44. data/lib/claw/hub.rb +67 -0
  45. data/lib/claw/init.rb +199 -0
  46. data/lib/claw/knowledge.rb +36 -2
  47. data/lib/claw/memory_store.rb +2 -2
  48. data/lib/claw/plan_mode.rb +110 -0
  49. data/lib/claw/resource.rb +35 -0
  50. data/lib/claw/resources/binding_resource.rb +128 -0
  51. data/lib/claw/resources/context_resource.rb +73 -0
  52. data/lib/claw/resources/filesystem_resource.rb +107 -0
  53. data/lib/claw/resources/memory_resource.rb +74 -0
  54. data/lib/claw/resources/worktree_resource.rb +133 -0
  55. data/lib/claw/roles.rb +56 -0
  56. data/lib/claw/runtime.rb +189 -0
  57. data/lib/claw/serializer.rb +10 -7
  58. data/lib/claw/tool.rb +99 -0
  59. data/lib/claw/tool_index.rb +84 -0
  60. data/lib/claw/tool_registry.rb +100 -0
  61. data/lib/claw/trace.rb +86 -0
  62. data/lib/claw/tui/agent_executor.rb +92 -0
  63. data/lib/claw/tui/chat_panel.rb +81 -0
  64. data/lib/claw/tui/command_bar.rb +22 -0
  65. data/lib/claw/tui/file_card.rb +88 -0
  66. data/lib/claw/tui/folding.rb +80 -0
  67. data/lib/claw/tui/input_handler.rb +73 -0
  68. data/lib/claw/tui/layout.rb +34 -0
  69. data/lib/claw/tui/messages.rb +31 -0
  70. data/lib/claw/tui/model.rb +411 -0
  71. data/lib/claw/tui/object_explorer.rb +136 -0
  72. data/lib/claw/tui/status_bar.rb +30 -0
  73. data/lib/claw/tui/status_panel.rb +133 -0
  74. data/lib/claw/tui/styles.rb +58 -0
  75. data/lib/claw/tui/tui.rb +54 -0
  76. data/lib/claw/version.rb +1 -1
  77. data/lib/claw.rb +99 -1
  78. metadata +223 -7
@@ -0,0 +1,87 @@
1
# frozen_string_literal: true

require "fileutils"

module Claw
  module Benchmark
    # Generate Markdown benchmark reports.
    module Report
      # Generate a full report from suite results.
      #
      # The report has three sections: an overall summary table, a per-layer
      # breakdown, and per-task run details.
      #
      # @param suite [SuiteResult]
      # @return [String] Markdown report
      def self.generate(suite)
        sections = []
        sections.concat(header_lines(suite))
        sections.concat(summary_lines(suite))
        sections.concat(layer_lines(suite))
        sections.concat(detail_lines(suite))
        sections.join("\n")
      end

      # Save report to .ruby-claw/benchmarks/, named by timestamp.
      #
      # @param report_text [String] Markdown content
      # @param claw_dir [String] path to .ruby-claw/
      # @return [String] file path of the written report
      def self.save(report_text, claw_dir = ".ruby-claw")
        target_dir = File.join(claw_dir, "benchmarks")
        FileUtils.mkdir_p(target_dir)
        stamp = Time.now.strftime("%Y-%m-%d_%H%M%S")
        path = File.join(target_dir, "#{stamp}.md")
        File.write(path, report_text)
        path
      end

      # Title and timestamp lines.
      def self.header_lines(suite)
        [
          "# Benchmark Report\n",
          "**Date:** #{suite.timestamp.strftime('%Y-%m-%d %H:%M:%S')}",
          ""
        ]
      end

      # Overall metrics table.
      def self.summary_lines(suite)
        total = suite.results.size
        passed = suite.results.count { |r| r.pass_rate == 1.0 }
        [
          "## Summary",
          "",
          "| Metric | Value |",
          "|--------|-------|",
          "| Total tasks | #{total} |",
          "| Suite score | #{suite.suite_score.round(1)} |",
          "| Pass rate | #{(suite.pass_rate * 100).round(1)}% |",
          "| All-pass tasks | #{passed}/#{total} |",
          ""
        ]
      end

      # Per-layer aggregate table.
      def self.layer_lines(suite)
        out = [
          "## By Layer",
          "",
          "| Layer | Tasks | Pass Rate | Avg Score |",
          "|-------|-------|-----------|-----------|"
        ]
        suite.results.group_by { |r| r.task.layer }.each do |layer, group|
          rate = group.sum(&:pass_rate) / group.size * 100
          score = group.sum(&:avg_score) / group.size
          out << "| #{layer} | #{group.size} | #{rate.round(1)}% | #{score.round(1)} |"
        end
        out << ""
        out
      end

      # Per-task sections with one row per run, plus error notes when any
      # run recorded an error.
      def self.detail_lines(suite)
        out = ["## Task Details", ""]
        suite.results.each do |tr|
          out << "### #{tr.task.id} (#{tr.task.layer})"
          out << ""
          out << "- **Score:** #{tr.avg_score.round(1)}"
          out << "- **Pass rate:** #{(tr.pass_rate * 100).round(0)}%"
          out << ""
          out << "| Run | Correct | Rounds | Tokens | Time (ms) | Path |"
          out << "|-----|---------|--------|--------|-----------|------|"
          tr.runs.each_with_index do |run, i|
            joined = (run.tool_path || []).join(" → ")
            flag = run.correct ? "✓" : "✗"
            out << "| #{i + 1} | #{flag} | #{run.rounds} | #{run.tokens} | #{run.elapsed_ms} | #{joined} |"
          end
          if tr.runs.any? { |r| r.error }
            out << ""
            tr.runs.each_with_index do |run, i|
              out << "- Run #{i + 1} error: #{run.error}" if run.error
            end
          end
          out << ""
        end
        out
      end

      private_class_method :header_lines, :summary_lines, :layer_lines, :detail_lines
    end
  end
end
@@ -0,0 +1,91 @@
1
# frozen_string_literal: true

module Claw
  module Benchmark
    # Executes benchmark tasks. Each task runs RUNS_PER_TASK times, each run
    # in a freshly created, isolated binding so runs cannot leak state.
    class Runner
      RUNS_PER_TASK = 3

      # Run the entire benchmark suite.
      #
      # @param tasks [Array<Task>] tasks to run
      # @param on_progress [Proc, nil] invoked after every run with
      #   (task_id, run_index, total_runs, completed_runs)
      # @return [SuiteResult]
      def run_all(tasks, &on_progress)
        total = tasks.size * RUNS_PER_TASK
        completed = 0

        results = tasks.map do |task|
          runs = Array.new(RUNS_PER_TASK) do |i|
            result = run_once(task)
            completed += 1
            on_progress&.call(task.id, i + 1, total, completed)
            result
          end
          TaskResult.new(task: task, runs: runs)
        end

        SuiteResult.new(results: results, timestamp: Time.now)
      end

      # Execute a single task run with a clean environment.
      #
      # Any StandardError raised during setup or execution is captured and
      # reported via RunResult#error rather than propagated, so one broken
      # task cannot abort the whole suite.
      #
      # @param task [Task]
      # @return [RunResult]
      def run_once(task)
        # Create an isolated binding so task variables neither read nor
        # pollute the benchmark process's own scope.
        isolated_binding = Object.new.instance_eval { binding }

        # Setup: inject the task's starting variables.
        vars = task.setup.call
        vars.each { |k, v| isolated_binding.local_variable_set(k, v) }

        # Create a minimal runtime with only the binding resource registered.
        runtime = Claw::Runtime.new
        runtime.register("binding", Claw::Resources::BindingResource.new(isolated_binding))
        runtime.snapshot!(label: "bench_start")

        # Execute the prompt; use the monotonic clock to measure elapsed time.
        t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        engine = Mana::Engine.new(isolated_binding)
        engine.execute(task.prompt)
        elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0) * 1000).round

        # Collect results from the engine trace. Usage hashes may carry
        # symbol or string keys, so both are checked.
        trace = engine.trace_data || {}
        steps = trace[:steps] || []
        tool_path = steps.flat_map { |s| (s[:tool_calls] || []).map { |tc| tc[:name] } }
        total_tokens = steps.sum do |s|
          u = s[:usage] || {}
          (u[:input_tokens] || u["input_tokens"] || 0).to_i +
            (u[:output_tokens] || u["output_tokens"] || 0).to_i
        end
        rounds = trace[:total_iterations] || steps.size

        # A raising expectation counts as an incorrect run, not a run error.
        correct = begin
          task.expect.call(isolated_binding)
        rescue StandardError
          false
        end

        RunResult.new(
          correct: correct,
          rounds: rounds,
          tokens: total_tokens,
          tool_path: tool_path,
          elapsed_ms: elapsed_ms,
          error: nil
        )
      rescue StandardError => e
        RunResult.new(
          correct: false,
          rounds: 0,
          tokens: 0,
          tool_path: [],
          elapsed_ms: 0,
          error: "#{e.class}: #{e.message}"
        )
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
# frozen_string_literal: true

module Claw
  module Benchmark
    # Scoring formula for benchmark runs.
    #
    # task_score = correctness * 100
    #              - (actual_rounds / max_rounds) * 20
    #              - (actual_tokens / max_tokens) * 20
    #              - path_penalty
    #
    # The result is clamped to a minimum of 0, so the effective range is
    # 0.0..100.0.
    module Scorer
      # Score a single run against its task definition.
      #
      # @param run [RunResult]
      # @param task [Task]
      # @return [Float] score, clamped to 0.0..100.0
      def self.score_run(run, task)
        correctness = run.correct ? 100.0 : 0.0

        # Efficiency penalties scale linearly with resource usage relative
        # to the task budget; a zero budget disables that penalty.
        rounds_penalty = task.max_rounds > 0 ? (run.rounds.to_f / task.max_rounds) * 20.0 : 0.0
        tokens_penalty = task.max_tokens > 0 ? (run.tokens.to_f / task.max_tokens) * 20.0 : 0.0

        path_pen = path_penalty(run.tool_path || [], task.ideal_path || [])

        [correctness - rounds_penalty - tokens_penalty - path_pen, 0.0].max
      end

      # Calculate path penalty using edit distance between actual and ideal
      # tool sequences.
      #
      # @param actual [Array<String>] actual tool call sequence
      # @param ideal [Array<String>] expected tool call sequence
      # @return [Float] penalty (0 if paths match, higher for more divergence)
      def self.path_penalty(actual, ideal)
        return 0.0 if ideal.empty?

        distance = levenshtein(actual, ideal)
        # Normalize: each edit costs 5 points, max penalty 20.
        [distance * 5.0, 20.0].min
      end

      # Levenshtein distance between two arrays of strings
      # (classic O(m*n) dynamic-programming table).
      def self.levenshtein(a, b)
        m = a.size
        n = b.size
        d = Array.new(m + 1) { Array.new(n + 1, 0) }

        (0..m).each { |i| d[i][0] = i }
        (0..n).each { |j| d[0][j] = j }

        (1..m).each do |i|
          (1..n).each do |j|
            cost = a[i - 1] == b[j - 1] ? 0 : 1
            d[i][j] = [
              d[i - 1][j] + 1,      # deletion
              d[i][j - 1] + 1,      # insertion
              d[i - 1][j - 1] + cost # substitution
            ].min
          end
        end
        d[m][n]
      end

      private_class_method :levenshtein
    end
  end
end
@@ -0,0 +1,63 @@
1
# frozen_string_literal: true

module Claw
  module Benchmark
    # A single benchmark task definition.
    Task = Struct.new(
      :id,          # unique task identifier (String)
      :layer,       # :mana, :claw, :runtime, :evolution
      :setup,       # Proc returning a Hash of variables to inject into binding
      :prompt,      # String prompt to send to the engine
      :expect,      # Proc(binding) -> boolean (correctness check)
      :max_rounds,  # maximum allowed LLM iterations
      :max_tokens,  # maximum allowed token usage
      :ideal_path,  # Array<String> of expected tool call sequence
      keyword_init: true
    )

    # Result of a single run (one of several per task).
    RunResult = Struct.new(
      :correct,     # boolean
      :rounds,      # actual LLM iterations
      :tokens,      # actual token usage (input + output)
      :tool_path,   # Array<String> actual tool call sequence
      :elapsed_ms,  # execution time in milliseconds
      :error,       # exception message if failed, nil otherwise
      keyword_init: true
    )

    # Aggregated result for one task across multiple runs.
    TaskResult = Struct.new(
      :task, # Task instance
      :runs, # Array<RunResult>
      keyword_init: true
    ) do
      # Fraction of runs that passed the correctness check (0.0..1.0).
      def pass_rate
        return 0.0 if runs.empty?
        runs.count(&:correct).to_f / runs.size
      end

      # Mean score across runs (see Scorer.score_run).
      # Guards against an empty run list, which would otherwise divide by
      # zero and yield NaN.
      def avg_score
        return 0.0 if runs.empty?
        scores = runs.map { |r| Scorer.score_run(r, task) }
        scores.sum / scores.size.to_f
      end
    end

    # Suite-level result across all tasks.
    SuiteResult = Struct.new(
      :results,   # Array<TaskResult>
      :timestamp, # Time
      keyword_init: true
    ) do
      # Mean of per-task average scores; 0.0 when there are no results.
      def suite_score
        return 0.0 if results.empty?
        results.sum(&:avg_score) / results.size.to_f
      end

      # Mean of per-task pass rates; 0.0 when there are no results.
      def pass_rate
        return 0.0 if results.empty?
        results.sum(&:pass_rate) / results.size.to_f
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must read a binding variable and persist its
# content to long-term memory through the remember tool.
task = Claw::Benchmark::Task.new(
  id: "claw_remember",
  layer: :claw,
  setup: lambda {
    { important_fact: "The project deadline is March 15th" }
  },
  prompt: "Read the `important_fact` variable and remember it using the remember tool.",
  expect: lambda { |b|
    memory = Claw.memory
    return false unless memory
    memory.long_term.any? { |m| m[:content].include?("March 15th") }
  },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "remember"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must combine two binding variables and write
# the sum back into one of them.
task = Claw::Benchmark::Task.new(
  id: "claw_session",
  layer: :claw,
  setup: lambda { { counter: 0, step: 5 } },
  prompt: "Read both `counter` and `step`, then set `counter` to `counter + step`.",
  expect: lambda { |b| b.local_variable_get(:counter) == 5 },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "read_var", "write_var"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must aggregate an array and store the result
# in a separate binding variable.
task = Claw::Benchmark::Task.new(
  id: "evolution_trace",
  layer: :evolution,
  setup: lambda { { items: [1, 2, 3], total: nil } },
  prompt: "Calculate the sum of all elements in `items` and store it in `total`.",
  expect: lambda { |b| b.local_variable_get(:total) == 6 },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "eval_code", "write_var"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,21 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must invoke an injected callable with a given
# argument and capture its return value.
task = Claw::Benchmark::Task.new(
  id: "mana_call_func",
  layer: :mana,
  setup: lambda {
    {
      greet: lambda { |name| "Hello, #{name}!" },
      result: nil
    }
  },
  prompt: "Call the `greet` function with the argument \"World\" and store the return value in `result`.",
  expect: lambda { |b| b.local_variable_get(:result) == "Hello, World!" },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "call_function", "write_var"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must sort an array in place (descending) via
# code evaluation.
task = Claw::Benchmark::Task.new(
  id: "mana_eval",
  layer: :mana,
  setup: lambda { { numbers: [3, 7, 2, 9, 1] } },
  prompt: "Sort the `numbers` array in descending order and store the result back in `numbers`.",
  expect: lambda { |b| b.local_variable_get(:numbers) == [9, 7, 3, 2, 1] },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "eval_code"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,19 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must consult the knowledge-lookup tool before
# writing an answer string into the binding.
task = Claw::Benchmark::Task.new(
  id: "mana_knowledge",
  layer: :mana,
  setup: lambda { { answer: nil } },
  prompt: "Use knowledge lookup to find what the Array#flatten method does, then set `answer` to the string \"recursive flatten\".",
  expect: lambda { |b|
    val = b.local_variable_get(:answer)
    val.is_a?(String) && val.downcase.include?("flatten")
  },
  max_rounds: 4,
  max_tokens: 3000,
  ideal_path: ["knowledge_lookup", "write_var"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must perform a sequenced read/write — update
# `x`, then derive `y` from the updated value.
task = Claw::Benchmark::Task.new(
  id: "mana_var_readwrite",
  layer: :mana,
  setup: lambda { { x: 10, y: 20 } },
  prompt: "Set the variable `x` to 42 and `y` to the current value of `x` plus 8.",
  expect: lambda { |b|
    b.local_variable_get(:x) == 42 && b.local_variable_get(:y) == 50
  },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "write_var", "read_var", "write_var"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must read a value, compute a derived value,
# and store it in a second variable.
task = Claw::Benchmark::Task.new(
  id: "runtime_fork",
  layer: :runtime,
  setup: lambda { { value: 100, doubled: nil } },
  prompt: "Read `value`, compute its double, and store the result in `doubled`.",
  expect: lambda { |b| b.local_variable_get(:doubled) == 200 },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "eval_code", "write_var"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

# Benchmark task: the agent must overwrite a variable while the runtime
# tracks the change.
task = Claw::Benchmark::Task.new(
  id: "runtime_snapshot",
  layer: :runtime,
  setup: lambda { { data: "original" } },
  prompt: "Read `data`, then change `data` to \"modified\". The runtime will track the change.",
  expect: lambda { |b| b.local_variable_get(:data) == "modified" },
  max_rounds: 3,
  max_tokens: 2000,
  ideal_path: ["read_var", "write_var"]
)

Claw::Benchmark::Tasks.register(task)
@@ -0,0 +1,68 @@
1
# frozen_string_literal: true

module Claw
  module Benchmark
    # Automatic evolution triggers based on benchmark results.
    # Event-driven: checks after each benchmark run or trace write.
    class Trigger
      # @param runtime [Object, nil] runtime that receives recorded events
      # @param claw_dir [String] path to .ruby-claw/
      def initialize(runtime:, claw_dir: ".ruby-claw")
        @runtime = runtime
        @claw_dir = claw_dir
        @mutex = Mutex.new
        @evolution_running = false
      end

      # Check after a benchmark run completes.
      # Triggers evolution if the suite score regressed.
      #
      # @param current_score [Float] latest suite score
      # @param previous_score [Float, nil] previous suite score
      # @return [void]
      def check_after_benchmark!(current_score, previous_score)
        return unless previous_score
        return if current_score >= previous_score

        trigger!(
          reason: "score_regression",
          detail: "#{previous_score.round(1)} → #{current_score.round(1)}"
        )
      end

      # Check after a trace is written.
      # Triggers evolution if the same task failed 3 consecutive times.
      #
      # @param task_id [String]
      # @param recent_results [Array<Boolean>] last N correctness results
      # @return [void]
      def check_after_trace!(task_id, recent_results)
        return if recent_results.size < 3
        return unless recent_results.last(3).none?

        trigger!(
          reason: "consecutive_failures",
          detail: "#{task_id} failed 3 times in a row"
        )
      end

      private

      # Run one evolution cycle, at most one at a time.
      #
      # The busy flag is tested AND set inside a single synchronized block.
      # The original code checked @evolution_running in one critical section
      # and set it in another, a check-then-act race that let two concurrent
      # triggers both start an evolution.
      def trigger!(reason:, detail:)
        acquired = @mutex.synchronize do
          if @evolution_running
            false
          else
            @evolution_running = true
          end
        end
        return unless acquired

        @runtime&.record_event(
          action: "evolution_triggered",
          target: reason,
          detail: detail
        )

        begin
          evo = Claw::Evolution.new(runtime: @runtime, claw_dir: @claw_dir)
          evo.evolve
        ensure
          # Always release the busy flag, even if evolve raises.
          @mutex.synchronize { @evolution_running = false }
        end
      end
    end
  end
end