ruby-claw 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +94 -0
- data/README.md +214 -10
- data/exe/claw +42 -1
- data/lib/claw/auto_forge.rb +66 -0
- data/lib/claw/benchmark/benchmark.rb +79 -0
- data/lib/claw/benchmark/diff.rb +69 -0
- data/lib/claw/benchmark/report.rb +87 -0
- data/lib/claw/benchmark/runner.rb +91 -0
- data/lib/claw/benchmark/scorer.rb +69 -0
- data/lib/claw/benchmark/task.rb +63 -0
- data/lib/claw/benchmark/tasks/claw_remember.rb +20 -0
- data/lib/claw/benchmark/tasks/claw_session.rb +18 -0
- data/lib/claw/benchmark/tasks/evolution_trace.rb +18 -0
- data/lib/claw/benchmark/tasks/mana_call_func.rb +21 -0
- data/lib/claw/benchmark/tasks/mana_eval.rb +18 -0
- data/lib/claw/benchmark/tasks/mana_knowledge.rb +19 -0
- data/lib/claw/benchmark/tasks/mana_var_readwrite.rb +18 -0
- data/lib/claw/benchmark/tasks/runtime_fork.rb +18 -0
- data/lib/claw/benchmark/tasks/runtime_snapshot.rb +18 -0
- data/lib/claw/benchmark/trigger.rb +68 -0
- data/lib/claw/chat.rb +119 -6
- data/lib/claw/child_runtime.rb +196 -0
- data/lib/claw/cli.rb +177 -0
- data/lib/claw/commands.rb +131 -0
- data/lib/claw/config.rb +5 -1
- data/lib/claw/console/event_logger.rb +69 -0
- data/lib/claw/console/public/app.js +264 -0
- data/lib/claw/console/public/style.css +330 -0
- data/lib/claw/console/server.rb +253 -0
- data/lib/claw/console/sse.rb +28 -0
- data/lib/claw/console/views/experiments.erb +8 -0
- data/lib/claw/console/views/index.erb +27 -0
- data/lib/claw/console/views/layout.erb +29 -0
- data/lib/claw/console/views/memory.erb +13 -0
- data/lib/claw/console/views/monitor.erb +15 -0
- data/lib/claw/console/views/prompt.erb +15 -0
- data/lib/claw/console/views/snapshots.erb +12 -0
- data/lib/claw/console/views/tools.erb +13 -0
- data/lib/claw/console/views/traces.erb +9 -0
- data/lib/claw/console.rb +5 -0
- data/lib/claw/evolution.rb +227 -0
- data/lib/claw/forge.rb +144 -0
- data/lib/claw/hub.rb +67 -0
- data/lib/claw/init.rb +199 -0
- data/lib/claw/knowledge.rb +36 -2
- data/lib/claw/memory_store.rb +2 -2
- data/lib/claw/plan_mode.rb +110 -0
- data/lib/claw/resource.rb +35 -0
- data/lib/claw/resources/binding_resource.rb +128 -0
- data/lib/claw/resources/context_resource.rb +73 -0
- data/lib/claw/resources/filesystem_resource.rb +107 -0
- data/lib/claw/resources/memory_resource.rb +74 -0
- data/lib/claw/resources/worktree_resource.rb +133 -0
- data/lib/claw/roles.rb +56 -0
- data/lib/claw/runtime.rb +189 -0
- data/lib/claw/serializer.rb +10 -7
- data/lib/claw/tool.rb +99 -0
- data/lib/claw/tool_index.rb +84 -0
- data/lib/claw/tool_registry.rb +100 -0
- data/lib/claw/trace.rb +86 -0
- data/lib/claw/tui/agent_executor.rb +92 -0
- data/lib/claw/tui/chat_panel.rb +81 -0
- data/lib/claw/tui/command_bar.rb +22 -0
- data/lib/claw/tui/file_card.rb +88 -0
- data/lib/claw/tui/folding.rb +80 -0
- data/lib/claw/tui/input_handler.rb +73 -0
- data/lib/claw/tui/layout.rb +34 -0
- data/lib/claw/tui/messages.rb +31 -0
- data/lib/claw/tui/model.rb +411 -0
- data/lib/claw/tui/object_explorer.rb +136 -0
- data/lib/claw/tui/status_bar.rb +30 -0
- data/lib/claw/tui/status_panel.rb +133 -0
- data/lib/claw/tui/styles.rb +58 -0
- data/lib/claw/tui/tui.rb +54 -0
- data/lib/claw/version.rb +1 -1
- data/lib/claw.rb +99 -1
- metadata +223 -7
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "fileutils"

module Claw
  module Benchmark
    # Generate Markdown benchmark reports.
    module Report
      # Build the complete Markdown report for a benchmark suite.
      #
      # @param suite [SuiteResult]
      # @return [String] Markdown report
      def self.generate(suite)
        out = ["# Benchmark Report\n"]
        out << "**Date:** #{suite.timestamp.strftime('%Y-%m-%d %H:%M:%S')}"
        out << ""
        summary_section(out, suite)
        layer_section(out, suite)
        task_section(out, suite)
        out.join("\n")
      end

      # Persist a report under <claw_dir>/benchmarks/ with a timestamped name.
      #
      # @param report_text [String] Markdown content
      # @param claw_dir [String] path to .ruby-claw/
      # @return [String] path of the written file
      def self.save(report_text, claw_dir = ".ruby-claw")
        dir = File.join(claw_dir, "benchmarks")
        FileUtils.mkdir_p(dir)
        path = File.join(dir, "#{Time.now.strftime('%Y-%m-%d_%H%M%S')}.md")
        File.write(path, report_text)
        path
      end

      # Append the "## Summary" table: task totals, suite score, pass rates.
      def self.summary_section(out, suite)
        task_count = suite.results.size
        all_pass = suite.results.count { |r| r.pass_rate == 1.0 }
        out << "## Summary" << ""
        out << "| Metric | Value |"
        out << "|--------|-------|"
        out << "| Total tasks | #{task_count} |"
        out << "| Suite score | #{suite.suite_score.round(1)} |"
        out << "| Pass rate | #{(suite.pass_rate * 100).round(1)}% |"
        out << "| All-pass tasks | #{all_pass}/#{task_count} |"
        out << ""
      end

      # Append the "## By Layer" table: per-layer pass rate and average score.
      def self.layer_section(out, suite)
        out << "## By Layer" << ""
        out << "| Layer | Tasks | Pass Rate | Avg Score |"
        out << "|-------|-------|-----------|-----------|"
        suite.results.group_by { |r| r.task.layer }.each do |layer, group|
          rate = group.sum(&:pass_rate) / group.size * 100
          score = group.sum(&:avg_score) / group.size
          out << "| #{layer} | #{group.size} | #{rate.round(1)}% | #{score.round(1)} |"
        end
        out << ""
      end

      # Append "## Task Details": one "### <id>" block per task with its
      # aggregate score, a per-run table, and any run error messages.
      def self.task_section(out, suite)
        out << "## Task Details" << ""
        suite.results.each do |task_result|
          out << "### #{task_result.task.id} (#{task_result.task.layer})" << ""
          out << "- **Score:** #{task_result.avg_score.round(1)}"
          out << "- **Pass rate:** #{(task_result.pass_rate * 100).round(0)}%"
          out << ""
          out << "| Run | Correct | Rounds | Tokens | Time (ms) | Path |"
          out << "|-----|---------|--------|--------|-----------|------|"
          task_result.runs.each_with_index do |run, idx|
            mark = run.correct ? "✓" : "✗"
            path = (run.tool_path || []).join(" → ")
            out << "| #{idx + 1} | #{mark} | #{run.rounds} | #{run.tokens} | #{run.elapsed_ms} | #{path} |"
          end
          if task_result.runs.any? { |r| r.error }
            out << ""
            task_result.runs.each_with_index do |run, idx|
              out << "- Run #{idx + 1} error: #{run.error}" if run.error
            end
          end
          out << ""
        end
      end

      private_class_method :summary_section, :layer_section, :task_section
    end
  end
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # Executes benchmark tasks. Each task runs 3 times with a clean environment.
    class Runner
      # Fixed number of repetitions per task; results are aggregated per task.
      RUNS_PER_TASK = 3

      # Run the entire benchmark suite.
      #
      # @param tasks [Array<Task>] tasks to run
      # @param on_progress [Proc, nil] called with (task_id, run_index, total, completed)
      #   after each individual run finishes
      # @return [SuiteResult]
      def run_all(tasks, &on_progress)
        total = tasks.size * RUNS_PER_TASK
        completed = 0

        results = tasks.map do |task|
          runs = RUNS_PER_TASK.times.map do |i|
            result = run_once(task)
            completed += 1
            on_progress&.call(task.id, i + 1, total, completed)
            result
          end
          TaskResult.new(task: task, runs: runs)
        end

        SuiteResult.new(results: results, timestamp: Time.now)
      end

      # Execute a single task run with a clean environment.
      #
      # Any exception raised anywhere in the run (setup, engine execution,
      # result collection) is caught by the method-level rescue and converted
      # into a zeroed RunResult carrying the error message.
      #
      # @param task [Task]
      # @return [RunResult]
      def run_once(task)
        # Create isolated binding — a fresh anonymous object's binding, so
        # tasks cannot see each other's (or the runner's) local variables.
        isolated_binding = Object.new.instance_eval { binding }

        # Setup: inject variables
        vars = task.setup.call
        vars.each { |k, v| isolated_binding.local_variable_set(k, v) }

        # Create minimal runtime
        # NOTE(review): `runtime` is not referenced again after setup —
        # presumably registering the binding resource and taking the snapshot
        # wire up state the engine reads globally; confirm against Runtime.
        runtime = Claw::Runtime.new
        runtime.register("binding", Claw::Resources::BindingResource.new(isolated_binding))
        runtime.snapshot!(label: "bench_start")

        # Execute (monotonic clock so wall-clock adjustments don't skew timing)
        t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        engine = Mana::Engine.new(isolated_binding)
        engine.execute(task.prompt)
        elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0) * 1000).round

        # Collect results from the engine's trace. Usage keys are looked up as
        # both symbols and strings since the trace origin is not visible here.
        trace = engine.trace_data || {}
        steps = trace[:steps] || []
        tool_path = steps.flat_map { |s| (s[:tool_calls] || []).map { |tc| tc[:name] } }
        total_tokens = steps.sum { |s|
          u = s[:usage] || {}
          (u[:input_tokens] || u["input_tokens"] || 0).to_i +
          (u[:output_tokens] || u["output_tokens"] || 0).to_i
        }
        rounds = trace[:total_iterations] || steps.size

        # Correctness check: a raising expectation counts as incorrect rather
        # than aborting the run.
        correct = begin
          task.expect.call(isolated_binding)
        rescue => e
          false
        end

        RunResult.new(
          correct: correct,
          rounds: rounds,
          tokens: total_tokens,
          tool_path: tool_path,
          elapsed_ms: elapsed_ms,
          error: nil
        )
      rescue => e
        # Whole-run failure: report a zeroed result with the error attached.
        RunResult.new(
          correct: false,
          rounds: 0,
          tokens: 0,
          tool_path: [],
          elapsed_ms: 0,
          error: "#{e.class}: #{e.message}"
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # Scoring formula for benchmark runs.
    #
    # task_score = correctness * 100
    #              - (actual_rounds / max_rounds) * 20
    #              - (actual_tokens / max_tokens) * 20
    #              - path_penalty
    # The result is clamped to a minimum of 0.
    module Scorer
      # Score a single run against its task definition.
      #
      # @param run [RunResult]
      # @param task [Task]
      # @return [Float] score in 0.0..100.0 (penalties cannot push it below zero)
      def self.score_run(run, task)
        correctness = run.correct ? 100.0 : 0.0

        # Budget penalties scale linearly with usage: a run that exactly hits
        # its budget loses 20 points per dimension. Zero/absent budgets are
        # treated as "unconstrained" to avoid division by zero.
        rounds_penalty = task.max_rounds > 0 ? (run.rounds.to_f / task.max_rounds) * 20.0 : 0.0
        tokens_penalty = task.max_tokens > 0 ? (run.tokens.to_f / task.max_tokens) * 20.0 : 0.0
        path_pen = path_penalty(run.tool_path || [], task.ideal_path || [])

        # Clamp at zero: heavy penalties never produce a negative score.
        [correctness - rounds_penalty - tokens_penalty - path_pen, 0.0].max
      end

      # Calculate path penalty using edit distance between actual and ideal tool sequences.
      #
      # @param actual [Array<String>] actual tool call sequence
      # @param ideal [Array<String>] expected tool call sequence
      # @return [Float] penalty (0 if paths match, higher for more divergence)
      def self.path_penalty(actual, ideal)
        # An empty ideal path means the route is unconstrained.
        return 0.0 if ideal.empty?

        distance = levenshtein(actual, ideal)
        # Normalize: each edit costs 5 points, max penalty 20.
        [distance * 5.0, 20.0].min
      end

      # Levenshtein distance between two arrays of strings
      # (classic dynamic-programming table).
      def self.levenshtein(a, b)
        m = a.size
        n = b.size
        d = Array.new(m + 1) { Array.new(n + 1, 0) }

        (0..m).each { |i| d[i][0] = i }
        (0..n).each { |j| d[0][j] = j }

        (1..m).each do |i|
          (1..n).each do |j|
            cost = a[i - 1] == b[j - 1] ? 0 : 1
            d[i][j] = [
              d[i - 1][j] + 1,       # deletion
              d[i][j - 1] + 1,       # insertion
              d[i - 1][j - 1] + cost # substitution (free on match)
            ].min
          end
        end
        d[m][n]
      end

      private_class_method :levenshtein
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # A single benchmark task definition.
    Task = Struct.new(
      :id,          # unique task identifier (String)
      :layer,       # :mana, :claw, :runtime, :evolution
      :setup,       # Proc -> Hash of variables to inject into binding
      :prompt,      # String prompt to send to the engine
      :expect,      # Proc(binding) -> boolean (correctness check)
      :max_rounds,  # maximum allowed LLM iterations
      :max_tokens,  # maximum allowed token usage
      :ideal_path,  # Array<String> of expected tool call sequence
      keyword_init: true
    )

    # Result of a single run (one of 3 per task).
    RunResult = Struct.new(
      :correct,     # boolean
      :rounds,      # actual LLM iterations
      :tokens,      # actual token usage (input + output)
      :tool_path,   # Array<String> actual tool call sequence
      :elapsed_ms,  # execution time in milliseconds
      :error,       # exception message if failed, nil otherwise
      keyword_init: true
    )

    # Aggregated result for one task across multiple runs.
    TaskResult = Struct.new(
      :task, # Task instance
      :runs, # Array<RunResult>
      keyword_init: true
    ) do
      # Fraction of runs whose correctness check passed (0.0..1.0).
      def pass_rate
        return 0.0 if runs.empty?
        runs.count(&:correct).to_f / runs.size
      end

      # Mean Scorer score over all runs.
      # Guard against empty runs: 0 / 0.0 would otherwise yield Float::NAN.
      def avg_score
        return 0.0 if runs.empty?
        scores = runs.map { |r| Scorer.score_run(r, task) }
        scores.sum / scores.size.to_f
      end
    end

    # Suite-level result across all tasks.
    SuiteResult = Struct.new(
      :results,   # Array<TaskResult>
      :timestamp, # Time
      keyword_init: true
    ) do
      # Mean of the per-task average scores.
      def suite_score
        return 0.0 if results.empty?
        results.sum(&:avg_score) / results.size.to_f
      end

      # Mean of the per-task pass rates (0.0..1.0).
      def pass_rate
        return 0.0 if results.empty?
        results.sum(&:pass_rate) / results.size.to_f
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (claw layer): read a binding variable and persist it to
# long-term memory via the `remember` tool.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "claw_remember",
    layer: :claw,
    # Inject the fact the agent is asked to memorize.
    setup: -> {
      { important_fact: "The project deadline is March 15th" }
    },
    prompt: "Read the `important_fact` variable and remember it using the remember tool.",
    # Pass when any long-term memory entry contains the deadline string.
    # NOTE(review): reads the process-global Claw.memory rather than the run's
    # binding — assumes memory state is shared with (and reset between)
    # benchmark runs; verify isolation against the Runner.
    expect: ->(b) {
      memory = Claw.memory
      return false unless memory
      memory.long_term.any? { |m| m[:content].include?("March 15th") }
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var remember]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (claw layer): combine two variables within one session —
# read `counter` and `step`, then write back their sum.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "claw_session",
    layer: :claw,
    setup: -> {
      { counter: 0, step: 5 }
    },
    prompt: "Read both `counter` and `step`, then set `counter` to `counter + step`.",
    # 0 + 5 => counter must end up 5.
    expect: ->(b) {
      b.local_variable_get(:counter) == 5
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var read_var write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (evolution layer): sum an array and store the result.
# NOTE(review): despite the :evolution layer tag, the check only inspects the
# binding — any evolution/trace behavior is exercised implicitly; confirm intent.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "evolution_trace",
    layer: :evolution,
    setup: -> {
      { items: [1, 2, 3], total: nil }
    },
    prompt: "Calculate the sum of all elements in `items` and store it in `total`.",
    # 1 + 2 + 3 => 6
    expect: ->(b) {
      b.local_variable_get(:total) == 6
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var eval_code write_var]
  )
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): invoke a lambda stored in the binding and
# capture its return value in another variable.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_call_func",
    layer: :mana,
    setup: -> {
      {
        greet: ->(name) { "Hello, #{name}!" },
        result: nil
      }
    },
    prompt: "Call the `greet` function with the argument \"World\" and store the return value in `result`.",
    # greet.call("World") => "Hello, World!"
    expect: ->(b) {
      b.local_variable_get(:result) == "Hello, World!"
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var call_function write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): evaluate code against a binding variable —
# sort an array descending, overwriting the original.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_eval",
    layer: :mana,
    setup: -> {
      { numbers: [3, 7, 2, 9, 1] }
    },
    prompt: "Sort the `numbers` array in descending order and store the result back in `numbers`.",
    expect: ->(b) {
      b.local_variable_get(:numbers) == [9, 7, 3, 2, 1]
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var eval_code]
  )
)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): use the knowledge-lookup tool, then record an
# answer in the binding.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_knowledge",
    layer: :mana,
    setup: -> {
      { answer: nil }
    },
    prompt: "Use knowledge lookup to find what the Array#flatten method does, then set `answer` to the string \"recursive flatten\".",
    # Deliberately looser than the prompt: any String mentioning "flatten"
    # passes, not only the exact "recursive flatten".
    expect: ->(b) {
      val = b.local_variable_get(:answer)
      val.is_a?(String) && val.downcase.include?("flatten")
    },
    max_rounds: 4,
    max_tokens: 3000,
    ideal_path: %w[knowledge_lookup write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (mana layer): sequential variable writes where the second
# depends on the first (`y` must use the NEW value of `x`).
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "mana_var_readwrite",
    layer: :mana,
    setup: -> {
      { x: 10, y: 20 }
    },
    prompt: "Set the variable `x` to 42 and `y` to the current value of `x` plus 8.",
    # x => 42, then y => 42 + 8 = 50 (updated x, not the original 10).
    expect: ->(b) {
      b.local_variable_get(:x) == 42 && b.local_variable_get(:y) == 50
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var write_var read_var write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (runtime layer): read / compute / write round-trip.
# NOTE(review): the id suggests runtime forking, but the check only verifies
# the doubled value — fork behavior itself is not asserted here; confirm coverage.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "runtime_fork",
    layer: :runtime,
    setup: -> {
      { value: 100, doubled: nil }
    },
    prompt: "Read `value`, compute its double, and store the result in `doubled`.",
    # 100 * 2 => 200
    expect: ->(b) {
      b.local_variable_get(:doubled) == 200
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var eval_code write_var]
  )
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Benchmark task (runtime layer): mutate a tracked variable.
# NOTE(review): the prompt promises "the runtime will track the change", but
# only the final variable value is asserted — snapshot/diff tracking itself
# is not checked here; confirm that is covered elsewhere.
Claw::Benchmark::Tasks.register(
  Claw::Benchmark::Task.new(
    id: "runtime_snapshot",
    layer: :runtime,
    setup: -> {
      { data: "original" }
    },
    prompt: "Read `data`, then change `data` to \"modified\". The runtime will track the change.",
    expect: ->(b) {
      b.local_variable_get(:data) == "modified"
    },
    max_rounds: 3,
    max_tokens: 2000,
    ideal_path: %w[read_var write_var]
  )
)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Claw
  module Benchmark
    # Automatic evolution triggers based on benchmark results.
    # Event-driven: checks after each benchmark run or trace write.
    class Trigger
      # @param runtime [Object, nil] runtime used for event logging (may be nil)
      # @param claw_dir [String] path to the .ruby-claw directory
      def initialize(runtime:, claw_dir: ".ruby-claw")
        @runtime = runtime
        @claw_dir = claw_dir
        @mutex = Mutex.new
        @evolution_running = false
      end

      # Check after a benchmark run completes.
      # Triggers evolution if the suite score regressed.
      #
      # @param current_score [Float] latest suite score
      # @param previous_score [Float, nil] previous suite score (nil on first run)
      def check_after_benchmark!(current_score, previous_score)
        return unless previous_score
        return if current_score >= previous_score

        trigger!(
          reason: "score_regression",
          detail: "#{previous_score.round(1)} → #{current_score.round(1)}"
        )
      end

      # Check after a trace is written.
      # Triggers evolution if the same task failed 3 consecutive times.
      #
      # @param task_id [String]
      # @param recent_results [Array<Boolean>] last N correctness results, oldest first
      def check_after_trace!(task_id, recent_results)
        return if recent_results.size < 3
        return unless recent_results.last(3).none?

        trigger!(
          reason: "consecutive_failures",
          detail: "#{task_id} failed 3 times in a row"
        )
      end

      private

      # Run one evolution cycle, guarded so only one runs at a time.
      #
      # Fix: the running flag is now atomically tested-and-set in a single
      # synchronize block. The previous check-then-act (read the flag in one
      # critical section, set it in a later one) let two threads both observe
      # `false` and start evolution concurrently.
      def trigger!(reason:, detail:)
        already_running = @mutex.synchronize do
          was_running = @evolution_running
          @evolution_running = true
          was_running
        end
        return if already_running

        @runtime&.record_event(
          action: "evolution_triggered",
          target: reason,
          detail: detail
        )

        begin
          evo = Claw::Evolution.new(runtime: @runtime, claw_dir: @claw_dir)
          evo.evolve
        ensure
          # Always clear the flag, even if evolution raises.
          @mutex.synchronize { @evolution_running = false }
        end
      end
    end
  end
end
|