RubyGems - ruby-skill-bench - Versions diffs - 1.1.0 → 1.2.0 - Mend

ruby-skill-bench 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)

checksums.yaml +4 -4
data/README.md +166 -35
data/docs/architecture.md +3 -1
data/docs/first-eval-guide.md +7 -7
data/docs/testing-guide.md +1 -1
data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
data/lib/skill_bench/agent/react_agent/step.rb +7 -1
data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
data/lib/skill_bench/cli/help_printer.rb +10 -2
data/lib/skill_bench/cli/init_command.rb +2 -1
data/lib/skill_bench/cli/result_printer.rb +1 -1
data/lib/skill_bench/cli/run_command.rb +47 -9
data/lib/skill_bench/cli/validate_command.rb +242 -0
data/lib/skill_bench/cli.rb +3 -0
data/lib/skill_bench/client.rb +43 -1
data/lib/skill_bench/clients/all.rb +2 -0
data/lib/skill_bench/clients/base_client.rb +12 -1
data/lib/skill_bench/clients/base_url_validator.rb +105 -0
data/lib/skill_bench/clients/provider_config.rb +34 -1
data/lib/skill_bench/clients/provider_schemas.rb +4 -0
data/lib/skill_bench/clients/providers/mistral.rb +47 -0
data/lib/skill_bench/commands/init.rb +5 -0
data/lib/skill_bench/commands/skill_new.rb +3 -1
data/lib/skill_bench/config/applier.rb +2 -0
data/lib/skill_bench/config/defaults.rb +2 -0
data/lib/skill_bench/config/facade_readers.rb +7 -0
data/lib/skill_bench/config/facade_writers.rb +17 -0
data/lib/skill_bench/config/json_loader.rb +1 -1
data/lib/skill_bench/config/store.rb +29 -0
data/lib/skill_bench/config.rb +18 -0
data/lib/skill_bench/evaluation/runner.rb +20 -3
data/lib/skill_bench/execution/context_hydrator.rb +52 -11
data/lib/skill_bench/execution/sandbox.rb +58 -11
data/lib/skill_bench/judge/judge.rb +4 -0
data/lib/skill_bench/judge/prompt.rb +42 -6
data/lib/skill_bench/models/config.rb +32 -0
data/lib/skill_bench/output_formatter.rb +60 -1
data/lib/skill_bench/package_verifier.rb +1 -1
data/lib/skill_bench/rails/skill_templates.rb +19 -5
data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
data/lib/skill_bench/services/batch_runner_service.rb +111 -0
data/lib/skill_bench/services/compare_option_parser.rb +1 -0
data/lib/skill_bench/services/cost_calculator.rb +91 -0
data/lib/skill_bench/services/html_formatter.rb +289 -0
data/lib/skill_bench/services/json_formatter.rb +19 -1
data/lib/skill_bench/services/junit_formatter.rb +74 -24
data/lib/skill_bench/services/provider_resolver.rb +5 -2
data/lib/skill_bench/services/response_cache.rb +130 -0
data/lib/skill_bench/services/runner_service.rb +88 -4
data/lib/skill_bench/services/summary_formatter.rb +90 -0
data/lib/skill_bench/services/template_registry.rb +43 -9
data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
data/lib/skill_bench/tools/registry.rb +29 -3
data/lib/skill_bench/tools/run_command.rb +171 -19
data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
data/lib/skill_bench/trend_tracker.rb +5 -5
data/lib/skill_bench/version.rb +1 -1
data/lib/skill_bench.rb +2 -3
metadata +17 -36

data/lib/skill_bench/models/config.rb CHANGED Viewed

@@ -24,6 +24,30 @@ module SkillBench
         new(raw_data)
       end
+      # Loads the config at +path+, reusing an already-parsed instance when possible.
+      #
+      # Parsed configs are cached per absolute path together with the file's
+      # mtime. A call whose absolute path and current mtime match the cached
+      # entry returns the cached instance without parsing again; any other call
+      # parses the file through {.load} and replaces the entry, so a rewritten
+      # file is picked up on the next call. The cache lives in the class-level
+      # @loaded ivar; setting that ivar to nil clears it.
+      #
+      # @param path [String] Path to config file (default: skill-bench.json)
+      # @return [SkillBench::Models::Config] Cached or freshly loaded config instance
+      # @raise [Errno::ENOENT] if config file not found
+      def self.loaded(path = 'skill-bench.json')
+        absolute_path = File.expand_path(path)
+        current_mtime = File.mtime(absolute_path)
+        store = (@loaded ||= {})
+        cached = store[absolute_path]
+        unless cached && cached[:mtime] == current_mtime
+          # Cache miss or stale entry: parse first, record only on success.
+          cached = { mtime: current_mtime, config: load(path) }
+          store[absolute_path] = cached
+        end
+        cached[:config]
+      end
       # Returns the configured provider name
       # @return [String, nil] Provider name
       def provider_name
@@ -36,6 +60,14 @@ module SkillBench
         @data[:config] || {}
       end
+      # Reports whether the configured provider is the built-in mock provider.
+      #
+      # Compares {#provider_name} against the literal 'mock', so a config with
+      # no provider name answers false rather than being treated as mock.
+      #
+      # @return [Boolean] true when the configured provider is 'mock'
+      def mock?
+        selected_provider = provider_name
+        selected_provider == 'mock'
+      end
       # Returns max execution time
       # @return [Integer] Max execution time in seconds
       def max_execution_time

data/lib/skill_bench/output_formatter.rb CHANGED Viewed

@@ -5,6 +5,7 @@ require_relative 'services/delta_table_formatter'
 require_relative 'services/feedback_generator'
 require_relative 'services/json_formatter'
 require_relative 'services/junit_formatter'
+require_relative 'services/html_formatter'
 module SkillBench
   # Handles formatting output for different use cases (human, CI, etc.).
@@ -14,7 +15,7 @@ module SkillBench
     # Format the eval result for output.
     #
     # @param result [Hash] Eval result with keys like :eval_name, :pass, :score, etc.
-    # @param format [Symbol] Output format (:human, :json, :junit)
+    # @param format [Symbol] Output format (:human, :json, :junit, :html)
     # @return [String] Formatted output string
     def self.format(result, format: :human)
       case format
@@ -22,6 +23,8 @@ module SkillBench
         Services::JsonFormatter.format(result)
       when :junit
         Services::JUnitFormatter.format(result)
+      when :html
+        Services::HtmlFormatter.format(result)
       else
         format_human(result)
       end
@@ -39,6 +42,48 @@ module SkillBench
       report&.verdict ? 0 : 1
     end
+    # Renders an aggregate batch result as human-readable text.
+    #
+    # The output is one verdict line per entry in :results, followed by an
+    # empty line and the summary line, all joined with newlines.
+    #
+    # @param aggregate [Hash] Aggregate envelope with :results and :summary.
+    # @return [String] Human-readable batch summary.
+    def self.format_batch(aggregate)
+      verdict_lines = aggregate[:results].map { |result| batch_result_line(result) }
+      [*verdict_lines, '', batch_summary_line(aggregate[:summary])].join("\n")
+    end
+    # Maps an aggregate batch result to a process exit code.
+    #
+    # Reads the :failed count under :summary; a missing count is treated as zero.
+    #
+    # @param aggregate [Hash] Aggregate envelope with a :summary.
+    # @return [Integer] 0 when every eval passed, 1 when any failed.
+    def self.batch_exit_code(aggregate)
+      failed_count = aggregate.dig(:summary, :failed).to_i
+      return 1 if failed_count.positive?
+      0
+    end
+    # Renders the verdict line for one eval in a batch.
+    #
+    # The verdict is PASS when {.exit_code} is zero for the result and FAIL
+    # otherwise. When the result carries an error message under
+    # response -> error -> message, it is appended after an em dash.
+    #
+    # @param result [Hash] A single-eval result envelope.
+    # @return [String] A formatted verdict line.
+    def self.batch_result_line(result)
+      verdict = exit_code(result).zero? ? 'PASS' : 'FAIL'
+      headline = "#{verdict}  #{result[:eval_name]}"
+      message = result.dig(:response, :error, :message)
+      return headline unless message
+      "#{headline} — #{message}"
+    end
+    private_class_method :batch_result_line
+    # Renders the closing totals line of a batch report.
+    #
+    # @param summary [Hash] Summary with :passed, :failed and :total counts.
+    # @return [String] A formatted summary line.
+    def self.batch_summary_line(summary)
+      passed, failed, total = summary.values_at(:passed, :failed, :total)
+      "Summary: #{passed} passed / #{failed} failed (#{total} total)"
+    end
+    private_class_method :batch_summary_line
     # Format result as human-readable text.
     #
     # @param result [Hash] Eval result in old or new format.
@@ -93,6 +138,7 @@ module SkillBench
         "  Eval: #{result[:eval_name] || ''}",
         "  Skill: #{result[:skill_name] || ''}",
         "  Provider: #{result[:provider_name] || ''}",
+        build_usage_line(result),
         ('═' * 55),
         ''
       ]
@@ -110,6 +156,19 @@ module SkillBench
     end
     private_class_method :format_delta_report
+    # Renders the token and estimated-cost line shown in the report header.
+    #
+    # The token total is read from :tokens under either the symbol or the
+    # string key 'total_tokens' and defaults to 0. The cost is printed with
+    # four decimals, or as an em dash when :cost is absent.
+    #
+    # @param result [Hash] Eval result envelope; reads :tokens and :cost.
+    # @return [String] A formatted "Tokens / Est. Cost" line.
+    def self.build_usage_line(result)
+      usage = result[:tokens] || {}
+      token_total = usage[:total_tokens] || usage['total_tokens'] || 0
+      estimate = result[:cost]
+      cost_text =
+        if estimate
+          # Kernel.format is spelled out because this class defines its own .format.
+          Kernel.format('$%.4f', estimate)
+        else
+          '—'
+        end
+      "  Tokens: #{token_total}  |  Est. Cost: #{cost_text}"
+    end
+    private_class_method :build_usage_line
     # Builds iteration timeline lines from the result response.
     #
     # @param result [Hash] Eval result envelope.

data/lib/skill_bench/package_verifier.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module SkillBench
       lib/skill_bench/config/json_loader.rb
       lib/skill_bench/config/store.rb
       lib/skill_bench/package_verifier.rb
-      lib/skill_bench/source_path_resolver.rb
+      lib/skill_bench/execution/source_path_resolver.rb
       lib/skill_bench/runner.rb
     ].freeze

data/lib/skill_bench/rails/skill_templates.rb CHANGED Viewed

@@ -1,16 +1,30 @@
 # frozen_string_literal: true
-require 'active_support/inflector'
 module SkillBench
   module Rails
     # Generates Rails-specific skill templates
     class SkillTemplates
+      # Turns a snake_case or kebab-case name into CamelCase.
+      #
+      # Stands in for ActiveSupport's +String#camelize+ for the scaffold names
+      # handled here. The name is split on +_+ and +-+, empty segments are
+      # dropped, and only the first character of each remaining segment is
+      # upcased, so a segment that is already CamelCase keeps its casing.
+      #
+      # @example
+      #   SkillTemplates.camelize('user_creator') # => "UserCreator"
+      #   SkillTemplates.camelize('order-service') # => "OrderService"
+      #   SkillTemplates.camelize('UserCreator')   # => "UserCreator"
+      # @param name [String] snake_case, kebab-case, or already-CamelCase name
+      # @return [String] CamelCase name
+      def self.camelize(name)
+        segments = name.split(/[-_]/).reject(&:empty?)
+        segments.map { |segment| "#{segment[0].upcase}#{segment[1..]}" }.join
+      end
       # Generate a service object template
       # @param name [String] Service name (e.g., 'my_service' or 'my-service')
       # @return [String] Service object Ruby class
       def self.service_object(name)
-        class_name = name.split(/[-_]/).map(&:capitalize).join
+        class_name = camelize(name)
         <<~RUBY
           # frozen_string_literal: true
@@ -43,7 +57,7 @@ module SkillBench
       # @param name [String] Concern name (e.g., 'my_concern')
       # @return [String] Concern module
       def self.concern(name)
-        module_name = name.camelize
+        module_name = camelize(name)
         <<~RUBY
           # frozen_string_literal: true
@@ -67,7 +81,7 @@ module SkillBench
       # @param name [String] Model name (e.g., 'my_model')
       # @return [String] ActiveRecord model class
       def self.active_record_model(name)
-        class_name = name.camelize
+        class_name = camelize(name)
         <<~RUBY
           # frozen_string_literal: true

data/lib/skill_bench/services/agent_spawner_service.rb CHANGED Viewed

@@ -7,6 +7,9 @@ module SkillBench
   module Services
     # Spawns and executes LLM agents for evaluation.
     class AgentSpawnerService
+      # Zeroed token usage used when a run produces no usage data (e.g. mock, rescue).
+      EMPTY_USAGE = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }.freeze
       # Spawns the LLM agent with the given system prompt.
       #
       # @param evaluation [SkillBench::Models::Eval] The eval being run
@@ -33,7 +36,7 @@ module SkillBench
       #
       # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
       def call
-        return { result: 'mock result', status: :success, iterations: [] } if @provider.name == 'mock'
+        return { result: 'mock result', status: :success, iterations: [], usage: EMPTY_USAGE } if @provider.name == 'mock'
         client_params = build_client_params
         max_iterations = @config&.[](:max_iterations) || @config&.[]('max_iterations') || 25
@@ -63,6 +66,7 @@ module SkillBench
           final_answer = agent_result.dig(:response, :content) || ''
           diff = Execution::Sandbox.capture_diff(sandbox.path)
           iterations = agent_result.dig(:response, :iterations) || []
+          usage = agent_result.dig(:response, :usage) || EMPTY_USAGE
           output = [final_answer, diff].reject(&:empty?).join("\n\n")
@@ -70,7 +74,7 @@ module SkillBench
             result: output,
             status: status,
             runtime: @provider.runtime,
-            usage: {},
+            usage: usage,
             raw_response: agent_result,
             iterations: iterations
           }
@@ -80,7 +84,7 @@ module SkillBench
           result: "Error: #{e.message}",
           status: :error,
           runtime: @provider.runtime,
-          usage: {},
+          usage: EMPTY_USAGE,
           raw_response: { error: e.message, backtrace: e.backtrace },
           iterations: []
         }

data/lib/skill_bench/services/batch_runner_service.rb ADDED Viewed

@@ -0,0 +1,111 @@
+# frozen_string_literal: true
+require 'pathname'
+require 'parallel'
+require_relative 'runner_service'
+require_relative '../output_formatter'
+require_relative '../runner'
+module SkillBench
+  module Services
+    # Runs every eval found under a directory as a single batch.
+    #
+    # Eval directories are discovered with
+    # {SkillBench::Runner.discover_task_dirs} and each one is executed through
+    # {RunnerService}; the deprecated {SkillBench::Task::Evaluator} is never
+    # involved. The outcome is an aggregate Hash holding the per-eval results
+    # and a pass/fail summary.
+    class BatchRunnerService
+      # Directory scanned for evals when the caller does not supply one.
+      DEFAULT_EVALS_DIR = 'evals'
+      # Batch-level thread count used when the caller does not supply one.
+      #
+      # Kept modest because each {RunnerService.call} already runs its
+      # baseline and context agents concurrently (#26), so total thread usage
+      # is batch threads x per-eval threads.
+      DEFAULT_THREADS = 2
+      # Builds a service instance from the keyword arguments and runs it.
+      #
+      # @param skill_names [Array<String>] Names of the skills to apply to every eval
+      # @param evals_dir [String] Directory to scan for evals
+      # @param pack [String, nil] Optional pack name for registry-based skill resolution
+      # @param registry_manifest [String, nil] Optional path to registry.json manifest
+      # @param threads [Integer] Batch-level thread count
+      # @return [Hash] Aggregate envelope with :results and :summary
+      # @raise [ArgumentError] when no evals are found under +evals_dir+
+      def self.call(skill_names:, evals_dir: DEFAULT_EVALS_DIR, pack: nil, registry_manifest: nil, threads: DEFAULT_THREADS)
+        service = new(skill_names: skill_names, evals_dir: evals_dir, pack: pack,
+                      registry_manifest: registry_manifest, threads: threads)
+        service.call
+      end
+      # Stores the batch settings; no discovery or execution happens here.
+      #
+      # @param skill_names [Array<String>] Names of the skills
+      # @param evals_dir [String] Directory to scan for evals
+      # @param pack [String, nil] Optional pack name
+      # @param registry_manifest [String, nil] Optional registry.json path
+      # @param threads [Integer] Batch-level thread count
+      def initialize(skill_names:, evals_dir:, pack:, registry_manifest:, threads:)
+        @skill_names, @evals_dir = skill_names, evals_dir
+        @pack, @registry_manifest = pack, registry_manifest
+        @threads = threads
+      end
+      # Discovers the eval directories, runs each one and tallies the outcome.
+      #
+      # @return [Hash] Aggregate envelope with :results and :summary
+      # @raise [ArgumentError] when no evals are found under the directory
+      def call
+        discovered = discover_eval_dirs
+        raise ArgumentError, "No evals found under #{evals_dir}" if discovered.empty?
+        outcomes = run_all(discovered)
+        { results: outcomes, summary: summarize(outcomes) }
+      end
+      private
+      attr_reader :skill_names, :evals_dir, :pack, :registry_manifest, :threads
+      # Asks {SkillBench::Runner} for the eval directories under the configured root.
+      #
+      # @return [Array<Pathname>] Directories that contain a task.md
+      def discover_eval_dirs
+        root = Pathname.new(evals_dir)
+        SkillBench::Runner.discover_task_dirs(root)
+      end
+      # Executes {RunnerService} once per eval directory on a thread pool.
+      #
+      # @param eval_dirs [Array<Pathname>] Discovered eval directories
+      # @return [Array<Hash>] Per-eval RunnerService results
+      def run_all(eval_dirs)
+        # Options that are identical for every eval in the batch.
+        shared_options = { skill_names: skill_names, pack: pack, registry_manifest: registry_manifest }
+        Parallel.map(eval_dirs, in_threads: threads) do |eval_dir|
+          RunnerService.call(eval_name: eval_dir.to_s, **shared_options)
+        end
+      end
+      # Counts passing and failing results using the single-eval exit-code rule.
+      #
+      # @param results [Array<Hash>] Per-eval results
+      # @return [Hash] Summary with :total, :passed and :failed counts
+      def summarize(results)
+        passing, failing = results.partition { |result| SkillBench::OutputFormatter.exit_code(result).zero? }
+        { total: results.size, passed: passing.size, failed: failing.size }
+      end
+    end
+  end
+end

data/lib/skill_bench/services/compare_option_parser.rb CHANGED Viewed

@@ -44,6 +44,7 @@ module SkillBench
           opts.on('--variant-b SPEC', 'Second variant (e.g., "pack:hanami" or "/path/to/skill")') { |v| options[:variant_b] = v }
           opts.on('--eval PATH', 'Path to the eval directory') { |v| options[:eval] = v }
           opts.on('--format FORMAT', 'Output format (human, json)') { |v| options[:format] = v.to_sym }
+          opts.on('--cache', 'Enable content-addressed response caching') { ENV['SKILL_BENCH_CACHE'] = '1' }
           opts.on('-h', '--help', 'Prints this help') do
             puts opts
             raise SkillBench::HelpRequested

data/lib/skill_bench/services/cost_calculator.rb ADDED Viewed

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+module SkillBench
+  module Services
+    # Turns token usage and a model name into an estimated USD cost.
+    #
+    # The figures in {PRICES} are approximate, taken from the public
+    # OpenAI/Anthropic pricing pages, and quoted in USD per 1,000 tokens.
+    # Provider pricing drifts, so the result is a rough estimate; add entries
+    # to {PRICES} as new models are needed.
+    class CostCalculator
+      # Approximate per-model prices in USD per 1,000 tokens.
+      # Keys are canonical model-name prefixes. Lookup picks the longest
+      # matching prefix, so dated variants (e.g. "claude-sonnet-4-20250514")
+      # resolve to their base entry.
+      # Source: public OpenAI and Anthropic pricing pages (approximate).
+      PRICES = {
+        'gpt-4o-mini' => { input: 0.00015, output: 0.0006 },
+        'gpt-4o' => { input: 0.005, output: 0.015 },
+        'gpt-4-turbo' => { input: 0.01, output: 0.03 },
+        'gpt-4' => { input: 0.03, output: 0.06 },
+        'gpt-3.5-turbo' => { input: 0.0005, output: 0.0015 },
+        'claude-opus-4' => { input: 0.015, output: 0.075 },
+        'claude-sonnet-4' => { input: 0.003, output: 0.015 },
+        'claude-3-5-sonnet' => { input: 0.003, output: 0.015 },
+        'claude-3-5-haiku' => { input: 0.0008, output: 0.004 },
+        'claude-3-opus' => { input: 0.015, output: 0.075 },
+        'claude-3-sonnet' => { input: 0.003, output: 0.015 },
+        'claude-3-haiku' => { input: 0.00025, output: 0.00125 }
+      }.freeze
+      # Number of tokens covered by one priced unit in {PRICES}.
+      TOKENS_PER_UNIT = 1000.0
+      # Convenience entry point: builds a calculator and runs it.
+      #
+      # @param usage [Hash, nil] Token usage with :prompt_tokens and :completion_tokens.
+      # @param model [String, nil] The model name (e.g. "gpt-4o").
+      # @return [Float, nil] Estimated cost in USD, or nil when the model is unknown.
+      def self.call(usage:, model:)
+        calculator = new(usage, model)
+        calculator.call
+      end
+      # @param usage [Hash, nil] Token usage hash; nil is treated as empty usage.
+      # @param model [String, nil] The model name.
+      def initialize(usage, model)
+        @model = model
+        @usage = usage || {}
+      end
+      # Prices the prompt and completion tokens and sums them.
+      #
+      # @return [Float, nil] Estimated cost in USD rounded to six decimals,
+      #   or nil when the model is unknown.
+      def call
+        rates = price_for(@model)
+        return unless rates
+        prompt_cost = units(:prompt_tokens) * rates[:input]
+        completion_cost = units(:completion_tokens) * rates[:output]
+        (prompt_cost + completion_cost).round(6)
+      end
+      private
+      # Looks up the price entry for a model, case-insensitively.
+      #
+      # An exact key match wins; otherwise the longest {PRICES} key that the
+      # model name starts with is used.
+      #
+      # @param model [String, nil] The model name.
+      # @return [Hash, nil] Price entry with :input and :output, or nil when unknown.
+      def price_for(model)
+        normalized = model.to_s.downcase
+        PRICES.fetch(normalized) do
+          longest_prefix = PRICES.keys.select { |name| normalized.start_with?(name) }.max_by(&:length)
+          longest_prefix && PRICES[longest_prefix]
+        end
+      end
+      # Expresses a usage token count in priced 1K-token units.
+      #
+      # @param key [Symbol] The usage key to read.
+      # @return [Float] The number of priced units.
+      def units(key)
+        token_count(key) / TOKENS_PER_UNIT
+      end
+      # Reads one token count from the usage hash under the symbol key or,
+      # failing that, its string form.
+      #
+      # @param key [Symbol] The usage key (e.g. :prompt_tokens).
+      # @return [Integer] The token count, or zero when absent.
+      def token_count(key)
+        raw = @usage[key] || @usage[key.to_s] || 0
+        raw.to_i
+      end
+    end
+  end
+end