RubyGems - ruby-skill-bench - Versions diffs - 1.0.1 → 1.2.0 - Mend

ruby-skill-bench 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

checksums.yaml +4 -4
data/README.md +299 -23
data/docs/architecture.md +3 -1
data/docs/first-eval-guide.md +7 -7
data/docs/testing-guide.md +1 -1
data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
data/lib/skill_bench/agent/react_agent/step.rb +7 -1
data/lib/skill_bench/agent/react_agent.rb +2 -1
data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
data/lib/skill_bench/cli/help_printer.rb +10 -2
data/lib/skill_bench/cli/init_command.rb +2 -1
data/lib/skill_bench/cli/result_printer.rb +1 -1
data/lib/skill_bench/cli/run_command.rb +47 -9
data/lib/skill_bench/cli/validate_command.rb +242 -0
data/lib/skill_bench/cli.rb +3 -0
data/lib/skill_bench/client.rb +43 -1
data/lib/skill_bench/clients/all.rb +3 -0
data/lib/skill_bench/clients/base_client.rb +14 -6
data/lib/skill_bench/clients/base_url_validator.rb +105 -0
data/lib/skill_bench/clients/provider_config.rb +34 -1
data/lib/skill_bench/clients/provider_schemas.rb +4 -0
data/lib/skill_bench/clients/providers/mistral.rb +47 -0
data/lib/skill_bench/clients/request_builder.rb +2 -4
data/lib/skill_bench/clients/response_builder.rb +91 -0
data/lib/skill_bench/clients/response_error_handler.rb +5 -17
data/lib/skill_bench/clients/retry_handler.rb +4 -7
data/lib/skill_bench/commands/init.rb +5 -0
data/lib/skill_bench/commands/skill_new.rb +3 -1
data/lib/skill_bench/config/applier.rb +2 -0
data/lib/skill_bench/config/defaults.rb +2 -0
data/lib/skill_bench/config/facade_readers.rb +7 -0
data/lib/skill_bench/config/facade_writers.rb +17 -0
data/lib/skill_bench/config/json_loader.rb +1 -1
data/lib/skill_bench/config/store.rb +29 -0
data/lib/skill_bench/config.rb +18 -0
data/lib/skill_bench/constants.rb +58 -0
data/lib/skill_bench/evaluation/runner.rb +20 -3
data/lib/skill_bench/execution/context_hydrator.rb +66 -15
data/lib/skill_bench/execution/sandbox.rb +76 -14
data/lib/skill_bench/judge/judge.rb +4 -0
data/lib/skill_bench/judge/prompt.rb +42 -6
data/lib/skill_bench/models/config.rb +32 -0
data/lib/skill_bench/output_formatter.rb +60 -1
data/lib/skill_bench/package_verifier.rb +1 -1
data/lib/skill_bench/rails/skill_templates.rb +19 -5
data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
data/lib/skill_bench/services/batch_runner_service.rb +111 -0
data/lib/skill_bench/services/compare_option_parser.rb +1 -0
data/lib/skill_bench/services/cost_calculator.rb +91 -0
data/lib/skill_bench/services/html_formatter.rb +289 -0
data/lib/skill_bench/services/json_formatter.rb +19 -1
data/lib/skill_bench/services/junit_formatter.rb +74 -24
data/lib/skill_bench/services/provider_resolver.rb +5 -2
data/lib/skill_bench/services/response_cache.rb +130 -0
data/lib/skill_bench/services/runner_service.rb +88 -4
data/lib/skill_bench/services/summary_formatter.rb +90 -0
data/lib/skill_bench/services/template_registry.rb +43 -9
data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
data/lib/skill_bench/tools/registry.rb +29 -3
data/lib/skill_bench/tools/run_command.rb +172 -35
data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
data/lib/skill_bench/trend_tracker.rb +5 -5
data/lib/skill_bench/version.rb +1 -1
data/lib/skill_bench.rb +3 -3
metadata +19 -36

data/lib/skill_bench/agent/react_agent/step.rb CHANGED

@@ -12,7 +12,8 @@ module SkillBench
         #
         # @param messages [Array<Hash>] The conversation history.
         # @param config [Hash] Configuration for this step (client params, system prompt, working dir).
-        # @return [Hash] Step outcome containing :continue (boolean), :result (hash, if finished), and :messages.
+        # @return [Hash] Step outcome containing :continue (boolean), :result (hash, if finished),
+        #   :usage (token usage for this step), and :messages.
         def self.call(messages, config)
           messages = messages.dup
           client_result = Client.call(
@@ -21,12 +22,14 @@ module SkillBench
             tools: Tools.definitions,
             **config[:client_params]
           )
+          usage = client_result[:usage] || {}
           unless client_result[:success]
             error_msg = client_result.dig(:response, :error, :message) || 'Unknown error'
             return {
               continue: false,
               result: client_result,
+              usage: usage,
               iteration: build_iteration(thought: '', tools_used: [], observation_summary: error_msg)
             }
           end
@@ -36,6 +39,7 @@ module SkillBench
             return {
               continue: false,
               result: { success: false, response: { error: { message: 'Empty response from LLM' } } },
+              usage: usage,
               iteration: build_iteration(thought: '', tools_used: [], observation_summary: 'Empty response from LLM')
             }
           end
@@ -51,6 +55,7 @@ module SkillBench
             return {
               continue: false,
               result: { success: true, response: { content: content } },
+              usage: usage,
               iteration: build_iteration(thought: thought, tools_used: [], observation_summary: '')
             }
           end
@@ -69,6 +74,7 @@ module SkillBench
           {
             continue: true,
             messages: messages,
+            usage: usage,
             iteration: build_iteration(thought: thought, tools_used: tools_used, observation_summary: observation_summary)
           }
         end

data/lib/skill_bench/agent/react_agent.rb CHANGED

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
+require_relative '../constants'
 require_relative 'react_agent/step'
 require_relative 'react_agent/loop_runner'
@@ -29,7 +30,7 @@ module SkillBench
       def initialize(params)
         @system_prompt = params[:system_prompt]
         @initial_prompt = params[:initial_prompt]
-        @max_iterations = params[:max_iterations] || 25
+        @max_iterations = params[:max_iterations] || Constants::ReactAgent::DEFAULT_MAX_ITERATIONS
         @working_dir = params[:working_dir] || Dir.pwd
         @container_id = params[:container_id]
         @client_params = params[:client_params] || {}

data/lib/skill_bench/cli/batch_result_printer.rb ADDED

@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+require_relative '../output_formatter'
+require_relative '../services/summary_formatter'
+module SkillBench
+  module Cli
+    # Prints the aggregate result of a batch `skill-bench run --all` command.
+    #
+    # Defaults to the human-readable batch summary, but can instead emit a
+    # JUnit document (`format: :junit`) or a JSON gate (`summary: true`). The
+    # returned exit code is always {OutputFormatter.batch_exit_code}, so CI
+    # gating works identically across every output mode.
+    class BatchResultPrinter
+      # Prints the aggregate summary and returns the appropriate exit code.
+      #
+      # @param aggregate [Hash] Aggregate envelope from BatchRunnerService.
+      # @param format [Symbol, nil] Output format (:junit for JUnit XML, else human).
+      # @param summary [Boolean] When true, print the JSON summary gate instead.
+      # @return [Integer] Exit code (0 when all pass, 1 when any fails).
+      def self.call(aggregate, format: nil, summary: false)
+        puts batch_output(aggregate, format: format, summary: summary)
+        OutputFormatter.batch_exit_code(aggregate)
+      end
+      # Selects the rendered batch output for the requested mode.
+      #
+      # `:junit` and `:json` produce machine-readable batch output; `:json` maps
+      # to the same JSON gate as `summary: true`. `:html` (and any other format)
+      # falls back to the human batch summary, since there is no batch HTML report.
+      #
+      # @param aggregate [Hash] Aggregate envelope from BatchRunnerService.
+      # @param format [Symbol, nil] Output format (:junit, :json, else human).
+      # @param summary [Boolean] When true, render the JSON summary gate.
+      # @return [String] The formatted batch output.
+      def self.batch_output(aggregate, format:, summary:)
+        return Services::SummaryFormatter.format(aggregate) if summary || format == :json
+        return Services::JUnitFormatter.format_batch(aggregate) if format == :junit
+        OutputFormatter.format_batch(aggregate)
+      end
+      private_class_method :batch_output
+    end
+  end
+end

data/lib/skill_bench/cli/eval/eval_options.rb CHANGED

@@ -9,6 +9,7 @@ module SkillBench
       class BaseEvalOptions
         attr_reader :options, :parser
+        # Initializes the option set and the OptionParser used to parse the command's arguments.
         def initialize
           @options = default_options
           @parser = create_parser
@@ -39,10 +40,12 @@ module SkillBench
       class NewEvalOptions < BaseEvalOptions
         protected
+        # @return [Hash] default options for the `eval new` command, with the runtime defaulting to "ruby"
         def default_options
           { runtime: 'ruby' }
         end
+        # @return [OptionParser] parser for the `eval new` command, handling --runtime and --help
         def create_parser
           OptionParser.new do |opts|
             opts.banner = 'Usage: skill-bench eval new <name> [options]'
@@ -59,6 +62,7 @@ module SkillBench
       class GenerateEvalOptions < BaseEvalOptions
         protected
+        # @return [OptionParser] parser for the `eval generate` command, handling --name and --help
         def create_parser
           OptionParser.new do |opts|
             opts.banner = 'Usage: skill-bench eval generate <skill-name> [options]'

data/lib/skill_bench/cli/help_printer.rb CHANGED

@@ -20,11 +20,14 @@ module SkillBench
               --force    Overwrite existing config file
             run <eval> --skill <name> [--skill <name>] [--format FORMAT] [--pack NAME]
-              Run an evaluation
+              Run an evaluation (single eval, or a whole directory with --all)
               --skill    Skill to use (can be specified multiple times)
               --pack     Pack context for registry-based skill resolution
               --registry-manifest PATH  Path to registry.json manifest
-              --format   Output format: human, json, junit (default: human)
+              --format   Output format: human, json, junit, html (default: human)
+              --all      Run every eval under evals/ (batch mode)
+              --evals-dir DIR  Run every eval under DIR (batch mode)
+              --summary  Emit a JSON summary gate for a batch run (batch mode)
             compare <skill-name> --variant-a SPEC --variant-b SPEC --eval PATH
               Compare the same skill across two pack variants
@@ -45,6 +48,11 @@ module SkillBench
               Auto-generate an eval from a skill
               --name     Name for the generated eval (optional)
+            validate (alias: doctor) [--criteria PATH] [--config PATH]
+              Run read-only pre-flight checks (no eval, no network)
+              --criteria  Criteria JSON to validate (default: criteria.json)
+              --config    Config file to validate (default: skill-bench.json)
           Global Options:
             -h, --help        Show this help message
         USAGE

data/lib/skill_bench/cli/init_command.rb CHANGED

@@ -45,6 +45,7 @@ module SkillBench
         OptionParser.new do |opts|
           opts.banner = 'Usage: skill-bench init --<provider> [options]'
           register_provider_options(opts, options)
+          opts.on('--mock', 'Generate offline mock config (no API key required)') { options[:provider] = :mock }
           opts.on('--force', 'Overwrite existing config file') { options[:force] = true }
           opts.on('-h', '--help', 'Prints this help') do
             puts opts
@@ -60,7 +61,7 @@ module SkillBench
       end
       def error_missing_provider
-        providers = SkillBench::Clients::ProviderSchemas.names.map { |provider_name| "--#{provider_name}" }.join(', ')
+        providers = (SkillBench::Clients::ProviderSchemas.names.map { |provider_name| "--#{provider_name}" } + ['--mock']).join(', ')
         warn "Error: provider is required. Use one of: #{providers}"
         1
       end

data/lib/skill_bench/cli/result_printer.rb CHANGED

@@ -9,7 +9,7 @@ module SkillBench
       # Prints the result and returns the appropriate exit code.
       #
       # @param result [Hash] Result from ScoringService
-      # @param format [Symbol] Output format (:human, :json, :junit)
+      # @param format [Symbol] Output format (:human, :json, :junit, :html)
       # @return [Integer] Exit code (0 for pass, 1 for fail)
       def self.call(result, format: :human)
         puts OutputFormatter.format(result, format: format)

data/lib/skill_bench/cli/run_command.rb CHANGED

@@ -19,7 +19,7 @@ module SkillBench
         @argv = argv
       end
-      # Parses options and runs the eval.
+      # Parses options and runs the eval(s).
       #
       # @return [Integer] Exit code
       def call
@@ -27,14 +27,9 @@ module SkillBench
         parser = build_parser(options)
         parser.parse!(@argv)
-        eval_name = @argv.shift
-        return error_missing_eval unless eval_name
-        return error_missing_skill if options[:skill_names].empty? && !options[:pack]
+        return run_batch(options) if batch_requested?(options)
-        options[:eval_name] = eval_name
-        exec_options = options.reject { |key| key == :format }
-        result = Commands::Run.run(**exec_options)
-        ResultPrinter.call(result, format: options[:format] || :human)
+        run_single(options)
       rescue HelpRequested
         0
       rescue StandardError => e
@@ -44,13 +39,56 @@ module SkillBench
       private
+      # Whether a whole-directory batch run was requested.
+      #
+      # @param options [Hash] Parsed options
+      # @return [Boolean] true when --all or --evals-dir was given
+      def batch_requested?(options)
+        options[:all] || options[:evals_dir]
+      end
+      # Runs a single eval (the original `run <eval> --skill ...` path).
+      #
+      # @param options [Hash] Parsed options
+      # @return [Integer] Exit code
+      def run_single(options)
+        eval_name = @argv.shift
+        return error_missing_eval unless eval_name
+        return error_missing_skill if options[:skill_names].empty? && !options[:pack]
+        options[:eval_name] = eval_name
+        exec_options = options.reject { |key| %i[format summary all evals_dir].include?(key) }
+        result = Commands::Run.run(**exec_options)
+        ResultPrinter.call(result, format: options[:format] || :human)
+      end
+      # Runs every eval under the target directory and prints an aggregate.
+      #
+      # @param options [Hash] Parsed options
+      # @return [Integer] Exit code
+      def run_batch(options)
+        return error_missing_skill if options[:skill_names].empty? && !options[:pack]
+        aggregate = Services::BatchRunnerService.call(
+          evals_dir: options[:evals_dir] || Services::BatchRunnerService::DEFAULT_EVALS_DIR,
+          skill_names: options[:skill_names],
+          pack: options[:pack],
+          registry_manifest: options[:registry_manifest]
+        )
+        BatchResultPrinter.call(aggregate, format: options[:format], summary: options[:summary])
+      end
       def build_parser(options)
         OptionParser.new do |opts|
           opts.banner = 'Usage: skill-bench run <eval> [options]'
           opts.on('--skill NAME', 'Skill to use (can be specified multiple times)') { |v| options[:skill_names] << v }
           opts.on('--pack NAME', 'Pack context for skill resolution') { |v| options[:pack] = v }
           opts.on('--registry-manifest PATH', 'Path to registry.json manifest') { |v| options[:registry_manifest] = v }
-          opts.on('--format FORMAT', 'Output format (human, json, junit)') { |v| options[:format] = v.to_sym }
+          opts.on('--format FORMAT', 'Output format (human, json, junit, html)') { |v| options[:format] = v.to_sym }
+          opts.on('--all', 'Run every eval under the default evals/ directory') { options[:all] = true }
+          opts.on('--evals-dir DIR', 'Run every eval under DIR') { |v| options[:evals_dir] = v }
+          opts.on('--summary', 'Emit a JSON summary gate for a batch run') { options[:summary] = true }
+          opts.on('--cache', 'Enable content-addressed response caching') { ENV['SKILL_BENCH_CACHE'] = '1' }
           opts.on('-h', '--help', 'Prints this help') do
             puts opts
             raise SkillBench::HelpRequested

data/lib/skill_bench/cli/validate_command.rb ADDED

@@ -0,0 +1,242 @@
+# frozen_string_literal: true
+require 'json'
+require 'optparse'
+module SkillBench
+  module Cli
+    # Handles the `skill-bench validate` / `doctor` subcommand.
+    #
+    # Runs read-only pre-flight checks and prints a PASS/FAIL report:
+    #   1. Criteria JSON structure (via {Models::CriteriaValidator}).
+    #   2. skill-bench.json shape (hand-rolled, lightweight schema check).
+    #   3. Provider credentials for the configured non-mock provider.
+    #
+    # It never runs an eval and never makes a network call.
+    class ValidateCommand
+      # Default criteria file validated when --criteria is not given.
+      DEFAULT_CRITERIA = 'criteria.json'
+      # @param argv [Array<String>] Raw CLI arguments
+      # @return [Integer] Exit code
+      def self.call(argv)
+        new(argv).call
+      end
+      # @param argv [Array<String>] Raw CLI arguments
+      def initialize(argv)
+        @argv = argv
+      end
+      # Parses options, runs the pre-flight checks, and prints the report.
+      #
+      # @return [Integer] Exit code (0 when all checks pass, 1 otherwise)
+      def call
+        options = parse_options
+        config_path = options[:config] || SkillBench::Config::CONFIG_FILENAME
+        config_data = load_config_data(config_path)
+        results = [
+          check_criteria(options),
+          check_config(config_path, config_data),
+          check_provider_key(config_data)
+        ]
+        print_report(results)
+        results.any? { |result| result[:status] == :fail } ? 1 : 0
+      rescue HelpRequested
+        0
+      rescue StandardError => e
+        warn "Error: #{e.message}"
+        1
+      end
+      private
+      def parse_options
+        options = {}
+        build_parser(options).parse!(@argv)
+        options
+      end
+      def build_parser(options)
+        OptionParser.new do |opts|
+          opts.banner = 'Usage: skill-bench validate [options]'
+          opts.on('--criteria PATH', 'Criteria JSON file to validate (default: criteria.json)') { |v| options[:criteria] = v }
+          opts.on('--config PATH', 'Config file to validate (default: skill-bench.json)') { |v| options[:config] = v }
+          opts.on('-h', '--help', 'Prints this help') do
+            puts opts
+            raise SkillBench::HelpRequested
+          end
+        end
+      end
+      # --- Check (a): criteria ------------------------------------------------
+      def check_criteria(options)
+        path = options[:criteria] || DEFAULT_CRITERIA
+        unless File.exist?(path)
+          return fail_result('criteria', "criteria file not found: #{path}") if options[:criteria]
+          return skip_result('criteria', "no #{DEFAULT_CRITERIA} found (skipped)")
+        end
+        result = Models::CriteriaValidator.call(path:)
+        return pass_result('criteria', "#{path} is valid") if result[:success]
+        fail_result('criteria', "#{path}: #{criteria_error(result)}")
+      end
+      def criteria_error(result)
+        result.dig(:response, :error, :message) || 'invalid criteria'
+      end
+      # --- Check (b): config shape -------------------------------------------
+      def check_config(path, config_data)
+        case config_data[:status]
+        when :missing
+          fail_result('config', "#{path} not found")
+        when :invalid_json
+          fail_result('config', "#{path} is not valid JSON: #{config_data[:message]}")
+        else
+          validate_config_shape(path, config_data[:data])
+        end
+      end
+      def validate_config_shape(path, data)
+        return fail_result('config', "#{path} must contain a JSON object") unless data.is_a?(Hash)
+        errors = config_shape_errors(data)
+        return fail_result('config', errors.join('; ')) if errors.any?
+        pass_result('config', "#{path} matches the expected shape")
+      end
+      def config_shape_errors(data)
+        errors = provider_errors(data[:provider])
+        errors.concat(max_execution_time_errors(data[:max_execution_time]))
+        errors << "'config' must be an object" if data.key?(:config) && !data[:config].is_a?(Hash)
+        errors
+      end
+      def provider_errors(provider)
+        return ["'provider' is required"] if provider.nil?
+        return ["'provider' must be a string"] unless provider.is_a?(String)
+        allowed = Models::Provider::ALLOWED_PROVIDERS
+        return [] if allowed.include?(provider)
+        ["'provider' '#{provider}' is not one of: #{allowed.join(', ')}"]
+      end
+      def max_execution_time_errors(value)
+        return [] if value.nil?
+        return [] if value.is_a?(Integer) && value.positive?
+        ["'max_execution_time' must be a positive integer"]
+      end
+      # --- Check (c): provider key -------------------------------------------
+      def check_provider_key(config_data)
+        return skip_result('provider key', 'skipped (no usable config)') unless config_data[:status] == :ok
+        provider = config_provider(config_data[:data])
+        return skip_result('provider key', 'skipped (provider invalid)') unless provider
+        return pass_result('provider key', 'mock provider requires no API key') if provider == 'mock'
+        missing = missing_provider_keys(provider, config_data[:data][:config])
+        return pass_result('provider key', "#{provider} credentials present") if missing.empty?
+        fail_result('provider key', "#{provider} is missing: #{missing.join(', ')}")
+      rescue StandardError => e
+        # Building the client can raise on unrelated config (e.g. base_url
+        # validation); surface that as a structured FAIL rather than crashing.
+        fail_result('provider key', "#{provider} config is invalid: #{e.message}")
+      end
+      def config_provider(data)
+        return nil unless data.is_a?(Hash)
+        provider = data[:provider]
+        return nil unless provider.is_a?(String) && Models::Provider::ALLOWED_PROVIDERS.include?(provider)
+        provider
+      end
+      def missing_provider_keys(provider, provider_config)
+        provider_sym = provider.to_sym
+        options = provider_client_options(provider_sym, provider_config)
+        client = Clients::ProviderRegistry.for(provider_sym).new(options)
+        return [] unless client.respond_to?(:missing_config_keys, true)
+        client.send(:missing_config_keys)
+      end
+      def provider_client_options(provider_sym, provider_config)
+        options = provider_config.is_a?(Hash) ? provider_config.dup : {}
+        Models::Provider::ENV_OVERRIDABLE_SETTINGS.each do |setting|
+          value = env_setting(provider_sym, setting)
+          options[setting] = value unless value.nil?
+        end
+        options
+      end
+      def env_setting(provider_sym, setting)
+        provider = provider_sym.to_s.upcase
+        name = setting.to_s.upcase
+        ["SKILL_BENCH_#{provider}_#{name}", "#{provider}_#{name}"].each do |var|
+          value = ENV.fetch(var, nil)
+          return value if value && !value.empty?
+        end
+        nil
+      end
+      # --- Config loading ----------------------------------------------------
+      def load_config_data(path)
+        return { status: :missing } unless File.exist?(path)
+        { status: :ok, data: JSON.parse(File.read(path), symbolize_names: true) }
+      rescue JSON::ParserError => e
+        { status: :invalid_json, message: e.message }
+      end
+      # --- Reporting ---------------------------------------------------------
+      def print_report(results)
+        puts 'skill-bench validate'
+        puts
+        results.each { |result| puts format_result(result) }
+        puts
+        puts summary_line(results)
+      end
+      def format_result(result)
+        "[#{label(result[:status])}] #{result[:name].ljust(13)} #{result[:message]}"
+      end
+      def label(status)
+        { pass: 'PASS', fail: 'FAIL', skip: 'SKIP' }.fetch(status)
+      end
+      def summary_line(results)
+        failed = results.count { |result| result[:status] == :fail }
+        return "#{failed} check(s) failed." if failed.positive?
+        'All checks passed.'
+      end
+      def pass_result(name, message)
+        { name:, status: :pass, message: }
+      end
+      def fail_result(name, message)
+        { name:, status: :fail, message: }
+      end
+      def skip_result(name, message)
+        { name:, status: :skip, message: }
+      end
+    end
+  end
+end

data/lib/skill_bench/cli.rb CHANGED

@@ -5,8 +5,10 @@ require_relative 'cli/run_command'
 require_relative 'cli/compare_command'
 require_relative 'cli/skill_command'
 require_relative 'cli/eval_command'
+require_relative 'cli/validate_command'
 require_relative 'cli/help_printer'
 require_relative 'cli/result_printer'
+require_relative 'cli/batch_result_printer'
 module SkillBench
   # Raised when -h/--help is passed to abort OptionParser and return exit code 0.
@@ -42,6 +44,7 @@ module SkillBench
       when 'compare' then Cli::CompareCommand.call(@argv)
       when 'skill'   then Cli::SkillCommand.call(@argv)
       when 'eval'    then Cli::EvalCommand.call(@argv)
+      when 'validate', 'doctor' then Cli::ValidateCommand.call(@argv)
       when '-h', '--help', 'help'
         help.call
       else

data/lib/skill_bench/client.rb CHANGED

@@ -1,13 +1,27 @@
 # frozen_string_literal: true
 require_relative 'clients/all'
+require_relative 'services/response_cache'
 module SkillBench
   # Facade for calling LLM clients.
   # Delegates to the configured provider.
   class Client
+    # Provider clients that must never be cached: their results either signal a
+    # configuration error (NullClient) or are cheap, deterministic test doubles
+    # (Mock). Caching them would provide no benefit and could mask errors.
+    UNCACHEABLE_CLIENTS = [
+      Clients::Providers::NullClient,
+      Clients::Providers::Mock
+    ].freeze
     # Calls the configured LLM provider with the given parameters.
     #
+    # When response caching is enabled (see {Services::ResponseCache.enabled?})
+    # and the resolved provider is cacheable, identical requests reuse a cached
+    # response instead of calling the provider again. When caching is disabled
+    # (the default), the provider is always invoked, leaving behavior unchanged.
+    #
     # @param system_prompt [String] System prompt for the LLM
     # @param messages [Array<Hash>] Conversation messages
     # @param provider [Symbol, nil] Override the configured LLM provider (e.g., :deepseek, :openai)
@@ -17,7 +31,35 @@ module SkillBench
       resolved = provider || Config.current_llm_provider || :openai
       client_class = Clients::ProviderRegistry.for(resolved)
       warn "WARNING: LLM provider '#{resolved}' is not configured. Falling back to null client." if client_class == Clients::Providers::NullClient
-      client_class.call(system_prompt: system_prompt, messages: messages, **options)
+      invoke = -> { client_class.call(system_prompt: system_prompt, messages: messages, **options) }
+      return invoke.call unless cache_eligible?(client_class)
+      cache_key = Services::ResponseCache.key(
+        provider: resolved,
+        model: options[:model],
+        system_prompt: system_prompt,
+        messages: messages,
+        tools: options[:tools],
+        temperature: options[:temperature],
+        provider_config: options.slice(:base_url, :request_path, :endpoint, :location, :project_id, :api_version)
+      )
+      Services::ResponseCache.fetch(cache_key, &invoke)
+    end
+    # Whether a resolved provider client may be served from the cache.
+    #
+    # Requires caching to be enabled and the client to not be one of the
+    # {UNCACHEABLE_CLIENTS} (null/mock), so disabling the cache restores the
+    # original, uncached behavior exactly.
+    #
+    # @param client_class [Class] The resolved provider client class
+    # @return [Boolean] true when the call should go through the cache
+    def self.cache_eligible?(client_class)
+      return false unless Services::ResponseCache.enabled?
+      !UNCACHEABLE_CLIENTS.include?(client_class)
     end
+    private_class_method :cache_eligible?
   end
 end

data/lib/skill_bench/clients/all.rb CHANGED

@@ -2,8 +2,10 @@
 require_relative 'response_parser'
 require_relative 'response_error_handler'
+require_relative 'response_builder'
 require_relative 'request_builder'
 require_relative 'retry_handler'
+require_relative 'base_url_validator'
 require_relative 'base_client'
 require_relative 'provider_config'
 require_relative 'provider_registry'
@@ -16,5 +18,6 @@ require_relative 'providers/azure_openai'
 require_relative 'providers/opencode'
 require_relative 'providers/groq'
 require_relative 'providers/deepseek'
+require_relative 'providers/mistral'
 require_relative 'providers/openrouter'
 require_relative 'providers/mock'

data/lib/skill_bench/clients/base_client.rb CHANGED

@@ -4,6 +4,7 @@ require_relative '../config'
 require_relative 'provider_config'
 require_relative 'response_parser'
 require_relative 'response_error_handler'
+require_relative 'response_builder'
 require_relative 'request_builder'
 require_relative 'retry_handler'
@@ -135,7 +136,7 @@ module SkillBench
                   else
                     "#{missing.first} not set for #{@provider_display_name}"
                   end
-        { success: false, response: { error: { message: message } }, result: message, status: 'error' }
+        ResponseBuilder.error(message: message)
       end
       # Extracts the message hash from the provider's specific response body structure.
@@ -158,11 +159,22 @@ module SkillBench
       def execute_request
         RetryHandler.call do
-          connection = RequestBuilder.build_connection(base_url)
           RequestBuilder.execute(connection, request_path, headers: request_headers, body: request_body)
         end
       end
+      # Lazily builds and memoizes the Faraday connection for this client instance.
+      #
+      # Reusing one connection across the instance's sequential requests and retry
+      # attempts enables HTTP keep-alive, avoiding a fresh TCP + TLS handshake per turn.
+      # Memoization is intentionally per-instance (never global/shared) so concurrent
+      # agent and judge clients each own a connection, keeping net/http thread-safe.
+      #
+      # @return [Faraday::Connection] the reused connection for this instance.
+      def connection
+        @connection ||= RequestBuilder.build_connection(base_url)
+      end
       def handle_response(response)
         parsed = ResponseParser.parse_body(response)
         return failure_response(response, parsed) unless response.success?
@@ -182,10 +194,6 @@ module SkillBench
         message = extract_message(parsed)
         return missing_message_response(response, parsed) unless ResponseParser.valid_message?(message)
-        success_response(parsed, message)
-      end
-      def success_response(parsed, message)
         content = ResponseParser.extract_content(message)
         {
           success: true,