ruby-skill-bench 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +166 -35
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  9. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  10. data/lib/skill_bench/cli/help_printer.rb +10 -2
  11. data/lib/skill_bench/cli/init_command.rb +2 -1
  12. data/lib/skill_bench/cli/result_printer.rb +1 -1
  13. data/lib/skill_bench/cli/run_command.rb +47 -9
  14. data/lib/skill_bench/cli/validate_command.rb +242 -0
  15. data/lib/skill_bench/cli.rb +3 -0
  16. data/lib/skill_bench/client.rb +43 -1
  17. data/lib/skill_bench/clients/all.rb +2 -0
  18. data/lib/skill_bench/clients/base_client.rb +12 -1
  19. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  20. data/lib/skill_bench/clients/provider_config.rb +34 -1
  21. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  22. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  23. data/lib/skill_bench/commands/init.rb +5 -0
  24. data/lib/skill_bench/commands/skill_new.rb +3 -1
  25. data/lib/skill_bench/config/applier.rb +2 -0
  26. data/lib/skill_bench/config/defaults.rb +2 -0
  27. data/lib/skill_bench/config/facade_readers.rb +7 -0
  28. data/lib/skill_bench/config/facade_writers.rb +17 -0
  29. data/lib/skill_bench/config/json_loader.rb +1 -1
  30. data/lib/skill_bench/config/store.rb +29 -0
  31. data/lib/skill_bench/config.rb +18 -0
  32. data/lib/skill_bench/evaluation/runner.rb +20 -3
  33. data/lib/skill_bench/execution/context_hydrator.rb +52 -11
  34. data/lib/skill_bench/execution/sandbox.rb +58 -11
  35. data/lib/skill_bench/judge/judge.rb +4 -0
  36. data/lib/skill_bench/judge/prompt.rb +42 -6
  37. data/lib/skill_bench/models/config.rb +32 -0
  38. data/lib/skill_bench/output_formatter.rb +60 -1
  39. data/lib/skill_bench/package_verifier.rb +1 -1
  40. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  41. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  42. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  43. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  44. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  45. data/lib/skill_bench/services/html_formatter.rb +289 -0
  46. data/lib/skill_bench/services/json_formatter.rb +19 -1
  47. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  48. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  49. data/lib/skill_bench/services/response_cache.rb +130 -0
  50. data/lib/skill_bench/services/runner_service.rb +88 -4
  51. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  52. data/lib/skill_bench/services/template_registry.rb +43 -9
  53. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  54. data/lib/skill_bench/tools/registry.rb +29 -3
  55. data/lib/skill_bench/tools/run_command.rb +171 -19
  56. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  57. data/lib/skill_bench/trend_tracker.rb +5 -5
  58. data/lib/skill_bench/version.rb +1 -1
  59. data/lib/skill_bench.rb +2 -3
  60. metadata +17 -36
@@ -9,6 +9,7 @@ module SkillBench
9
9
  class BaseEvalOptions
10
10
  attr_reader :options, :parser
11
11
 
12
+ # Initializes the option set and the OptionParser used to parse the command's arguments.
12
13
  def initialize
13
14
  @options = default_options
14
15
  @parser = create_parser
@@ -39,10 +40,12 @@ module SkillBench
39
40
  class NewEvalOptions < BaseEvalOptions
40
41
  protected
41
42
 
43
+ # @return [Hash] default options for the `eval new` command, with the runtime defaulting to "ruby"
42
44
  def default_options
43
45
  { runtime: 'ruby' }
44
46
  end
45
47
 
48
+ # @return [OptionParser] parser for the `eval new` command, handling --runtime and --help
46
49
  def create_parser
47
50
  OptionParser.new do |opts|
48
51
  opts.banner = 'Usage: skill-bench eval new <name> [options]'
@@ -59,6 +62,7 @@ module SkillBench
59
62
  class GenerateEvalOptions < BaseEvalOptions
60
63
  protected
61
64
 
65
+ # @return [OptionParser] parser for the `eval generate` command, handling --name and --help
62
66
  def create_parser
63
67
  OptionParser.new do |opts|
64
68
  opts.banner = 'Usage: skill-bench eval generate <skill-name> [options]'
@@ -20,11 +20,14 @@ module SkillBench
20
20
  --force Overwrite existing config file
21
21
 
22
22
  run <eval> --skill <name> [--skill <name>] [--format FORMAT] [--pack NAME]
23
- Run an evaluation
23
+ Run an evaluation (single eval, or a whole directory with --all)
24
24
  --skill Skill to use (can be specified multiple times)
25
25
  --pack Pack context for registry-based skill resolution
26
26
  --registry-manifest PATH Path to registry.json manifest
27
- --format Output format: human, json, junit (default: human)
27
+ --format Output format: human, json, junit, html (default: human)
28
+ --all Run every eval under evals/ (batch mode)
29
+ --evals-dir DIR Run every eval under DIR (batch mode)
30
+ --summary Emit a JSON summary gate for a batch run (batch mode)
28
31
 
29
32
  compare <skill-name> --variant-a SPEC --variant-b SPEC --eval PATH
30
33
  Compare the same skill across two pack variants
@@ -45,6 +48,11 @@ module SkillBench
45
48
  Auto-generate an eval from a skill
46
49
  --name Name for the generated eval (optional)
47
50
 
51
+ validate (alias: doctor) [--criteria PATH] [--config PATH]
52
+ Run read-only pre-flight checks (no eval, no network)
53
+ --criteria Criteria JSON to validate (default: criteria.json)
54
+ --config Config file to validate (default: skill-bench.json)
55
+
48
56
  Global Options:
49
57
  -h, --help Show this help message
50
58
  USAGE
@@ -45,6 +45,7 @@ module SkillBench
45
45
  OptionParser.new do |opts|
46
46
  opts.banner = 'Usage: skill-bench init --<provider> [options]'
47
47
  register_provider_options(opts, options)
48
+ opts.on('--mock', 'Generate offline mock config (no API key required)') { options[:provider] = :mock }
48
49
  opts.on('--force', 'Overwrite existing config file') { options[:force] = true }
49
50
  opts.on('-h', '--help', 'Prints this help') do
50
51
  puts opts
@@ -60,7 +61,7 @@ module SkillBench
60
61
  end
61
62
 
62
63
  def error_missing_provider
63
- providers = SkillBench::Clients::ProviderSchemas.names.map { |provider_name| "--#{provider_name}" }.join(', ')
64
+ providers = (SkillBench::Clients::ProviderSchemas.names.map { |provider_name| "--#{provider_name}" } + ['--mock']).join(', ')
64
65
  warn "Error: provider is required. Use one of: #{providers}"
65
66
  1
66
67
  end
@@ -9,7 +9,7 @@ module SkillBench
9
9
  # Prints the result and returns the appropriate exit code.
10
10
  #
11
11
  # @param result [Hash] Result from ScoringService
12
- # @param format [Symbol] Output format (:human, :json, :junit)
12
+ # @param format [Symbol] Output format (:human, :json, :junit, :html)
13
13
  # @return [Integer] Exit code (0 for pass, 1 for fail)
14
14
  def self.call(result, format: :human)
15
15
  puts OutputFormatter.format(result, format: format)
@@ -19,7 +19,7 @@ module SkillBench
19
19
  @argv = argv
20
20
  end
21
21
 
22
- # Parses options and runs the eval.
22
+ # Parses options and runs the eval(s).
23
23
  #
24
24
  # @return [Integer] Exit code
25
25
  def call
@@ -27,14 +27,9 @@ module SkillBench
27
27
  parser = build_parser(options)
28
28
  parser.parse!(@argv)
29
29
 
30
- eval_name = @argv.shift
31
- return error_missing_eval unless eval_name
32
- return error_missing_skill if options[:skill_names].empty? && !options[:pack]
30
+ return run_batch(options) if batch_requested?(options)
33
31
 
34
- options[:eval_name] = eval_name
35
- exec_options = options.reject { |key| key == :format }
36
- result = Commands::Run.run(**exec_options)
37
- ResultPrinter.call(result, format: options[:format] || :human)
32
+ run_single(options)
38
33
  rescue HelpRequested
39
34
  0
40
35
  rescue StandardError => e
@@ -44,13 +39,56 @@ module SkillBench
44
39
 
45
40
  private
46
41
 
42
+ # Whether a whole-directory batch run was requested.
43
+ #
44
+ # @param options [Hash] Parsed options
45
+ # @return [Boolean] true when --all or --evals-dir was given
46
+ def batch_requested?(options)
47
+ options[:all] || options[:evals_dir]
48
+ end
49
+
50
+ # Runs a single eval (the original `run <eval> --skill ...` path).
51
+ #
52
+ # @param options [Hash] Parsed options
53
+ # @return [Integer] Exit code
54
+ def run_single(options)
55
+ eval_name = @argv.shift
56
+ return error_missing_eval unless eval_name
57
+ return error_missing_skill if options[:skill_names].empty? && !options[:pack]
58
+
59
+ options[:eval_name] = eval_name
60
+ exec_options = options.reject { |key| %i[format summary all evals_dir].include?(key) }
61
+ result = Commands::Run.run(**exec_options)
62
+ ResultPrinter.call(result, format: options[:format] || :human)
63
+ end
64
+
65
+ # Runs every eval under the target directory and prints an aggregate.
66
+ #
67
+ # @param options [Hash] Parsed options
68
+ # @return [Integer] Exit code
69
+ def run_batch(options)
70
+ return error_missing_skill if options[:skill_names].empty? && !options[:pack]
71
+
72
+ aggregate = Services::BatchRunnerService.call(
73
+ evals_dir: options[:evals_dir] || Services::BatchRunnerService::DEFAULT_EVALS_DIR,
74
+ skill_names: options[:skill_names],
75
+ pack: options[:pack],
76
+ registry_manifest: options[:registry_manifest]
77
+ )
78
+ BatchResultPrinter.call(aggregate, format: options[:format], summary: options[:summary])
79
+ end
80
+
47
81
  def build_parser(options)
48
82
  OptionParser.new do |opts|
49
83
  opts.banner = 'Usage: skill-bench run <eval> [options]'
50
84
  opts.on('--skill NAME', 'Skill to use (can be specified multiple times)') { |v| options[:skill_names] << v }
51
85
  opts.on('--pack NAME', 'Pack context for skill resolution') { |v| options[:pack] = v }
52
86
  opts.on('--registry-manifest PATH', 'Path to registry.json manifest') { |v| options[:registry_manifest] = v }
53
- opts.on('--format FORMAT', 'Output format (human, json, junit)') { |v| options[:format] = v.to_sym }
87
+ opts.on('--format FORMAT', 'Output format (human, json, junit, html)') { |v| options[:format] = v.to_sym }
88
+ opts.on('--all', 'Run every eval under the default evals/ directory') { options[:all] = true }
89
+ opts.on('--evals-dir DIR', 'Run every eval under DIR') { |v| options[:evals_dir] = v }
90
+ opts.on('--summary', 'Emit a JSON summary gate for a batch run') { options[:summary] = true }
91
+ opts.on('--cache', 'Enable content-addressed response caching') { ENV['SKILL_BENCH_CACHE'] = '1' }
54
92
  opts.on('-h', '--help', 'Prints this help') do
55
93
  puts opts
56
94
  raise SkillBench::HelpRequested
@@ -0,0 +1,242 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'optparse'
5
+
6
+ module SkillBench
7
+ module Cli
8
+ # Handles the `skill-bench validate` / `doctor` subcommand.
9
+ #
10
+ # Runs read-only pre-flight checks and prints a PASS/FAIL report:
11
+ # 1. Criteria JSON structure (via {Models::CriteriaValidator}).
12
+ # 2. skill-bench.json shape (hand-rolled, lightweight schema check).
13
+ # 3. Provider credentials for the configured non-mock provider.
14
+ #
15
+ # It never runs an eval and never makes a network call.
16
+ class ValidateCommand
17
+ # Default criteria file validated when --criteria is not given.
18
+ DEFAULT_CRITERIA = 'criteria.json'
19
+
20
+ # @param argv [Array<String>] Raw CLI arguments
21
+ # @return [Integer] Exit code
22
+ def self.call(argv)
23
+ new(argv).call
24
+ end
25
+
26
+ # @param argv [Array<String>] Raw CLI arguments
27
+ def initialize(argv)
28
+ @argv = argv
29
+ end
30
+
31
+ # Parses options, runs the pre-flight checks, and prints the report.
32
+ #
33
+ # @return [Integer] Exit code (0 when all checks pass, 1 otherwise)
34
+ def call
35
+ options = parse_options
36
+ config_path = options[:config] || SkillBench::Config::CONFIG_FILENAME
37
+ config_data = load_config_data(config_path)
38
+ results = [
39
+ check_criteria(options),
40
+ check_config(config_path, config_data),
41
+ check_provider_key(config_data)
42
+ ]
43
+ print_report(results)
44
+ results.any? { |result| result[:status] == :fail } ? 1 : 0
45
+ rescue HelpRequested
46
+ 0
47
+ rescue StandardError => e
48
+ warn "Error: #{e.message}"
49
+ 1
50
+ end
51
+
52
+ private
53
+
54
+ def parse_options
55
+ options = {}
56
+ build_parser(options).parse!(@argv)
57
+ options
58
+ end
59
+
60
+ def build_parser(options)
61
+ OptionParser.new do |opts|
62
+ opts.banner = 'Usage: skill-bench validate [options]'
63
+ opts.on('--criteria PATH', 'Criteria JSON file to validate (default: criteria.json)') { |v| options[:criteria] = v }
64
+ opts.on('--config PATH', 'Config file to validate (default: skill-bench.json)') { |v| options[:config] = v }
65
+ opts.on('-h', '--help', 'Prints this help') do
66
+ puts opts
67
+ raise SkillBench::HelpRequested
68
+ end
69
+ end
70
+ end
71
+
72
+ # --- Check (a): criteria ------------------------------------------------
73
+
74
+ def check_criteria(options)
75
+ path = options[:criteria] || DEFAULT_CRITERIA
76
+ unless File.exist?(path)
77
+ return fail_result('criteria', "criteria file not found: #{path}") if options[:criteria]
78
+
79
+ return skip_result('criteria', "no #{DEFAULT_CRITERIA} found (skipped)")
80
+ end
81
+
82
+ result = Models::CriteriaValidator.call(path:)
83
+ return pass_result('criteria', "#{path} is valid") if result[:success]
84
+
85
+ fail_result('criteria', "#{path}: #{criteria_error(result)}")
86
+ end
87
+
88
+ def criteria_error(result)
89
+ result.dig(:response, :error, :message) || 'invalid criteria'
90
+ end
91
+
92
+ # --- Check (b): config shape -------------------------------------------
93
+
94
+ def check_config(path, config_data)
95
+ case config_data[:status]
96
+ when :missing
97
+ fail_result('config', "#{path} not found")
98
+ when :invalid_json
99
+ fail_result('config', "#{path} is not valid JSON: #{config_data[:message]}")
100
+ else
101
+ validate_config_shape(path, config_data[:data])
102
+ end
103
+ end
104
+
105
+ def validate_config_shape(path, data)
106
+ return fail_result('config', "#{path} must contain a JSON object") unless data.is_a?(Hash)
107
+
108
+ errors = config_shape_errors(data)
109
+ return fail_result('config', errors.join('; ')) if errors.any?
110
+
111
+ pass_result('config', "#{path} matches the expected shape")
112
+ end
113
+
114
+ def config_shape_errors(data)
115
+ errors = provider_errors(data[:provider])
116
+ errors.concat(max_execution_time_errors(data[:max_execution_time]))
117
+ errors << "'config' must be an object" if data.key?(:config) && !data[:config].is_a?(Hash)
118
+ errors
119
+ end
120
+
121
+ def provider_errors(provider)
122
+ return ["'provider' is required"] if provider.nil?
123
+ return ["'provider' must be a string"] unless provider.is_a?(String)
124
+
125
+ allowed = Models::Provider::ALLOWED_PROVIDERS
126
+ return [] if allowed.include?(provider)
127
+
128
+ ["'provider' '#{provider}' is not one of: #{allowed.join(', ')}"]
129
+ end
130
+
131
+ def max_execution_time_errors(value)
132
+ return [] if value.nil?
133
+ return [] if value.is_a?(Integer) && value.positive?
134
+
135
+ ["'max_execution_time' must be a positive integer"]
136
+ end
137
+
138
+ # --- Check (c): provider key -------------------------------------------
139
+
140
+ def check_provider_key(config_data)
141
+ return skip_result('provider key', 'skipped (no usable config)') unless config_data[:status] == :ok
142
+
143
+ provider = config_provider(config_data[:data])
144
+ return skip_result('provider key', 'skipped (provider invalid)') unless provider
145
+ return pass_result('provider key', 'mock provider requires no API key') if provider == 'mock'
146
+
147
+ missing = missing_provider_keys(provider, config_data[:data][:config])
148
+ return pass_result('provider key', "#{provider} credentials present") if missing.empty?
149
+
150
+ fail_result('provider key', "#{provider} is missing: #{missing.join(', ')}")
151
+ rescue StandardError => e
152
+ # Building the client can raise on unrelated config (e.g. base_url
153
+ # validation); surface that as a structured FAIL rather than crashing.
154
+ fail_result('provider key', "#{provider} config is invalid: #{e.message}")
155
+ end
156
+
157
+ def config_provider(data)
158
+ return nil unless data.is_a?(Hash)
159
+
160
+ provider = data[:provider]
161
+ return nil unless provider.is_a?(String) && Models::Provider::ALLOWED_PROVIDERS.include?(provider)
162
+
163
+ provider
164
+ end
165
+
166
+ def missing_provider_keys(provider, provider_config)
167
+ provider_sym = provider.to_sym
168
+ options = provider_client_options(provider_sym, provider_config)
169
+ client = Clients::ProviderRegistry.for(provider_sym).new(options)
170
+ return [] unless client.respond_to?(:missing_config_keys, true)
171
+
172
+ client.send(:missing_config_keys)
173
+ end
174
+
175
+ def provider_client_options(provider_sym, provider_config)
176
+ options = provider_config.is_a?(Hash) ? provider_config.dup : {}
177
+ Models::Provider::ENV_OVERRIDABLE_SETTINGS.each do |setting|
178
+ value = env_setting(provider_sym, setting)
179
+ options[setting] = value unless value.nil?
180
+ end
181
+ options
182
+ end
183
+
184
+ def env_setting(provider_sym, setting)
185
+ provider = provider_sym.to_s.upcase
186
+ name = setting.to_s.upcase
187
+ ["SKILL_BENCH_#{provider}_#{name}", "#{provider}_#{name}"].each do |var|
188
+ value = ENV.fetch(var, nil)
189
+ return value if value && !value.empty?
190
+ end
191
+ nil
192
+ end
193
+
194
+ # --- Config loading ----------------------------------------------------
195
+
196
+ def load_config_data(path)
197
+ return { status: :missing } unless File.exist?(path)
198
+
199
+ { status: :ok, data: JSON.parse(File.read(path), symbolize_names: true) }
200
+ rescue JSON::ParserError => e
201
+ { status: :invalid_json, message: e.message }
202
+ end
203
+
204
+ # --- Reporting ---------------------------------------------------------
205
+
206
+ def print_report(results)
207
+ puts 'skill-bench validate'
208
+ puts
209
+ results.each { |result| puts format_result(result) }
210
+ puts
211
+ puts summary_line(results)
212
+ end
213
+
214
+ def format_result(result)
215
+ "[#{label(result[:status])}] #{result[:name].ljust(13)} #{result[:message]}"
216
+ end
217
+
218
+ def label(status)
219
+ { pass: 'PASS', fail: 'FAIL', skip: 'SKIP' }.fetch(status)
220
+ end
221
+
222
+ def summary_line(results)
223
+ failed = results.count { |result| result[:status] == :fail }
224
+ return "#{failed} check(s) failed." if failed.positive?
225
+
226
+ 'All checks passed.'
227
+ end
228
+
229
+ def pass_result(name, message)
230
+ { name:, status: :pass, message: }
231
+ end
232
+
233
+ def fail_result(name, message)
234
+ { name:, status: :fail, message: }
235
+ end
236
+
237
+ def skip_result(name, message)
238
+ { name:, status: :skip, message: }
239
+ end
240
+ end
241
+ end
242
+ end
@@ -5,8 +5,10 @@ require_relative 'cli/run_command'
5
5
  require_relative 'cli/compare_command'
6
6
  require_relative 'cli/skill_command'
7
7
  require_relative 'cli/eval_command'
8
+ require_relative 'cli/validate_command'
8
9
  require_relative 'cli/help_printer'
9
10
  require_relative 'cli/result_printer'
11
+ require_relative 'cli/batch_result_printer'
10
12
 
11
13
  module SkillBench
12
14
  # Raised when -h/--help is passed to abort OptionParser and return exit code 0.
@@ -42,6 +44,7 @@ module SkillBench
42
44
  when 'compare' then Cli::CompareCommand.call(@argv)
43
45
  when 'skill' then Cli::SkillCommand.call(@argv)
44
46
  when 'eval' then Cli::EvalCommand.call(@argv)
47
+ when 'validate', 'doctor' then Cli::ValidateCommand.call(@argv)
45
48
  when '-h', '--help', 'help'
46
49
  help.call
47
50
  else
@@ -1,13 +1,27 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'clients/all'
4
+ require_relative 'services/response_cache'
4
5
 
5
6
  module SkillBench
6
7
  # Facade for calling LLM clients.
7
8
  # Delegates to the configured provider.
8
9
  class Client
10
+ # Provider clients that must never be cached: their results either signal a
11
+ # configuration error (NullClient) or are cheap, deterministic test doubles
12
+ # (Mock). Caching them would provide no benefit and could mask errors.
13
+ UNCACHEABLE_CLIENTS = [
14
+ Clients::Providers::NullClient,
15
+ Clients::Providers::Mock
16
+ ].freeze
17
+
9
18
  # Calls the configured LLM provider with the given parameters.
10
19
  #
20
+ # When response caching is enabled (see {Services::ResponseCache.enabled?})
21
+ # and the resolved provider is cacheable, identical requests reuse a cached
22
+ # response instead of calling the provider again. When caching is disabled
23
+ # (the default), the provider is always invoked, leaving behavior unchanged.
24
+ #
11
25
  # @param system_prompt [String] System prompt for the LLM
12
26
  # @param messages [Array<Hash>] Conversation messages
13
27
  # @param provider [Symbol, nil] Override the configured LLM provider (e.g., :deepseek, :openai)
@@ -17,7 +31,35 @@ module SkillBench
17
31
  resolved = provider || Config.current_llm_provider || :openai
18
32
  client_class = Clients::ProviderRegistry.for(resolved)
19
33
  warn "WARNING: LLM provider '#{resolved}' is not configured. Falling back to null client." if client_class == Clients::Providers::NullClient
20
- client_class.call(system_prompt: system_prompt, messages: messages, **options)
34
+
35
+ invoke = -> { client_class.call(system_prompt: system_prompt, messages: messages, **options) }
36
+ return invoke.call unless cache_eligible?(client_class)
37
+
38
+ cache_key = Services::ResponseCache.key(
39
+ provider: resolved,
40
+ model: options[:model],
41
+ system_prompt: system_prompt,
42
+ messages: messages,
43
+ tools: options[:tools],
44
+ temperature: options[:temperature],
45
+ provider_config: options.slice(:base_url, :request_path, :endpoint, :location, :project_id, :api_version)
46
+ )
47
+ Services::ResponseCache.fetch(cache_key, &invoke)
48
+ end
49
+
50
+ # Whether a resolved provider client may be served from the cache.
51
+ #
52
+ # Requires caching to be enabled and the client to not be one of the
53
+ # {UNCACHEABLE_CLIENTS} (null/mock), so disabling the cache restores the
54
+ # original, uncached behavior exactly.
55
+ #
56
+ # @param client_class [Class] The resolved provider client class
57
+ # @return [Boolean] true when the call should go through the cache
58
+ def self.cache_eligible?(client_class)
59
+ return false unless Services::ResponseCache.enabled?
60
+
61
+ !UNCACHEABLE_CLIENTS.include?(client_class)
21
62
  end
63
+ private_class_method :cache_eligible?
22
64
  end
23
65
  end
@@ -5,6 +5,7 @@ require_relative 'response_error_handler'
5
5
  require_relative 'response_builder'
6
6
  require_relative 'request_builder'
7
7
  require_relative 'retry_handler'
8
+ require_relative 'base_url_validator'
8
9
  require_relative 'base_client'
9
10
  require_relative 'provider_config'
10
11
  require_relative 'provider_registry'
@@ -17,5 +18,6 @@ require_relative 'providers/azure_openai'
17
18
  require_relative 'providers/opencode'
18
19
  require_relative 'providers/groq'
19
20
  require_relative 'providers/deepseek'
21
+ require_relative 'providers/mistral'
20
22
  require_relative 'providers/openrouter'
21
23
  require_relative 'providers/mock'
@@ -159,11 +159,22 @@ module SkillBench
159
159
 
160
160
  def execute_request
161
161
  RetryHandler.call do
162
- connection = RequestBuilder.build_connection(base_url)
163
162
  RequestBuilder.execute(connection, request_path, headers: request_headers, body: request_body)
164
163
  end
165
164
  end
166
165
 
166
+ # Lazily builds and memoizes the Faraday connection for this client instance.
167
+ #
168
+ # Reusing one connection across the instance's sequential requests and retry
169
+ # attempts enables HTTP keep-alive, avoiding a fresh TCP + TLS handshake per turn.
170
+ # Memoization is intentionally per-instance (never global/shared) so concurrent
171
+ # agent and judge clients each own a connection, keeping net/http thread-safe.
172
+ #
173
+ # @return [Faraday::Connection] the reused connection for this instance.
174
+ def connection
175
+ @connection ||= RequestBuilder.build_connection(base_url)
176
+ end
177
+
167
178
  def handle_response(response)
168
179
  parsed = ResponseParser.parse_body(response)
169
180
  return failure_response(response, parsed) unless response.success?
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'uri'
4
+
5
+ module SkillBench
6
+ module Clients
7
+ # Validates a provider `base_url` before it is used to build an HTTP
8
+ # connection that may carry an API key / bearer token.
9
+ #
10
+ # Security rationale: `base_url` is taken verbatim from config/env input and
11
+ # the authenticated request attaches a credential to whatever host it names.
12
+ # Left unchecked this is an SSRF surface, and an `http://` URL would transmit
13
+ # the credential in cleartext. This service enforces:
14
+ #
15
+ # - the URL must be an absolute `http`/`https` URL with a host (empty/relative
16
+ # /garbage values are rejected);
17
+ # - when a credential will be attached, non-loopback hosts MUST use `https`;
18
+ # loopback hosts (`localhost`, `127.0.0.1`, `::1`) MAY use `http` — the
19
+ # legitimate self-hosted/Ollama case — and an explicit opt-in
20
+ # (`allow_insecure_base_url`) can permit cleartext for non-loopback hosts.
21
+ #
22
+ # A blank (`nil`/empty) `base_url` is allowed so providers may supply their
23
+ # own (https) default downstream. Error messages describe only the transport
24
+ # and never include the credential.
25
+ class BaseUrlValidator
26
+ # Hosts permitted to use cleartext `http` even with a credential attached.
27
+ LOOPBACK_HOSTS = %w[localhost 127.0.0.1 ::1].freeze
28
+
29
+ # Raised when a base URL is structurally invalid or would leak a credential
30
+ # over cleartext transport. The message never contains the credential.
31
+ class InvalidBaseURLError < StandardError; end
32
+
33
+ # Validates a base URL and returns it unchanged when valid.
34
+ #
35
+ # @param base_url [String, nil] the URL to validate; blank values are
36
+ # returned as-is so a provider default can be applied later.
37
+ # @param has_credential [Boolean] whether a credential (api key/bearer
38
+ # token) will be attached to requests sent to this URL.
39
+ # @param allow_insecure [Boolean] explicit opt-in that permits cleartext
40
+ # `http` to a non-loopback host even when a credential is attached.
41
+ # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
42
+ # @return [String, nil] the validated URL (blank input returned unchanged).
43
+ def self.call(base_url:, has_credential: false, allow_insecure: false)
44
+ new(base_url, has_credential, allow_insecure).call
45
+ end
46
+
47
+ # @param base_url [String, nil] the URL to validate.
48
+ # @param has_credential [Boolean] whether a credential will be attached.
49
+ # @param allow_insecure [Boolean] opt-in permitting cleartext non-loopback.
50
+ def initialize(base_url, has_credential, allow_insecure)
51
+ @base_url = base_url
52
+ @has_credential = has_credential
53
+ @allow_insecure = allow_insecure
54
+ end
55
+
56
+ # Runs the validation.
57
+ #
58
+ # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
59
+ # @return [String, nil] the validated URL.
60
+ def call
61
+ return @base_url if blank?(@base_url)
62
+
63
+ validate_absolute_http_url!
64
+ validate_secure_transport!
65
+ @base_url
66
+ end
67
+
68
+ private
69
+
70
+ def blank?(value)
71
+ value.to_s.strip.empty?
72
+ end
73
+
74
+ def uri
75
+ @uri ||= URI.parse(@base_url.to_s)
76
+ rescue URI::InvalidURIError
77
+ nil
78
+ end
79
+
80
+ def validate_absolute_http_url!
81
+ return if uri.is_a?(URI::HTTP) && !blank?(uri.hostname)
82
+
83
+ raise InvalidBaseURLError,
84
+ "Invalid provider base_url #{@base_url.inspect}: " \
85
+ 'must be an absolute http(s) URL with a host.'
86
+ end
87
+
88
+ def validate_secure_transport!
89
+ return unless @has_credential
90
+ return if uri.scheme == 'https'
91
+ return if loopback?
92
+ return if @allow_insecure
93
+
94
+ raise InvalidBaseURLError,
95
+ 'Insecure provider base_url: refusing to send a credential over cleartext http ' \
96
+ "to non-loopback host #{uri.hostname.inspect}. Use https, target a loopback host, " \
97
+ 'or set allow_insecure_base_url: true to override.'
98
+ end
99
+
100
+ def loopback?
101
+ LOOPBACK_HOSTS.include?(uri.hostname)
102
+ end
103
+ end
104
+ end
105
+ end