ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,158 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module SkillBench
6
+ module Judge
7
# Parses and validates structured JSON responses from the LLM judge.
#
# Expects a JSON object with a 'dimensions' key mapping dimension names
# to score hashes, and an optional 'overall_reasoning' string. The raw text
# may be wrapped in a Markdown code fence, which is removed before parsing.
#
# Every outcome is reported as a service-response Hash:
# `{ success: true, response: { judge_response: self } }` on success, or
# `{ success: false, response: { error: { message: String } } }` on failure.
class Response
  attr_reader :dimensions, :overall_reasoning

  # Parses a judge JSON string.
  #
  # @param json [String] The raw JSON string from the judge.
  # @return [Hash] Service response with parsed judge response or error.
  def self.call(json:)
    new(json:).call
  end

  # @param json [String] The raw JSON string from the judge.
  def initialize(json:)
    @json = json
  end

  # Parses and validates the judge JSON.
  #
  # On success the parsed values are also exposed through {#dimensions}
  # and {#overall_reasoning} on this instance.
  #
  # @return [Hash] Service response with judge response or error.
  def call
    parsed = parse_json
    return parsed unless parsed[:success]

    payload = parsed[:response][:data]
    structure = validate_structure(payload)
    return structure unless structure[:success]

    extracted = extract_dimensions(read_key(payload, 'dimensions'))
    return extracted unless extracted[:success]

    @dimensions = extracted[:response][:dimensions]
    @overall_reasoning = read_key(payload, 'overall_reasoning') || ''

    success(judge_response: self)
  rescue StandardError => e
    # Last-resort guard: anything unexpected is logged and reported as a failure.
    SkillBench::ErrorLogger.log_error(e, 'Judge::Response Parse Error')
    failure(e.message)
  end

  private

  attr_reader :json

  # Builds a successful service response.
  def success(response = {})
    { success: true, response: response }
  end

  # Builds a failed service response carrying +message+.
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end

  # Reads +key+ from +hash+, accepting either the String or the Symbol form.
  def read_key(hash, key)
    hash[key] || hash[key.to_sym]
  end

  # Decodes the raw text into Ruby data, reporting malformed JSON as a failure.
  def parse_json
    success(data: JSON.parse(strip_markdown_fences(json)))
  rescue JSON::ParserError => e
    failure("Invalid JSON: #{e.message}")
  end

  # Removes a surrounding Markdown code fence (``` or ```json), if any.
  # Surrounding whitespace is trimmed first so a fence preceded by a blank
  # line is still recognised; nil input is treated as an empty string.
  def strip_markdown_fences(text)
    trimmed = text.to_s.strip
    return trimmed unless trimmed.start_with?('```')

    lines = trimmed.lines
    lines.shift # opening fence line, possibly carrying a language tag
    lines.pop if lines.last&.strip == '```'
    lines.join.strip
  end

  # Checks the top-level shape: a JSON object with a non-empty 'dimensions' object.
  def validate_structure(payload)
    return failure('Judge response must be a JSON object') unless payload.is_a?(Hash)

    dims = read_key(payload, 'dimensions')
    return missing_dimensions_result if dims.nil?
    return failure("Judge response 'dimensions' must be a JSON object") unless dims.is_a?(Hash)
    return empty_dimensions_result if dims.empty?

    success
  end

  def missing_dimensions_result
    failure("Judge response missing 'dimensions' key")
  end

  def empty_dimensions_result
    failure("Judge response 'dimensions' is empty")
  end

  # Validates every dimension, stopping at the first invalid one.
  def extract_dimensions(dims)
    collected = {}

    dims.each do |name, dim|
      validated = validate_dimension(name, dim)
      return validated unless validated[:success]

      collected[name] = validated[:response][:dimension]
    end

    success(dimensions: collected)
  end

  # Validates one dimension entry and normalises it to
  # `{ score:, max_score:, reasoning: }`.
  def validate_dimension(name, dim)
    return failure("Judge dimension '#{name}' must be a JSON object, got #{dim.inspect}") unless dim.is_a?(Hash)

    score = read_key(dim, 'score')
    return missing_score_result(name) if score.nil?

    numeric_score = parse_numeric(score)
    return invalid_score_result(name, score) if numeric_score.nil?

    max_score = read_key(dim, 'max_score')
    bounds = validate_max_score(name, numeric_score, max_score)
    return bounds unless bounds[:success]

    success(dimension: { score: numeric_score, max_score: max_score, reasoning: read_key(dim, 'reasoning') || '' })
  end

  # Bounds are only enforced when the judge supplied a max_score.
  def validate_max_score(name, numeric_score, max_score)
    return success unless max_score
    return invalid_max_score_result(name, max_score) unless max_score.is_a?(Numeric)
    return out_of_bounds_result(name, numeric_score, max_score) if numeric_score.negative? || numeric_score > max_score

    success
  end

  # Returns +value+ as a number (numeric strings become Floats), or nil when
  # it cannot be interpreted as one.
  def parse_numeric(value)
    return value if value.is_a?(Numeric)

    Float(value)
  rescue ArgumentError, TypeError
    nil
  end

  def missing_score_result(name)
    failure("Judge dimension '#{name}' missing score")
  end

  def invalid_score_result(name, score)
    failure("Judge dimension '#{name}' has invalid score: #{score.inspect}")
  end

  def out_of_bounds_result(name, score, max_score)
    failure("Judge dimension '#{name}' score #{score} out of bounds (0..#{max_score})")
  end

  def invalid_max_score_result(name, max_score)
    failure("Judge dimension '#{name}' has invalid max_score: #{max_score.inspect} (must be numeric)")
  end
end
157
+ end
158
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
# Namespace grouping the judge subsystem.
#
# The judge scores AI-generated code modifications: it calls an LLM and
# parses the structured JSON responses that come back.
module Judge; end
10
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+ require 'json'
5
+
6
+ module SkillBench
7
+ module Migration
8
# Migrates old provider classes to new YAML-based configuration
class ProviderMigrator
  # Migrate providers to YAML config file.
  #
  # Existing content of the file is preserved; each migrated provider is
  # written under the top-level 'providers' key, replacing any entry with the
  # same name. Keys at every nesting level and Symbol values are converted to
  # Strings, because the file is read back with `permitted_classes: []`, which
  # rejects Symbols.
  #
  # @param providers [Hash] Providers to migrate (name => config hash)
  # @param yaml_path [String] Path to YAML config file (default: .agent-eval.yml)
  # @return [Integer] Number of bytes written to the file
  def self.migrate(providers, yaml_path = '.agent-eval.yml')
    existing = if File.exist?(yaml_path)
                 # An empty file loads as nil/false; treat it as an empty config.
                 YAML.safe_load_file(yaml_path, permitted_classes: [], aliases: false) || {}
               else
                 {}
               end

    existing['providers'] ||= {}

    providers.each do |name, config|
      existing['providers'][name.to_s] = stringify(config)
    end

    File.write(yaml_path, existing.to_yaml)
  end

  # Recursively converts Hash keys and Symbol values to Strings so the dumped
  # YAML contains only plain scalars, sequences and mappings.
  #
  # @param value [Object] Any configuration value
  # @return [Object] The value with Symbols replaced by Strings
  def self.stringify(value)
    case value
    when Hash then value.to_h { |key, nested| [key.to_s, stringify(nested)] }
    when Array then value.map { |nested| stringify(nested) }
    when Symbol then value.to_s
    else value
    end
  end
  private_class_method :stringify
end
29
+ end
30
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative 'provider'
5
+
6
+ module SkillBench
7
+ module Models
8
# Represents the skill-bench configuration loaded from skill-bench.json
class Config
  # Top-level keys are normalised to Symbols so that data built by hand with
  # String keys is read the same way as data produced by {.load}.
  #
  # @param data [Hash] Raw configuration data
  # @raise [ArgumentError] if data is not a Hash
  def initialize(data = {})
    raise ArgumentError, 'Config-data must be a Hash' unless data.is_a?(Hash)

    @data = data.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
  end

  # Load configuration from a JSON file
  # @param path [String] Path to config file (default: skill-bench.json)
  # @return [SkillBench::Models::Config] Loaded config instance
  # @raise [Errno::ENOENT] if config file not found
  # @raise [JSON::ParserError] if the file is not valid JSON
  # @raise [ArgumentError] if the top-level JSON value is not an object
  def self.load(path = 'skill-bench.json')
    new(JSON.parse(File.read(path), symbolize_names: true))
  end

  # Returns the configured provider name
  # @return [String, nil] Provider name
  def provider_name
    @data[:provider]
  end

  # Returns the provider configuration
  # @return [Hash] Provider configuration (empty when none is set)
  def provider_config
    @data[:config] || {}
  end

  # Returns max execution time
  # @return [Integer] Max execution time in seconds (30 when not configured)
  def max_execution_time
    @data[:max_execution_time] || 30
  end

  # Builds a Provider model from the current configuration.
  # The provider name is used for the provider's name, runtime and llm alike.
  #
  # @return [SkillBench::Models::Provider, nil] The configured provider, or
  #   nil when no provider is configured or the provider name is 'mock'
  def to_provider
    return nil if provider_name.nil? || provider_name == 'mock'

    Provider.new(
      name: provider_name,
      runtime: provider_name,
      llm: provider_name,
      config: provider_config
    )
  end
end
60
+ end
61
+ end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module SkillBench
6
+ module Models
7
# Validates evaluation criteria JSON structure without building rich objects.
#
# All outcomes are returned as service-response Hashes:
# `{ success: true, response: { criteria: Hash } }` or
# `{ success: false, response: { error: { message: String } } }`.
class CriteriaValidator
  # Validates criteria from a JSON file
  #
  # @param path [String] Path to criteria JSON file
  # @return [Hash] Validation result with success status and criteria data
  def self.call(path:)
    new(path).call
  end

  # @param path [String] Path to criteria JSON file
  def initialize(path)
    @path = path
  end

  # Validates the criteria file.
  #
  # @return [Hash] Validation result; on success the parsed criteria
  #   (with symbolized keys) is under `response[:criteria]`.
  def call
    # A directory "exists" but cannot be read as a file, so require a regular file.
    return file_not_found_result unless File.file?(path)

    parsed_result = parse_json(path)
    return parsed_result unless parsed_result[:success]

    criteria = parsed_result[:response][:data]
    validation = validate(criteria)
    return validation unless validation[:success]

    success(criteria: criteria)
  end

  private

  attr_reader :path

  # Builds a successful service response.
  def success(response = {})
    { success: true, response: response }
  end

  # Builds a failed service response carrying +message+.
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end

  def file_not_found_result
    failure("File not found: #{path}")
  end

  # Reads and decodes the file, reporting malformed JSON as a failure.
  def parse_json(file_path)
    success(data: JSON.parse(File.read(file_path), symbolize_names: true))
  rescue JSON::ParserError => e
    failure("Invalid JSON: #{e.message}")
  end

  # Runs the checks in order and returns the first failure:
  # dimensions, required fields, pass threshold, minimum delta.
  def validate(data)
    # Valid JSON may still be an array or scalar; only an object has fields.
    return failure('Criteria must be a JSON object') unless data.is_a?(Hash)

    dim_result = validate_dimensions(data.fetch(:dimensions, []))
    return dim_result unless dim_result[:success]

    field_result = validate_required_fields(data)
    return field_result unless field_result[:success]

    threshold_result = validate_pass_threshold(data[:pass_threshold])
    return threshold_result unless threshold_result[:success]

    validate_minimum_delta(data[:minimum_delta])
  end

  # Each dimension must be a Hash with a name and a numeric max_score, and
  # the max_scores must add up to exactly 100.
  def validate_dimensions(dimensions)
    return invalid_dimensions_result unless dimensions.is_a?(Array)
    return invalid_dimensions_result unless dimensions.all? { |dim| valid_dimension?(dim) }

    total = dimensions.sum { |dim| dim[:max_score] }
    return score_sum_result(total) unless total == 100

    success
  end

  def valid_dimension?(dim)
    dim.is_a?(Hash) && dim[:name] && dim[:max_score].is_a?(Numeric)
  end

  def invalid_dimensions_result
    failure('Invalid dimensions format')
  end

  def score_sum_result(total)
    failure("Dimension scores must sum to 100, got #{total}")
  end

  def validate_required_fields(data)
    missing = %i[pass_threshold minimum_delta].select { |field| data[field].nil? }
    return success if missing.empty?

    failure("Missing required fields: #{missing.join(', ')}")
  end

  # Only Integer thresholds within 0..100 are accepted.
  def validate_pass_threshold(value)
    return success if value.is_a?(Integer) && value.between?(0, 100)

    failure('Pass threshold must be between 0 and 100')
  end

  def validate_minimum_delta(value)
    return success if value.is_a?(Integer) && value >= 0

    failure('Minimum delta must be non-negative integer')
  end
end
105
+ end
106
+ end
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'pathname'
5
+ require_relative '../criteria'
6
+
7
+ module SkillBench
8
+ module Models
9
# Represents an evaluation scenario
class Eval
  attr_reader :name, :path, :task, :criteria, :source_code, :metadata

  # @param name [String] Eval name
  # @param path [String] Path to eval directory
  # @param task [String] Task description from task.md
  # @param criteria [SkillBench::Criteria, Hash] Criteria from criteria.json
  # @param source_code [String] Source code to evaluate
  # @param metadata [Hash] Metadata from metadata.json
  def initialize(name:, path:, task: '', criteria: {}, source_code: '', metadata: {})
    @name = name
    @path = path
    @task = task
    @criteria = criteria
    @source_code = source_code
    @metadata = metadata
  end

  # Load an eval from a directory.
  #
  # The eval name is the directory's basename. +source_code+ is not read
  # here and keeps its default.
  #
  # @param dir_path [String] Path to eval directory
  # @return [SkillBench::Models::Eval] Loaded eval instance
  # @raise [Errno::ENOENT] if dir_path does not exist or is not a directory
  def self.load(dir_path)
    path = Pathname.new(dir_path)
    # A regular file at dir_path is not an eval: it cannot contain
    # task.md / criteria.json / metadata.json.
    raise Errno::ENOENT, "Eval directory not found: #{dir_path}" unless path.directory?

    new(
      name: path.basename.to_s,
      path: dir_path,
      task: load_task(path),
      criteria: load_criteria(path),
      metadata: load_metadata(path)
    )
  end

  # Load task description from task.md
  # @param path [Pathname] Path to eval directory
  # @return [String] Task description or empty string if file doesn't exist
  def self.load_task(path)
    task_md = path.join('task.md')
    task_md.exist? ? File.read(task_md) : ''
  end

  # Load evaluation criteria from criteria.json
  # @param path [Pathname] Path to eval directory
  # @return [SkillBench::Criteria] Parsed criteria or empty criteria if file doesn't exist
  # @raise [RuntimeError] if SkillBench::Criteria reports a failure for the file
  def self.load_criteria(path)
    criteria_json = path.join('criteria.json')
    return SkillBench::Criteria.empty unless criteria_json.exist?

    result = SkillBench::Criteria.call(path: criteria_json.to_s)
    response = result[:response]
    return response[:criteria] if result[:success]

    raise "Failed to load criteria: #{response[:error][:message]}"
  end

  # Load metadata from metadata.json
  # @param path [Pathname] Path to eval directory
  # @return [Hash] Parsed metadata or empty hash if file doesn't exist
  # @raise [JSON::ParserError] if JSON is malformed
  def self.load_metadata(path)
    metadata_file = path.join('metadata.json')
    return {} unless metadata_file.exist?

    JSON.parse(File.read(metadata_file))
  end

  private_class_method :load_task, :load_criteria, :load_metadata
end
80
+ end
81
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../clients/provider_schemas'
4
+
5
+ module SkillBench
6
+ module Models
7
# Represents an agent runtime + LLM provider
class Provider
  # Names accepted by {#merged_config}: every registered provider schema plus 'mock'.
  ALLOWED_PROVIDERS = [*Clients::ProviderSchemas.names.map(&:to_s), 'mock'].freeze

  # Settings that can be overridden via environment variables.
  ENV_OVERRIDABLE_SETTINGS = %i[api_key model base_url endpoint location project_id api_version].freeze

  attr_reader :name, :runtime, :llm, :config

  # Initialize a new Provider
  # @param name [String] Provider name (e.g., "openai")
  # @param runtime [String] Agent runtime (e.g., "opencode")
  # @param llm [String] LLM provider (e.g., "openai")
  # @param config [Hash] Provider-specific configuration; keys are stored as
  #   Symbols, and any non-Hash value is replaced by an empty Hash
  def initialize(name:, runtime:, llm:, config: {})
    @name = name
    @runtime = runtime
    @llm = llm
    @config = config.is_a?(Hash) ? config.transform_keys(&:to_sym) : {}
  end

  # Returns the config with every overridable setting resolved.
  # For each setting an environment variable wins over the config value;
  # both `SKILL_BENCH_<PROVIDER>_<SETTING>` (documented standard) and
  # `<PROVIDER>_<SETTING>` (legacy) names are consulted. Every overridable
  # setting is present as a key in the result, with nil when unresolved.
  #
  # @return [Hash] Merged configuration
  # @raise [ArgumentError] if provider name is invalid or API key is missing
  def merged_config
    raise ArgumentError, "Invalid provider name: #{name}" unless ALLOWED_PROVIDERS.include?(name)

    resolved = ENV_OVERRIDABLE_SETTINGS.to_h { |setting| [setting, resolve_env_setting(setting)] }
    merged = config.merge(resolved)

    if merged[:api_key].to_s.empty?
      raise ArgumentError,
            "API key not found for provider '#{name}'. Set SKILL_BENCH_#{name.upcase}_API_KEY environment variable or provide in config."
    end

    merged
  end

  private

  # Resolves a single setting from environment variables.
  # Prefers `SKILL_BENCH_<PROVIDER>_<SETTING>`, falls back to
  # `<PROVIDER>_<SETTING>`, then to the config file value. Empty
  # environment values are treated as unset.
  #
  # @param setting [Symbol] The setting name (e.g., :api_key)
  # @return [String, nil] The resolved value
  def resolve_env_setting(setting)
    legacy_key = "#{name.upcase}_#{setting.upcase}"
    candidates = ["SKILL_BENCH_#{legacy_key}", legacy_key].map { |key| ENV.fetch(key, nil) }

    candidates.find { |value| value && !value.to_s.empty? } || config[setting]
  end
end
69
+ end
70
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
5
+ module SkillBench
6
+ module Models
7
# Represents a reusable skill for agent evaluation
class Skill
  attr_reader :name, :path

  # Initialize a new Skill
  # @param name [String] Skill name
  # @param path [String] Path to skill directory
  def initialize(name:, path:)
    @name = name
    @path = path
  end

  # Discover skills from a directory recursively.
  # Every directory at or below +base_path+ that contains a SKILL.md file
  # becomes one skill, named after that directory.
  #
  # @param base_path [String] Directory to search (default: "skills/")
  # @return [Array<SkillBench::Models::Skill>] Discovered skills (empty when
  #   base_path is not an existing directory)
  def self.discover(base_path = 'skills/')
    return [] unless Dir.exist?(base_path)

    marker_pattern = File.join(base_path, '**', 'SKILL.md')
    skill_dirs = Dir.glob(marker_pattern).map { |marker| File.dirname(marker) }
    skill_dirs.map { |dir| new(name: File.basename(dir), path: dir) }
  end
end
31
+ end
32
+ end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'services/iteration_formatter'
4
+ require_relative 'services/delta_table_formatter'
5
+ require_relative 'services/feedback_generator'
6
+ require_relative 'services/json_formatter'
7
+ require_relative 'services/junit_formatter'
8
+
9
+ module SkillBench
10
# Handles formatting output for different use cases (human, CI, etc.).
# Delegates all presentation logic to focused service objects under
# {SkillBench::Services}.
class OutputFormatter
  # Format the eval result for output.
  #
  # The format may be given as a Symbol or a String; anything other than
  # json or junit produces the human-readable form.
  #
  # @param result [Hash] Eval result with keys like :eval_name, :pass, :score, etc.
  # @param format [Symbol, String] Output format (:human, :json, :junit)
  # @return [String] Formatted output string
  def self.format(result, format: :human)
    case format.to_s
    when 'json'
      Services::JsonFormatter.format(result)
    when 'junit'
      Services::JUnitFormatter.format(result)
    else
      format_human(result)
    end
  end

  # Determine exit code based on eval result.
  #
  # A truthy :pass wins outright; otherwise the result must be successful
  # and carry a report whose verdict is truthy.
  #
  # @param result [Hash] Eval result with :pass or :success/:response keys.
  # @return [Integer] 0 if passed, 1 if failed
  def self.exit_code(result)
    return 0 if result[:pass]
    return 1 unless result[:success]

    report = result.dig(:response, :report)
    report&.verdict ? 0 : 1
  end

  # Format result as human-readable text.
  #
  # @param result [Hash] Eval result in old or new format.
  # @return [String] Human-readable formatted string.
  def self.format_human(result)
    report = result.dig(:response, :report)
    return format_legacy_human(result) unless delta_report?(report)

    format_delta_report(result, report)
  end
  private_class_method :format_human

  # Checks whether a report object is a DeltaReport.
  #
  # @param report [Object] The report to inspect.
  # @return [Boolean] true when the report has DeltaReport attributes.
  def self.delta_report?(report)
    %i[deltas criteria baseline_scores context_scores].all? { |attribute| report.respond_to?(attribute) }
  end
  private_class_method :delta_report?

  # Formats a legacy result hash.
  #
  # @param result [Hash] Legacy eval result.
  # @return [String] Human-readable formatted string.
  def self.format_legacy_human(result)
    status = result[:pass] ? 'PASSED' : 'FAILED'
    lines = [
      '=' * 60,
      "Eval: #{result[:eval_name] || ''}",
      "Skill: #{result[:skill_name] || ''}",
      "Provider: #{result[:provider_name] || ''}",
      "Status: #{status}",
      "Score: #{result[:score]&.round(2) || 'N/A'}"
    ]
    error_msg = result.dig(:response, :error, :message)
    lines << "Error: #{error_msg}" if error_msg
    lines << ('=' * 60)
    lines.join("\n")
  end
  private_class_method :format_legacy_human

  # Formats a DeltaReport as a human-readable report.
  #
  # @param result [Hash] Eval result envelope.
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] Formatted report string.
  def self.format_delta_report(result, report)
    lines = [
      ('═' * 55),
      " Eval: #{result[:eval_name] || ''}",
      " Skill: #{result[:skill_name] || ''}",
      " Provider: #{result[:provider_name] || ''}",
      ('═' * 55),
      ''
    ]

    lines.concat(build_iteration_lines(result))
    lines << Services::DeltaTableFormatter.format(report, result)

    feedback_result = Services::FeedbackGenerator.call(report)
    if feedback_result[:success]
      output = feedback_result.dig(:response, :output)
      # A successful response without any :output simply adds nothing.
      lines << output unless output.nil? || output.empty?
    end

    lines.join("\n")
  end
  private_class_method :format_delta_report

  # Builds iteration timeline lines from the result response.
  #
  # @param result [Hash] Eval result envelope.
  # @return [Array<String>] Lines to append, or empty array.
  def self.build_iteration_lines(result)
    baseline = result.dig(:response, :baseline_iterations) || []
    context = result.dig(:response, :context_iterations) || []
    lines = []

    lines << Services::IterationFormatter.format('BASELINE ITERATIONS', baseline) unless baseline.empty?
    lines << Services::IterationFormatter.format('CONTEXT ITERATIONS', context) unless context.empty?
    # Blank separator only when at least one timeline was printed.
    lines << '' unless lines.empty?

    lines
  end
  private_class_method :build_iteration_lines
end
132
+ end