ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,172 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require_relative 'config/defaults'
5
+ require_relative 'config/store'
6
+ require_relative 'config/applier'
7
+ require_relative 'config/json_loader'
8
+ require_relative 'config/env_overrides'
9
+ require_relative 'config/facade_readers'
10
+ require_relative 'config/facade_writers'
11
+
12
module SkillBench
  # Class-level facade over the SkillBench configuration store.
  # {.reset} layers the sources in this order:
  # Defaults < Home JSON < Local JSON < ENV Variables.
  class Config
    # Basename of the JSON config file looked up in the home directory
    # and in the current working directory.
    CONFIG_FILENAME = 'skill-bench.json'

    class << self
      include Config::FacadeReaders
      include Config::FacadeWriters

      # Memoized store that backs every reader and writer on this facade.
      # A fresh Config::Store is created on first access; {.reset} replaces it.
      #
      # @return [Config::Store] configuration state store
      def store
        @store ||= Config::Store.new
      end

      # Delegates to Config::Defaults.call.
      # NOTE(review): the private loaders below treat that result as a service
      # response (:success / :response => { config: }), not a bare config hash.
      #
      # @return [Hash] result of Config::Defaults.call
      def defaults
        Config::Defaults.call
      end

      # Hands the current store contents (store.to_h) to Config::Applier.
      # NOTE(review): this passes one positional hash, while the private
      # loaders call the applier with store:/data: keywords - confirm that
      # Config::Applier.call accepts both forms.
      #
      # @return [Object] result of Config::Applier.call
      def apply
        snapshot = store.to_h
        Config::Applier.call(snapshot)
      end

      # Delegates to Config::JsonLoader.call for the given file.
      #
      # @param path [String] Path to JSON file
      # @return [Hash] result of Config::JsonLoader.call
      def load_from_file(path)
        Config::JsonLoader.call(path)
      end

      # Delegates to Config::FacadeWriters.save_to_file.
      #
      # @param path [String] Path to JSON file
      # @param config [Hash] Configuration to save
      # @return [void]
      def save_to_file(path, config)
        Config::FacadeWriters.save_to_file(path, config)
      end

      # Delegates to Config::EnvOverrides.call.
      #
      # @return [Hash] result of Config::EnvOverrides.call
      def env_overrides
        Config::EnvOverrides.call
      end

      # Discards the current store and rebuilds it from every source.
      # Pipeline: Defaults -> Home JSON -> Local JSON -> ENV overrides.
      #
      # @return [void]
      def reset
        @store = Config::Store.new
        apply_defaults

        home_file = home_config_path
        apply_json_config(home_file)

        local_file = Pathname.new(Dir.pwd).join(CONFIG_FILENAME)
        apply_json_config(local_file)

        apply_env_overrides
      end

      # Yields the store so the caller can modify it in a block.
      #
      # @yieldparam config [Config::Store] Configuration store for modification
      # @return [void]
      def setup
        yield(store)
      end

      # Allowed commands as held by the store.
      #
      # @return [Array<String>, nil] List of allowed commands
      def allowed_commands
        store.allowed_commands
      end

      # Max execution time from the store; 30 when the store has no value.
      #
      # @return [Integer] Maximum execution time in seconds
      def max_execution_time
        configured = store.max_execution_time
        configured || 30
      end

      # Current LLM provider from the store; :openai when the store has none.
      #
      # @return [Symbol] Current provider name
      def current_llm_provider
        configured = store.current_llm_provider
        configured || :openai
      end

      # Assigns the current LLM provider on the store.
      #
      # @param provider [Symbol] Provider name
      # @return [void]
      def current_llm_provider=(provider)
        store.assign_current_llm_provider(provider)
      end

      # Providers configuration from the store; an empty hash when unset.
      #
      # @return [Hash] Providers configuration
      def llm_providers_config
        configured = store.llm_providers_config
        configured || {}
      end

      # API key as held by the store.
      #
      # @return [String, nil] API key
      def api_key
        store.api_key
      end

      # Model name as held by the store.
      #
      # @return [String, nil] Model name
      def model
        store.model
      end

      private

      # Path of the config file inside the user's home directory.
      #
      # @return [Pathname, nil] nil when resolving the home directory raises ArgumentError
      def home_config_path
        home = Dir.home
        Pathname.new(home).join(CONFIG_FILENAME)
      rescue ArgumentError
        nil
      end

      # Applies the built-in defaults to the store when Config::Defaults succeeds.
      def apply_defaults
        outcome = Config::Defaults.call
        Config::Applier.call(store: store, data: outcome[:response][:config]) if outcome[:success]
      end

      # Applies one JSON config file to the store.
      # Does nothing when the path is nil, the file is absent, or loading fails.
      #
      # @param path [Pathname, String, nil] candidate config file
      def apply_json_config(path)
        return unless path && File.exist?(path)

        outcome = Config::JsonLoader.call(path)
        Config::Applier.call(store: store, data: outcome[:response][:config]) if outcome[:success]
      end

      # Applies environment overrides to the store when Config::EnvOverrides succeeds.
      def apply_env_overrides
        outcome = Config::EnvOverrides.call
        store.apply_provider_config(outcome[:response][:overrides]) if outcome[:success]
      end
    end
  end
end
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module SkillBench
  # Loads, validates, and represents evaluation criteria from criteria.json.
  #
  # Merges eval-specific dimension overrides with built-in default descriptions
  # and validates that dimension weights sum to exactly 100.
  class Criteria
    # Threshold used when the criteria file does not set pass_threshold.
    DEFAULT_PASS_THRESHOLD = 70
    # Delta used when the criteria file does not set minimum_delta.
    DEFAULT_MINIMUM_DELTA = 10

    attr_reader :dimensions, :context, :pass_threshold, :minimum_delta

    # Loads criteria from a JSON file.
    # Failures (missing file, bad JSON, validation errors, unexpected
    # exceptions) are reported through the returned hash rather than raised.
    #
    # @param path [String] Path to the criteria.json file.
    # @return [Hash] Service response with :success and :response keys.
    def self.call(path:)
      new(path: path).call
    end

    # Returns an empty criteria with default thresholds and no dimensions.
    #
    # @return [SkillBench::Criteria] An empty criteria instance.
    def self.empty
      # Reuse the normal assignment path so the defaults live in one place.
      new(path: '').tap { |criteria| criteria.send(:assign_attributes, {}, []) }
    end

    # @param path [String] Path to the criteria.json file.
    def initialize(path:)
      @path = path
    end

    # Loads and validates the criteria file.
    # Attributes are only assigned once every validation has passed.
    #
    # @return [Hash] Service response with criteria or error.
    def call
      raw = load_json
      return raw unless raw[:success]

      data = raw[:response][:data]
      raw_dimensions = data['dimensions'] || data[:dimensions] || []
      dimensions = build_dimensions(raw_dimensions)

      core_validation = validate_core_dimensions(dimensions)
      return core_validation unless core_validation[:success]

      validation = validate_dimensions(dimensions)
      return validation unless validation[:success]

      assign_attributes(data, dimensions)

      { success: true, response: { criteria: self } }
    rescue StandardError => e
      SkillBench::ErrorLogger.log_error(e, 'Criteria Load Error')
      { success: false, response: { error: { message: e.message } } }
    end

    private

    attr_reader :path

    # Reads and parses the criteria file.
    #
    # @return [Hash] Service response carrying the parsed object under :data,
    #   or an error for a missing file, malformed JSON, or a non-object root.
    def load_json
      return missing_file_result unless File.exist?(path)

      data = JSON.parse(File.read(path))
      # Valid JSON may still be an array, scalar or null; the rest of the
      # loader indexes by key, so reject those with an explicit message.
      return invalid_root_result unless data.is_a?(Hash)

      { success: true, response: { data: data } }
    rescue JSON::ParserError => e
      { success: false, response: { error: { message: "Invalid JSON: #{e.message}" } } }
    end

    # @return [Hash] Failure response for a criteria file that does not exist.
    def missing_file_result
      { success: false, response: { error: { message: "Criteria file #{path} does not exist" } } }
    end

    # @return [Hash] Failure response for a criteria file whose root is not a JSON object.
    def invalid_root_result
      { success: false, response: { error: { message: "Criteria file #{path} must contain a JSON object" } } }
    end

    # Builds Dimension objects from the raw entries, falling back to the
    # description of the same-named default dimension (or '') when the entry
    # has none.
    #
    # @param raw_dimensions [Array<Hash>] Dimension entries keyed by string or symbol.
    # @return [Array<Dimension>]
    def build_dimensions(raw_dimensions)
      defaults = DEFAULT_DIMENSIONS.to_h { |d| [d.name, d] }

      raw_dimensions.map do |raw|
        name = raw['name'] || raw[:name]
        default = defaults[name]
        description = raw['description'] || raw[:description] || default&.description || ''

        Dimension.new(
          name: name,
          description: description,
          max_score: raw['max_score'] || raw[:max_score]
        )
      end
    end

    # Checks that every max_score is numeric and that they sum to 100.
    #
    # @param dimensions [Array<Dimension>]
    # @return [Hash] Service response; :success is false with a message on violation.
    def validate_dimensions(dimensions)
      invalid = invalid_dimensions(dimensions)
      return invalid_max_score_result(invalid) unless invalid.empty?

      total = dimensions.sum(&:max_score)
      return { success: true, response: {} } if total == 100

      {
        success: false,
        response: { error: { message: "Dimension max_scores must sum to 100, got #{total}" } }
      }
    end

    # @param dimensions [Array<Dimension>]
    # @return [Array<Dimension>] Dimensions whose max_score is not Numeric.
    def invalid_dimensions(dimensions)
      dimensions.reject { |d| d.max_score.is_a?(Numeric) }
    end

    # @param invalid [Array<Dimension>] Dimensions with a bad max_score.
    # @return [Hash] Failure response naming the offending dimensions.
    def invalid_max_score_result(invalid)
      names = invalid.map(&:name).join(', ')
      {
        success: false,
        response: { error: { message: "Dimensions missing or invalid max_score: #{names}" } }
      }
    end

    # Checks that every name in DEFAULT_DIMENSIONS is present.
    #
    # @param dimensions [Array<Dimension>]
    # @return [Hash] Service response; failure lists the missing names.
    def validate_core_dimensions(dimensions)
      core_names = DEFAULT_DIMENSIONS.map(&:name)
      present_names = dimensions.map(&:name)
      missing = core_names - present_names
      return { success: true, response: {} } if missing.empty?

      {
        success: false,
        response: { error: { message: "missing required core dimensions: #{missing.join(', ')}" } }
      }
    end

    # Copies validated values onto the instance, applying the defaults.
    #
    # @param data [Hash] Parsed criteria object (string or symbol keys).
    # @param dimensions [Array<Dimension>] Validated dimensions.
    def assign_attributes(data, dimensions)
      @context = data['context'] || data[:context] || ''
      @pass_threshold = setting(data, 'pass_threshold', DEFAULT_PASS_THRESHOLD)
      @minimum_delta = setting(data, 'minimum_delta', DEFAULT_MINIMUM_DELTA)
      @dimensions = dimensions
    end

    # Reads a value stored under either the string or the symbol form of key.
    #
    # @param data [Hash]
    # @param key [String]
    # @param default [Object] Returned when neither form is present.
    def setting(data, key, default)
      [data[key], data[key.to_sym]].compact.first || default
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Compares baseline and context judge scores dimension by dimension.
  #
  # The verdict is true only when the context total reaches
  # criteria.pass_threshold AND the total improvement over the baseline
  # reaches criteria.minimum_delta.
  class DeltaReport
    attr_reader :deltas, :baseline_total, :context_total, :verdict, :baseline_scores, :context_scores, :criteria,
                :baseline_dimensions, :context_dimensions

    # Builds a report and computes it in one step.
    #
    # @param baseline [Hash] Baseline judge dimensions hash.
    # @param context [Hash] Context judge dimensions hash.
    # @param criteria [SkillBench::Criteria] The eval criteria with thresholds.
    # @return [Hash] Service response with delta_report or error.
    def self.call(baseline:, context:, criteria:)
      new(baseline: baseline, context: context, criteria: criteria).call
    end

    # @param baseline [Hash] Baseline dimensions, name => entry with a :score / 'score' value.
    # @param context [Hash] Context dimensions, same shape.
    # @param criteria [SkillBench::Criteria] Eval criteria.
    def initialize(baseline:, context:, criteria:)
      @baseline = baseline
      @context = context
      @criteria = criteria
      @deltas = {}
    end

    # Fills in the report. Any StandardError raised along the way is logged
    # and turned into a failure response.
    #
    # @return [Hash] Service response with delta_report or error.
    def call
      if dimensions_match?
        populate
        { success: true, response: { delta_report: self } }
      else
        mismatch_result
      end
    rescue StandardError => e
      SkillBench::ErrorLogger.log_error(e, 'DeltaReport Error')
      { success: false, response: { error: { message: e.message } } }
    end

    private

    attr_reader :baseline, :context

    # Runs every computation step in order: copies, scores, totals, deltas, verdict.
    def populate
      @baseline_dimensions = copy_of(baseline)
      @context_dimensions = copy_of(context)
      @baseline_scores = scores_of(baseline)
      @context_scores = scores_of(context)
      @baseline_total = total_of(baseline)
      @context_total = total_of(context)
      fill_deltas
      @verdict = meets_threshold? && meets_minimum_delta?
    end

    # @return [Boolean] true when both sides name exactly the same dimensions.
    def dimensions_match?
      baseline_names = baseline.keys.sort
      context_names = context.keys.sort
      baseline_names == context_names
    end

    # @return [Hash] Failure response for differing dimension names.
    def mismatch_result
      { success: false, response: { error: { message: 'Baseline and context dimension names mismatch' } } }
    end

    # Records context score minus baseline score for each dimension.
    def fill_deltas
      baseline.each_pair do |name, entry|
        before = score_of(entry)
        after = score_of(context[name])
        @deltas[name] = after - before
      end
    end

    # @param entry [Hash] One dimension entry.
    # @return [Object] Value stored under :score, else under 'score'.
    def score_of(entry)
      entry[:score] || entry['score']
    end

    # @param dimensions [Hash] name => entry
    # @return [Hash] name => score
    def scores_of(dimensions)
      dimensions.transform_values { |entry| score_of(entry) }
    end

    # @param dimensions [Hash] name => entry
    # @return [Numeric] Sum of all scores.
    def total_of(dimensions)
      dimensions.values.sum { |entry| score_of(entry) }
    end

    # @param dimensions [Hash] name => entry
    # @return [Hash] New hash whose values are dup'ed entries.
    def copy_of(dimensions)
      dimensions.transform_values { |entry| entry.dup }
    end

    # @return [Boolean] true when the context total reaches the pass threshold.
    def meets_threshold?
      context_total >= criteria.pass_threshold
    end

    # @return [Boolean] true when the total improvement reaches the minimum delta.
    def meets_minimum_delta?
      improvement = context_total - baseline_total
      improvement >= criteria.minimum_delta
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Value object for one scoring dimension.
  #
  # The judge scores agent output per dimension, e.g. correctness,
  # code quality, or skill adherence.
  class Dimension
    attr_reader :name, :description, :max_score

    # @param name [String] The machine-friendly identifier for the dimension.
    # @param description [String] Human-readable explanation of what the dimension measures.
    # @param max_score [Integer, nil] Maximum score this dimension can contribute. Nil in defaults.
    def initialize(name:, description:, max_score:)
      @name = name
      @description = description
      @max_score = max_score
    end

    # Attribute-wise equality with another Dimension.
    #
    # @param other [Object] The object to compare against.
    # @return [Boolean] true when other is a Dimension and all attributes match.
    def ==(other)
      return false unless other.is_a?(Dimension)

      %i[name description max_score].all? do |attribute|
        public_send(attribute) == other.public_send(attribute)
      end
    end
    alias eql? ==

    # Hash code derived from the same attributes that drive equality, so
    # equal dimensions collide as hash keys.
    #
    # @return [Integer] The hash code.
    def hash
      attributes = [name, description, max_score]
      attributes.hash
    end
  end

  # Canonical dimensions used when eval authors do not override descriptions.
  # Every max_score is nil here; the eval's criteria.json supplies the weights.
  DEFAULT_DIMENSIONS = {
    'correctness' =>
      'Does the output fulfill the task requirements? Are all specified behaviors present and correct?',
    'skill_adherence' =>
      'Did the agent follow the specific patterns, hard gates, and workflows defined in the skill?',
    'code_quality' =>
      'Is the code clean, well-structured, free of smells, follows SRP, and avoids duplication?',
    'test_coverage' =>
      'Are there meaningful tests? Do they test the right things? Are they following TDD/best practices from the skill?',
    'documentation' =>
      'Is there adequate YARD documentation, clear intent, and helpful inline comments where needed?'
  }.map do |dimension_name, text|
    Dimension.new(name: dimension_name, description: text, max_score: nil)
  end.freeze
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Error-reporting helpers shared by service objects.
  # Output goes to Rails.logger when one is available, otherwise to Kernel#warn.
  module ErrorLogger
    # Emits the error message followed by up to five backtrace lines.
    # Nothing is emitted (on either channel) when {#skip_stderr_output?} is truthy.
    #
    # @param error [StandardError] The exception to log
    # @param prefix [String] Optional prefix for the log message
    # @return [void]
    def log_error(error, prefix = nil)
      message = if prefix
                  "#{prefix}: #{error.message}"
                else
                  error.message
                end
      frames = error.backtrace
      # An exception that was never raised has no backtrace at all.
      backtrace = frames ? frames.first(5).join("\n") : '(no backtrace)'

      return if skip_stderr_output?

      rails_logging = defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
      if rails_logging
        Rails.logger.error(message)
        Rails.logger.error(backtrace)
      else
        warn(message)
        warn(backtrace)
      end
    end

    # Decides whether logging should be suppressed: only when Minitest is
    # loaded and $stderr has not been swapped for a StringIO capture.
    #
    # @return [Boolean, nil] truthy when output should be skipped; nil when Minitest is not loaded.
    def skip_stderr_output?
      return unless defined?(Minitest)

      !$stderr.is_a?(StringIO)
    end

    module_function :log_error, :skip_stderr_output?
  end
end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require_relative 'runner'
5
+ require_relative 'services/option_parser_service'
6
+ require_relative 'services/result_printer_service'
7
+ require_relative 'services/output_persistence_service'
8
+
9
module SkillBench
  # Implements the `skill-bench run` CLI command.
  # Orchestrates option parsing, evaluation execution, result printing, and output persistence.
  # @deprecated Use {SkillBench::Cli::RunCommand} and {SkillBench::Services::RunnerService} instead.
  class EvaluateCommand
    # Parses arguments, runs the evaluator, prints the report, and records history.
    #
    # @param argv [Array<String>] Raw CLI arguments.
    # @param stdout [#puts, #write] Output stream for user-visible messages.
    # @return [Integer] Shell-compatible exit code.
    # @raise [ArgumentError] when --eval or --skill resolves outside the working directory.
    def self.call(argv, stdout: $stdout)
      new(argv, stdout: stdout).call
    end

    # @param argv [Array<String>] Raw CLI arguments.
    # @param stdout [#puts, #write] Output stream for user-visible messages.
    def initialize(argv, stdout:)
      @argv = argv
      @stdout = stdout
      @options = nil
    end

    # Executes the command by orchestrating service objects.
    # Returns 1 as soon as parsing, validation, evaluation, or persistence
    # reports a failure; history is recorded only after all of them succeed.
    #
    # @return [Integer] Shell-compatible exit code (0 on success, 1 on failure).
    # @raise [ArgumentError] when --eval or --skill resolves outside the working
    #   directory or is not accessible (raised by the path check, not rescued here).
    def call
      return 1 unless parse_options? && validate_options?

      result = run_evaluation
      return 1 unless result[:success]

      return 1 unless persist_output?(result)

      SkillBench::HistoryRecorder.record(
        result,
        source_path: result[:source_path],
        model: SkillBench::Config.model
      )

      0
    end

    private

    # Parses argv and stores the response in @options.
    # Prints the parser's error message on failure.
    #
    # @return [Boolean] true when parsing succeeded.
    def parse_options?
      options_result = Services::OptionParserService.call(@argv)
      @options = options_result[:response]

      unless options_result[:success]
        @stdout.puts "Error: #{@options[:error][:message]}"
        return false
      end

      true
    end

    # Ensures the --eval option was supplied; prints usage help otherwise.
    #
    # @return [Boolean] true when @options[:eval] is present.
    def validate_options?
      eval_path = @options[:eval]
      return true if eval_path

      @stdout.puts 'Error: The --eval option is required.'
      @stdout.puts 'Example: bin/evaluate -e evals/skills/infrastructure/rails-api-versioning/api-versioning-with-controller-inheritan'
      false
    end

    # Resolves the eval and optional skill paths, runs the evaluation, and
    # prints the result.
    #
    # @return [Hash] Result returned by SkillBench::Runner.call.
    # @raise [ArgumentError] when either path fails the working-directory check.
    def run_evaluation
      skill_option = @options[:skill]
      eval_path = safe_expand_path(@options[:eval])
      skill_path = skill_option ? safe_expand_path(skill_option) : nil

      result = SkillBench::Runner.call(
        eval_folder_path: eval_path,
        skill_path: skill_path
      )
      Services::ResultPrinterService.call(result, stdout: @stdout)
      result
    end

    # Persists the result via the output service and reports the outcome.
    #
    # @param result [Hash] Evaluation result.
    # @return [Boolean] true when persistence succeeded.
    def persist_output?(result)
      output_result = Services::OutputPersistenceService.call(result, output_path: @options[:output])
      output_response = output_result[:response]
      message = output_response[:message]

      if output_result[:success]
        @stdout.puts(message) if message
        true
      else
        @stdout.puts "Error saving report: #{output_response[:error][:message]}"
        false
      end
    end

    # Expands a user-supplied path and rejects it when it lies outside the
    # current working directory. Symlinks are resolved for existing paths
    # before the containment check.
    #
    # @param path [String] Path as given on the command line.
    # @return [String] The expanded (absolute) path.
    # @raise [ArgumentError] when the path escapes the working directory or
    #   cannot be resolved (Errno::ENOENT / Errno::EACCES).
    def safe_expand_path(path)
      expanded = File.expand_path(path)
      base = File.expand_path(Dir.pwd)

      real_expanded = File.exist?(expanded) ? File.realpath(expanded) : expanded
      real_base = File.realpath(base)

      relative = Pathname.new(real_expanded).relative_path_from(Pathname.new(real_base))
      # Compare the first path component, not a string prefix: an in-tree
      # entry named "..hidden" starts with ".." but does not leave the tree.
      if relative.each_filename.first == '..'
        raise ArgumentError, "Path '#{path}' resolves outside the current working directory"
      end

      expanded
    rescue Errno::ENOENT, Errno::EACCES => e
      raise ArgumentError, "Path '#{path}' is not accessible: #{e.message}"
    end
  end
end