ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems/package'
4
+ require_relative 'error_logger'
5
+
6
module SkillBench
  # Verifies that a built gem package includes the files required for release.
  #
  # The list of packaged files is read from the gemspec embedded in the built
  # `.gem` archive and compared against a list of required relative paths.
  class PackageVerifier
    # Error type reserved for package verification problems. Nothing in this
    # class raises it; failures are reported through the result envelope.
    class Error < StandardError; end

    # Files that must be present for a usable evaluator gem package.
    # Paths are relative to the gem root, as listed in the gemspec.
    REQUIRED_FILES = %w[
      README.md
      LICENSE
      bin/skill-bench
      docs/architecture.md
      docs/testing-guide.md
      lib/skill_bench.rb
      lib/skill_bench/config/applier.rb
      lib/skill_bench/config/defaults.rb
      lib/skill_bench/config/env_overrides.rb
      lib/skill_bench/config/facade_readers.rb
      lib/skill_bench/config/facade_writers.rb
      lib/skill_bench/config/json_loader.rb
      lib/skill_bench/config/store.rb
      lib/skill_bench/package_verifier.rb
      lib/skill_bench/execution/source_path_resolver.rb
      lib/skill_bench/runner.rb
    ].freeze

    # Verifies that a gem package includes required release files.
    #
    # @param package_path [String] path to the built `.gem` file
    # @param required_files [Array<String>] files that must be present in the gemspec payload
    # @return [Hash] result envelope with package verification details
    def self.call(package_path:, required_files: REQUIRED_FILES)
      new(package_path: package_path, required_files: required_files).call
    end

    # Initializes the verifier.
    #
    # @param package_path [String] path to the built `.gem` file
    # @param required_files [Array<String>] files that must be present in the gemspec payload
    # @return [PackageVerifier] a verifier instance
    def initialize(package_path:, required_files: REQUIRED_FILES)
      @package_path = package_path
      @required_files = required_files
    end

    # Verifies that the configured package contains all required files.
    #
    # Any StandardError raised while reading the package is logged and
    # converted into a failure envelope instead of propagating.
    #
    # @return [Hash] `{ success: true, response: { missing_files: [], packaged_files: Array } }`
    #   when nothing is missing, otherwise
    #   `{ success: false, response: { error: { message: String } } }`
    def call
      files = packaged_files
      missing = @required_files - files
      return failure("Missing packaged files: #{missing.join(', ')}") if missing.any?

      { success: true, response: { missing_files: [], packaged_files: files } }
    rescue StandardError => e
      SkillBench::ErrorLogger.log_error(e, 'PackageVerifier Error')
      failure(e.message)
    end

    private

    # Reads the file list from the gemspec stored inside the package.
    #
    # @return [Array<String>] relative paths recorded in the packaged gemspec
    def packaged_files
      Gem::Package.new(@package_path).spec.files
    end

    # Builds the failure envelope.
    #
    # @param message [String] human-readable failure description
    # @return [Hash] failure envelope
    def failure(message)
      {
        success: false,
        response: {
          error: { message: message }
        }
      }
    end
  end
end
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'active_support/inflector'
4
+
5
module SkillBench
  module Rails
    # Generates Rails-specific skill templates as Ruby source strings.
    class SkillTemplates
      # Generate a service object template.
      #
      # The generated class lives under `SkillBench::Skills`, so it refers to
      # the framework as `::Rails`; a bare `Rails` would resolve lexically to
      # `SkillBench::Rails` whenever this library is loaded.
      #
      # @param name [String, Symbol] Service name (e.g., 'my_service' or 'my-service')
      # @return [String] Service object Ruby class
      def self.service_object(name)
        class_name = name.to_s.split(/[-_]/).map(&:capitalize).join
        # "\\n" keeps a literal backslash-n in the generated source; a plain
        # "\n" would be expanded by this heredoc and split the generated line.
        <<~RUBY
          # frozen_string_literal: true

          module SkillBench
            module Skills
              class #{class_name}
                # Initialize with required parameters
                # @param args [Hash] Keyword arguments for the service
                def initialize(**args)
                  # Set instance variables from args
                end

                # Execute the service
                # @return [Hash] Result with :success and :response keys
                def call
                  # Implement service logic here
                  { success: true, response: { message: 'Not implemented' } }
                rescue StandardError => e
                  ::Rails.logger.error(e.message)
                  ::Rails.logger.error(e.backtrace.first(5).join("\\n"))
                  { success: false, response: { error: { message: e.message } } }
                end
              end
            end
          end
        RUBY
      end

      # Generate a concern template.
      #
      # Hyphens are treated like underscores so that 'my-concern' and
      # 'my_concern' produce the same module name, matching `service_object`.
      #
      # @param name [String, Symbol] Concern name (e.g., 'my_concern')
      # @return [String] Concern module
      def self.concern(name)
        module_name = name.to_s.tr('-', '_').camelize
        <<~RUBY
          # frozen_string_literal: true

          module #{module_name}
            extend ActiveSupport::Concern

            included do
              # Add class methods, associations, validations here
            end

            class_methods do
              # Add class methods here
            end

            # Add instance methods here
          end
        RUBY
      end

      # Generate an ActiveRecord model template.
      #
      # Hyphens are treated like underscores so that 'my-model' and
      # 'my_model' produce the same class name, matching `service_object`.
      #
      # @param name [String, Symbol] Model name (e.g., 'my_model')
      # @return [String] ActiveRecord model class
      def self.active_record_model(name)
        class_name = name.to_s.tr('-', '_').camelize
        <<~RUBY
          # frozen_string_literal: true

          class #{class_name} < ApplicationRecord
            # Validations
            validates :name, presence: true

            # Associations
            # belongs_to :user
            # has_many :items

            # Scopes
            # scope :active, -> { where(active: true) }

            # Instance methods
            # def some_method
            #   ...
            # end

            # Class methods
            # def self.some_class_method
            #   ...
            # end
          end
        RUBY
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require 'parallel'
5
+ require_relative 'task/evaluator'
6
+ require_relative 'error_logger'
7
+
8
module SkillBench
  # Orchestrates the entire evaluation process.
  # Compares how an AI coding agent performs with and without contextual skills.
  # @deprecated Use {SkillBench::Services::RunnerService} instead.
  class Runner
    # Initiates a full evaluation run.
    #
    # @param params [Hash] The configuration for the evaluation.
    # @option params [String] :eval_folder_path The path to the evaluation directory containing task and criteria.
    # @option params [String] :skill_path Optional override for the source directory being tested.
    # @option params [String, Pathname] :base_path (optional) The base path for relative file resolution.
    # @option params [Hash] :client_params (optional) Parameters to pass to the LLM client.
    # @return [Hash] A result hash with :success and a :response payload. Errors raised during the
    #   run are rescued and reported as `{ success: false, response: { error: { message: } } }`.
    def self.call(params)
      new(params).call
    end

    # @param params [Hash] The configuration for the evaluation (see {.call}).
    def initialize(params)
      @eval_folder_path = params[:eval_folder_path]
      @skill_path = params[:skill_path]
      # Coerce so that a String base path supports Pathname#join as documented.
      @base_path = Pathname.new(params[:base_path] || Dir.pwd)
      @client_params = params[:client_params] || {}
    end

    # Discovers task directories under the evaluation path, evaluates each one,
    # and aggregates the per-task results.
    #
    # @return [Hash] The final evaluation result. `:success` is true only when every task succeeded.
    def call
      return failure('eval_folder_path is required') if @eval_folder_path.nil?

      full_path = @base_path.join(@eval_folder_path)
      return failure("Evaluation path #{full_path} does not exist") unless full_path.exist?

      task_dirs = self.class.discover_task_dirs(full_path)
      return failure("No task.md found in #{full_path} or its subdirectories") if task_dirs.empty?

      results = evaluate_tasks(task_dirs)

      {
        success: results.all? { |task_result| task_result[:success] },
        response: {
          source_path: @skill_path || 'multiple (batch run)',
          tasks: results
        }
      }
    rescue StandardError => e
      SkillBench::ErrorLogger.log_error(e, 'Runner Error')
      failure(e.message)
    end

    # Finds all directories containing a task.md file starting from the root_path.
    #
    # When root_path itself contains task.md it is the only directory returned;
    # otherwise its subdirectories are searched recursively.
    #
    # @param root_path [Pathname, String] The root directory to search.
    # @return [Array<Pathname>] A sorted list of task directory paths.
    def self.discover_task_dirs(root_path)
      root = Pathname.new(root_path)
      return [root] if root.join('task.md').exist?

      # Pathname#glob matches relative to root, so glob metacharacters in the
      # root path itself are not interpreted as part of the pattern.
      root.glob('**/task.md').map(&:parent).uniq.sort
    end

    private

    # Evaluates every task directory on a small thread pool.
    #
    # @param task_dirs [Array<Pathname>] directories that contain a task.md
    # @return [Array<Hash>] one `{ success:, response: }` envelope per task
    def evaluate_tasks(task_dirs)
      Parallel.map(task_dirs, in_threads: 4) do |task_dir|
        task_result = Task::Evaluator.call(
          full_eval_path: task_dir,
          base_path: @base_path,
          skill_path: @skill_path,
          client_params: @client_params
        )
        # Normalize to uniform envelope
        task_result.key?(:success) ? task_result : { success: true, response: task_result }
      end
    end

    # Builds the failure envelope.
    #
    # @param message [String] human-readable failure description
    # @return [Hash] failure envelope
    def failure(message)
      { success: false, response: { error: { message: message } } }
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'formatting_helpers'
4
+
5
module SkillBench
  module Services
    # Renders the scoring section of a DeltaReport as plain text: one row per
    # dimension, a totals row, an optional trend line, and the verdict.
    class DeltaTableFormatter
      extend FormattingHelpers

      COLUMN_HEADER = ' DIMENSION BASELINE CONTEXT DELTA'
      RULE = ' ──────────────────────── ───────── ───────── ───────'
      ROW_TEMPLATE = ' %<label>-24s %<baseline>9s %<context>9s %<delta>7s'
      private_constant :COLUMN_HEADER, :RULE, :ROW_TEMPLATE

      # Builds the scoring section for a report.
      #
      # @param report [SkillBench::DeltaReport] The delta report.
      # @param result [Hash, nil] Eval result envelope; its :trend entry, when present, adds a trend line.
      # @return [String] Table, totals, optional trend, and verdict joined by newlines.
      def self.format(report, result = nil)
        rows = report.deltas.map { |name, delta| format_dimension_row(name, delta, report) }
        output = [COLUMN_HEADER, RULE, *rows, RULE, format_total_row(report), '']

        trend = result[:trend] if result
        output << format_trend(trend) if trend

        output << verdict_line(report)
        output << ('═' * 55)
        output.join("\n")
      end

      # One table row for a single dimension. The label carries the maximum
      # score in parentheses when the dimension is found in the report criteria.
      private_class_method def self.format_dimension_row(name, delta, report)
        dim = report.criteria.dimensions.find { |candidate| candidate.name == name }
        max_score = dim&.max_score || ''
        label = humanize(name)
        label = "#{label} (#{max_score})" if dim

        Kernel.format(
          ROW_TEMPLATE,
          label: label,
          baseline: report.baseline_scores[name],
          context: report.context_scores[name],
          delta: delta_str(delta)
        )
      end

      # The TOTAL row: both totals shown out of 100 plus the summed delta.
      private_class_method def self.format_total_row(report)
        baseline_cell = "#{report.baseline_total}/100"
        context_cell = "#{report.context_total}/100"
        summed_delta = report.deltas.values.sum

        Kernel.format(
          ROW_TEMPLATE,
          label: 'TOTAL',
          baseline: baseline_cell,
          context: context_cell,
          delta: delta_str(summed_delta)
        )
      end

      # The trend line, or nil when no trend data was supplied.
      private_class_method def self.format_trend(trend)
        return nil unless trend

        baseline_part = "#{trend_icon(trend[:baseline_trend])} (#{delta_str(trend[:baseline_delta])})"
        context_part = "#{trend_icon(trend[:context_trend])} (#{delta_str(trend[:context_delta])})"
        " TREND: baseline #{baseline_part}, context #{context_part}"
      end

      # The PASS/FAIL line together with the thresholds read from the report criteria.
      private_class_method def self.verdict_line(report)
        status = report.verdict ? 'PASS' : 'FAIL'
        criteria = report.criteria
        " VERDICT: #{status} (threshold: #{criteria.pass_threshold}, minimum delta: #{criteria.minimum_delta})"
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'formatting_helpers'
4
+
5
module SkillBench
  module Services
    # Turns per-dimension judge scores into three text sections: what went
    # well, what went wrong, and advice drawn from the dimension descriptions.
    class FeedbackGenerator
      extend FormattingHelpers

      # Generates feedback sections from a DeltaReport.
      #
      # @param report [SkillBench::DeltaReport] The delta report.
      # @return [Hash] `{ success: true, response: { output: String } }`; the output is
      #   empty when the report offers no dimension scored in both runs.
      def self.call(report)
        { success: true, response: { output: generate_feedback(report) } }
      end

      # Produces the feedback text, or '' when no feedback can be derived.
      private_class_method def self.generate_feedback(report)
        return '' unless feedback_applicable?(report)

        well, wrong, advice = categorize_dimensions(
          report.context_dimensions || {},
          report.baseline_dimensions || {},
          report
        )
        assemble_feedback_lines(well, wrong, advice)
      end

      # True when the report exposes both dimension readers and at least one
      # context dimension has a baseline counterpart.
      private_class_method def self.feedback_applicable?(report)
        readers = %i[baseline_dimensions context_dimensions]
        return false unless readers.all? { |reader| report.respond_to?(reader) }

        context = report.context_dimensions || {}
        baseline = report.baseline_dimensions || {}
        context.any? { |name, dim| baseline[name] && dim }
      end

      # Walks the context dimensions and collects the lines for each section.
      # Dimensions without both a context and a baseline entry are skipped.
      #
      # @return [Array(Array<String>, Array<String>, Array<String>)] well, wrong, advice
      private_class_method def self.categorize_dimensions(context_dims, baseline_dims, report)
        buckets = { well: [], wrong: [], advice: [] }

        context_dims.each do |name, dim|
          baseline_dim = baseline_dims[name]
          next unless baseline_dim && dim

          categorize_dimension(name, dim, baseline_dim, report).each do |section, items|
            buckets[section].concat(items)
          end
        end

        buckets.values_at(:well, :wrong, :advice)
      end

      # Builds the section lines for a single dimension.
      private_class_method def self.categorize_dimension(name, dim, baseline_dim, report)
        score, max_score, baseline_score, reasoning =
          extract_values(dim, baseline_dim).values_at(:score, :max_score, :baseline_score, :reasoning)

        pct = compute_percentage(score, max_score)
        dim_obj = report.criteria.dimensions.find { |candidate| candidate.name == name }
        humanized = humanize(name)
        label = "#{humanized} (#{score}/#{max_score}, baseline: #{baseline_score}/#{max_score})"

        build_categorization(pct, label, reasoning, humanized, dim_obj)
      end

      # Reads score data from dimension hashes, accepting symbol or string
      # keys and falling back to neutral defaults.
      private_class_method def self.extract_values(dim, baseline_dim)
        lookup = ->(source, key) { source[key] || source[key.to_s] }

        {
          score: lookup.call(dim, :score) || 0,
          max_score: lookup.call(dim, :max_score) || 1,
          reasoning: lookup.call(dim, :reasoning) || '',
          baseline_score: lookup.call(baseline_dim, :score) || 0
        }
      end

      # Score as a rounded percentage of the maximum; 0 when the maximum is not positive.
      private_class_method def self.compute_percentage(score, max_score)
        return 0 unless max_score.positive?

        (score.to_f / max_score * 100).round
      end

      # Files a dimension under "well" at 80% or above, otherwise under
      # "wrong" and, when the dimension has a description, under "advice".
      private_class_method def self.build_categorization(pct, label, reasoning, humanized, dim_obj)
        entries = [" #{label}"]
        entries << " #{reasoning}" unless reasoning.empty?
        return { well: entries, wrong: [], advice: [] } if pct >= 80

        dim_advice = dim_obj&.description.to_s
        advice = dim_advice.empty? ? [] : [" #{humanized}: #{dim_advice}"]
        { well: [], wrong: entries, advice: advice }
      end

      # Joins the non-empty sections, each preceded by a blank line and a title line.
      private_class_method def self.assemble_feedback_lines(well, wrong, advice)
        sections = { 'WHAT WENT WELL' => well, 'WHAT WENT WRONG' => wrong, 'ADVICE' => advice }

        sections
          .reject { |_title, items| items.empty? }
          .flat_map { |title, items| ['', " === #{title} ===", *items] }
          .join("\n")
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Services
    # Small string helpers shared by the output formatters. Every helper is
    # callable on the module itself and also usable as a mixin.
    module FormattingHelpers
      module_function

      # Turns an underscore-separated name into capitalized, space-separated words.
      #
      # @param name [String, Symbol] The dimension name.
      # @return [String] Human-readable name.
      def humanize(name)
        words = name.to_s.split('_')
        words.map { |word| word.capitalize }.join(' ')
      end

      # Renders a delta with an explicit sign; zero and positive values get a leading '+'.
      #
      # @param delta [Numeric] The delta value.
      # @return [String] Formatted delta string.
      def delta_str(delta)
        if delta >= 0
          "+#{delta}"
        else
          delta.to_s
        end
      end

      # Cuts text down to its first max_length characters and appends '...'.
      # Text that already fits is returned unchanged.
      #
      # @param text [String] The text to truncate.
      # @param max_length [Integer] Number of characters kept before the ellipsis.
      # @return [String] Truncated text.
      def truncate(text, max_length)
        fits = text.length <= max_length
        fits ? text : "#{text.slice(0...max_length)}..."
      end

      # Maps a trend direction to its arrow; unknown directions map to '?'.
      #
      # @param direction [Symbol] :improved, :regressed, or :unchanged.
      # @return [String] Arrow icon.
      def trend_icon(direction)
        case direction
        when :improved then '↑'
        when :regressed then '↓'
        when :unchanged then '→'
        else '?'
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'formatting_helpers'
4
+
5
module SkillBench
  module Services
    # Renders the steps of a ReAct loop as a titled, one-line-per-step timeline.
    class IterationFormatter
      extend FormattingHelpers

      # Builds the timeline section.
      #
      # @param title [String] Section title.
      # @param iterations [Array<Hash>] Iteration metadata with keys :step_number,
      #   :thought, :tools_used, :observation_summary.
      # @return [String] Title line followed by one line per iteration.
      def self.format(title, iterations)
        steps = iterations.map { |iter| format_step(iter) }
        [" === #{title} ===", *steps].join("\n")
      end

      # One timeline line: step number and thought, then the tools used and a
      # 60-character observation excerpt when those are present.
      private_class_method def self.format_step(iter)
        line = " Step #{iter[:step_number]}: #{iter[:thought]}"

        tools = iter[:tools_used] || []
        line += " → Tool: #{tools.join(', ')}" unless tools.empty?

        observation = iter[:observation_summary].to_s
        line += " → Observation: #{truncate(observation, 60)}" unless observation.empty?

        line
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module SkillBench
  module Services
    # Serializes evaluation results to pretty-printed JSON.
    class JsonFormatter
      class << self
        # Serializes a result with JSON.pretty_generate.
        #
        # @param result [Hash] Eval result.
        # @return [String] JSON-formatted string.
        def format(result)
          JSON.pretty_generate(result)
        end
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module SkillBench
  module Services
    # Service object for parsing judge score responses from evaluation results.
    # Handles JSON strings with optional code blocks, Hash inputs, and provides
    # standardized error handling for malformed data.
    # @deprecated Scoring is now handled internally by {SkillBench::Judge} and {SkillBench::DeltaReport}.
    class JudgeScoreParserService
      PARSE_ERROR = 'Failed to parse judge score'

      # Matches an opening markdown fence (bare or tagged `json`, any letter
      # case) at the start of the text, or a closing fence at the end.
      CODE_FENCE = /\A```(?:json)?\s*|\s*```\z/i.freeze
      private_constant :CODE_FENCE

      # Parses a judge score response into a standardized format.
      #
      # Invalid JSON never raises; it is reported through the failure envelope.
      #
      # @param judge_score [String, Hash, nil] Raw judge score response. Can be:
      #   - A JSON string (with or without markdown code blocks)
      #   - A Hash (with string or symbol keys)
      #   - nil (which will result in an error response)
      # @return [Hash] Standardized response hash with format:
      #   - { success: true, response: Hash } on success
      #   - { success: false, response: { error: { message: String } } } on failure
      def self.call(judge_score)
        new(judge_score).call
      end

      # @param judge_score [String, Hash, nil] Raw judge score response
      def initialize(judge_score)
        @judge_score = judge_score
      end

      # Dispatches on the input type. Strings are parsed as JSON, Hashes get
      # their top-level keys stringified, and anything else is an error.
      #
      # @return [Hash] { success: Boolean, response: Hash }
      def call
        case @judge_score
        when String
          parsed = parse_string_input
          parsed ? { success: true, response: parsed } : error_response
        when Hash
          { success: true, response: @judge_score.transform_keys(&:to_s) }
        else
          error_response
        end
      end

      private

      # @return [Hash] the standard failure envelope
      def error_response
        { success: false, response: { error: { message: PARSE_ERROR } } }
      end

      # Strips surrounding whitespace and markdown fences, then parses the rest as JSON.
      #
      # @return [Hash, nil] Parsed JSON hash or nil if parsing fails or not a Hash
      def parse_string_input
        cleaned_score = @judge_score.strip.gsub(CODE_FENCE, '').strip

        parsed = JSON.parse(cleaned_score)
        parsed.is_a?(Hash) ? parsed : nil
      rescue JSON::ParserError
        nil
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
module SkillBench
  module Services
    # Renders an evaluation result as a single-testcase JUnit XML document.
    class JUnitFormatter
      # Format result as JUnit XML.
      #
      # The verdict and score come from the report under `result[:response][:report]`
      # when it responds to `verdict` / `context_total`; otherwise the legacy
      # `result[:pass]` and `result[:score]` entries are used.
      #
      # @param result [Hash] Eval result.
      # @return [String] JUnit XML-formatted string.
      def self.format(result)
        report = result.dig(:response, :report)
        passed = report.respond_to?(:verdict) ? report.verdict : result[:pass]
        name_attr = CGI.escapeHTML(result[:eval_name].to_s)
        return passing_suite(name_attr) if passed

        raw_score = report.respond_to?(:context_total) ? report.context_total : result[:score]
        failing_suite(name_attr, CGI.escapeHTML(raw_score.to_s))
      end

      # Suite with one passing test case.
      private_class_method def self.passing_suite(name_attr)
        <<~XML
          <?xml version="1.0"?>
          <testsuite name="SkillBench" tests="1" failures="0">
          <testcase name="#{name_attr}" classname="SkillBench"/>
          </testsuite>
        XML
      end

      # Suite with one failing test case whose failure message carries the score.
      private_class_method def self.failing_suite(name_attr, score_attr)
        <<~XML
          <?xml version="1.0"?>
          <testsuite name="SkillBench" tests="1" failures="1">
          <testcase name="#{name_attr}" classname="SkillBench">
          <failure message="Score: #{score_attr}">Eval failed</failure>
          </testcase>
          </testsuite>
        XML
      end
    end
  end
end