ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'rubygems/package'
|
|
4
|
+
require_relative 'error_logger'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
# Verifies that a built gem package includes the files required for release.
|
|
8
|
+
class PackageVerifier
  # Error type for callers that want to raise on a failed verification.
  # `#call` itself never raises it; failures are reported through the result envelope.
  class Error < StandardError; end

  # Files that must be present for a usable evaluator gem package.
  # Paths are relative to the gem root, as they appear in the gemspec `files` list.
  REQUIRED_FILES = %w[
    README.md
    LICENSE
    bin/skill-bench
    docs/architecture.md
    docs/testing-guide.md
    lib/skill_bench.rb
    lib/skill_bench/config/applier.rb
    lib/skill_bench/config/defaults.rb
    lib/skill_bench/config/env_overrides.rb
    lib/skill_bench/config/facade_readers.rb
    lib/skill_bench/config/facade_writers.rb
    lib/skill_bench/config/json_loader.rb
    lib/skill_bench/config/store.rb
    lib/skill_bench/package_verifier.rb
    lib/skill_bench/execution/source_path_resolver.rb
    lib/skill_bench/runner.rb
  ].freeze

  # Verifies that a gem package includes required release files.
  #
  # @param package_path [String] path to the built `.gem` file
  # @param required_files [Array<String>] files that must be present in the gemspec payload
  # @return [Hash] result envelope with package verification details
  def self.call(package_path:, required_files: REQUIRED_FILES)
    new(package_path: package_path, required_files: required_files).call
  end

  # Initializes the verifier.
  #
  # @param package_path [String] path to the built `.gem` file
  # @param required_files [Array<String>] files that must be present in the gemspec payload
  # @return [PackageVerifier] a verifier instance
  def initialize(package_path:, required_files: REQUIRED_FILES)
    @package_path = package_path
    @required_files = required_files
  end

  # Verifies that the configured package contains all required files.
  #
  # Any StandardError raised while reading the package (for example an
  # unreadable path) is logged and converted into a failure envelope.
  #
  # @return [Hash] `{ success: true, response: { missing_files: [], packaged_files: Array } }`
  #   when nothing is missing, otherwise
  #   `{ success: false, response: { error: { message: String } } }`
  def call
    files = packaged_files
    missing = @required_files - files
    return failure("Missing packaged files: #{missing.join(', ')}") unless missing.empty?

    { success: true, response: { missing_files: [], packaged_files: files } }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'PackageVerifier Error')
    failure(e.message)
  end

  private

  # @return [Array<String>] file list recorded in the packaged gemspec
  def packaged_files
    Gem::Package.new(@package_path).spec.files
  end

  # Builds the failure envelope shared by every error path.
  #
  # @param message [String] human-readable failure reason
  # @return [Hash] failure envelope
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'active_support/inflector'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Rails
|
|
7
|
+
# Generates Rails-specific skill templates
|
|
8
|
+
class SkillTemplates
  # Generate a service object template.
  #
  # The generated class lives under `SkillBench::Skills`, so it refers to the
  # framework as `::Rails`; a bare `Rails` would resolve to `SkillBench::Rails`
  # whenever this gem is loaded.
  #
  # @param name [String] Service name (e.g., 'my_service' or 'my-service')
  # @return [String] Service object Ruby class
  def self.service_object(name)
    class_name = name.to_s.split(/[-_]/).map(&:capitalize).join
    # `\\n` keeps the escape sequence literal in the generated source; a plain
    # `\n` would be expanded by this heredoc and split the string across lines.
    <<~RUBY
      # frozen_string_literal: true

      module SkillBench
        module Skills
          class #{class_name}
            # Initialize with required parameters
            # @param args [Hash] Keyword arguments for the service
            def initialize(**args)
              # Set instance variables from args
            end

            # Execute the service
            # @return [Hash] Result with :success and :response keys
            def call
              # Implement service logic here
              { success: true, response: { message: 'Not implemented' } }
            rescue StandardError => e
              ::Rails.logger.error(e.message)
              ::Rails.logger.error(e.backtrace.first(5).join("\\n"))
              { success: false, response: { error: { message: e.message } } }
            end
          end
        end
      end
    RUBY
  end

  # Generate a concern template.
  #
  # @param name [String] Concern name (e.g., 'my_concern' or 'my-concern')
  # @return [String] Concern module
  def self.concern(name)
    module_name = constant_name(name)
    <<~RUBY
      # frozen_string_literal: true

      module #{module_name}
        extend ActiveSupport::Concern

        included do
          # Add class methods, associations, validations here
        end

        class_methods do
          # Add class methods here
        end

        # Add instance methods here
      end
    RUBY
  end

  # Generate an ActiveRecord model template.
  #
  # @param name [String] Model name (e.g., 'my_model' or 'my-model')
  # @return [String] ActiveRecord model class
  def self.active_record_model(name)
    class_name = constant_name(name)
    <<~RUBY
      # frozen_string_literal: true

      class #{class_name} < ApplicationRecord
        # Validations
        validates :name, presence: true

        # Associations
        # belongs_to :user
        # has_many :items

        # Scopes
        # scope :active, -> { where(active: true) }

        # Instance methods
        # def some_method
        #   ...
        # end

        # Class methods
        # def self.some_class_method
        #   ...
        # end
      end
    RUBY
  end

  # Converts a snake_case or kebab-case name into a constant name using the
  # ActiveSupport inflector. Hyphens are normalised first because `camelize`
  # leaves them in place, which would not be a valid constant.
  #
  # @param name [String] raw template name
  # @return [String] camelized constant name
  private_class_method def self.constant_name(name)
    name.to_s.tr('-', '_').camelize
  end
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require 'parallel'
|
|
5
|
+
require_relative 'task/evaluator'
|
|
6
|
+
require_relative 'error_logger'
|
|
7
|
+
|
|
8
|
+
module SkillBench
|
|
9
|
+
# Orchestrates the entire evaluation process.
|
|
10
|
+
# Compares how an AI coding agent performs with and without contextual skills.
|
|
11
|
+
# @deprecated Use {SkillBench::Services::RunnerService} instead.
|
|
12
|
+
class Runner
  # Number of worker threads used to evaluate task directories concurrently.
  THREAD_COUNT = 4

  # Initiates a full evaluation run.
  #
  # @param params [Hash] The configuration for the evaluation.
  # @option params [String] :eval_folder_path The path to the evaluation directory containing task and criteria.
  # @option params [String] :skill_path Optional override for the source directory being tested.
  # @option params [String, Pathname] :base_path (optional) The base path for relative file resolution.
  # @option params [Hash] :client_params (optional) Parameters to pass to the LLM client.
  # @return [Hash] A result hash with :success and :response payload containing the judge scores and diffs.
  def self.call(params)
    new(params).call
  end

  # @param params [Hash] The configuration for the evaluation (see {.call}).
  def initialize(params)
    @eval_folder_path = params[:eval_folder_path]
    @skill_path = params[:skill_path]
    @base_path = params[:base_path] || Dir.pwd
    @client_params = params[:client_params] || {}
  end

  # Evaluates every discovered task directory and aggregates the results.
  #
  # Any StandardError raised along the way is logged and returned as a
  # failure envelope instead of propagating.
  #
  # @return [Hash] `{ success: Boolean, response: { source_path:, tasks: } }`, or a
  #   failure envelope `{ success: false, response: { error: { message: } } }`.
  def call
    full_path = base_path.join(@eval_folder_path)
    return failure("Evaluation path #{full_path} does not exist") unless full_path.exist?

    task_dirs = self.class.discover_task_dirs(full_path)
    return failure("No task.md found in #{full_path} or its subdirectories") if task_dirs.empty?

    results = Parallel.map(task_dirs, in_threads: THREAD_COUNT) { |task_dir| evaluate_task(task_dir) }

    {
      success: results.all? { |task_result| task_result[:success] },
      response: {
        source_path: @skill_path || 'multiple (batch run)',
        tasks: results
      }
    }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'Runner Error')
    failure(e.message)
  end

  # Finds all directories containing a task.md file starting from the root_path.
  #
  # When root_path itself holds a task.md it is the only result; otherwise the
  # tree below it is searched.
  #
  # @param root_path [Pathname] The root directory to search.
  # @return [Array<Pathname>] A list of task directory paths.
  def self.discover_task_dirs(root_path)
    root = Pathname(root_path)
    return [root] if root.join('task.md').exist?

    # Pathname#glob treats the root as a literal base directory, so glob
    # metacharacters in the path itself (e.g. "[" or "{") are not interpreted.
    root.glob('**/task.md').map(&:parent).uniq.sort
  end

  private

  # Base path as a Pathname. Converted lazily (inside `call`'s rescue scope)
  # so a String `:base_path`, which the documented contract allows, works.
  #
  # @return [Pathname]
  def base_path
    @base_path = Pathname(@base_path)
  end

  # Runs one task directory and normalizes its result to the uniform envelope.
  #
  # @param task_dir [Pathname] directory containing a task.md
  # @return [Hash] envelope with a :success key
  def evaluate_task(task_dir)
    task_result = Task::Evaluator.call(
      full_eval_path: task_dir,
      base_path: base_path,
      skill_path: @skill_path,
      client_params: @client_params
    )
    task_result.key?(:success) ? task_result : { success: true, response: task_result }
  end

  # @param message [String] human-readable failure reason
  # @return [Hash] failure envelope
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end
end
|
|
89
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'formatting_helpers'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Formats the dimension scoring table, totals, trend, and verdict for a DeltaReport.
|
|
8
|
+
class DeltaTableFormatter
  extend FormattingHelpers

  # Renders the scoring section of a delta report as plain text: a header,
  # one row per dimension delta, a totals row, an optional trend line and
  # the final verdict line.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @param result [Hash, nil] Eval result envelope; only its :trend entry is read.
  # @return [String] Newline-joined table, totals, trend, and verdict.
  def self.format(report, result = nil)
    output = []
    output << ' DIMENSION BASELINE CONTEXT DELTA'
    output << ' ──────────────────────── ───────── ───────── ───────'
    output.concat(report.deltas.map { |name, delta| format_dimension_row(name, delta, report) })
    output << ' ──────────────────────── ───────── ───────── ───────'
    output << format_total_row(report)
    output << ''

    trend = result ? result[:trend] : nil
    output << format_trend(trend) if trend

    verdict_label = report.verdict ? 'PASS' : 'FAIL'
    criteria = report.criteria
    pass_mark = criteria.pass_threshold
    required_delta = criteria.minimum_delta
    output << " VERDICT: #{verdict_label} (threshold: #{pass_mark}, minimum delta: #{required_delta})"
    output << ('═' * 55)

    output.join("\n")
  end

  # Builds the row for a single dimension. The label carries the dimension's
  # max score when the dimension is declared in the report's criteria.
  private_class_method def self.format_dimension_row(name, delta, report)
    dimension = report.criteria.dimensions.find { |candidate| candidate.name == name }
    cap = dimension&.max_score || ''
    title = humanize(name)
    title = "#{title} (#{cap})" if dimension

    Kernel.format(
      ' %<label>-24s %<baseline>9s %<context>9s %<delta>7s',
      label: title,
      baseline: report.baseline_scores[name],
      context: report.context_scores[name],
      delta: delta_str(delta)
    )
  end

  # Builds the TOTAL row; the delta column is the sum of all dimension deltas.
  private_class_method def self.format_total_row(report)
    Kernel.format(
      ' %<label>-24s %<baseline>9s %<context>9s %<delta>7s',
      label: 'TOTAL',
      baseline: "#{report.baseline_total}/100",
      context: "#{report.context_total}/100",
      delta: delta_str(report.deltas.values.sum)
    )
  end

  # Builds the TREND line, or returns nil when no trend data is given.
  private_class_method def self.format_trend(trend)
    return nil unless trend

    baseline_part = "baseline #{trend_icon(trend[:baseline_trend])}"
    context_part = "context #{trend_icon(trend[:context_trend])}"
    " TREND: #{baseline_part} (#{delta_str(trend[:baseline_delta])}), #{context_part} (#{delta_str(trend[:context_delta])})"
  end
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'formatting_helpers'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Categorizes dimension scores into "what went well", "what went wrong",
|
|
8
|
+
# and actionable advice based on judge reasoning.
|
|
9
|
+
class FeedbackGenerator
  extend FormattingHelpers

  # Builds the textual feedback sections for a DeltaReport.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [Hash] Standardized response hash:
  #   - { success: true, response: { output: String } }
  def self.call(report)
    { success: true, response: { output: generate_feedback(report) } }
  end

  # Returns the assembled feedback text, or '' when the report has no
  # dimension present in both the baseline and the context run.
  private_class_method def self.generate_feedback(report)
    return '' unless feedback_applicable?(report)

    buckets = categorize_dimensions(report.context_dimensions || {}, report.baseline_dimensions || {}, report)
    assemble_feedback_lines(*buckets)
  end

  # True when the report exposes both dimension readers and at least one
  # context dimension has a baseline counterpart.
  private_class_method def self.feedback_applicable?(report)
    readers_present = %i[baseline_dimensions context_dimensions].all? { |reader| report.respond_to?(reader) }
    return false unless readers_present

    context = report.context_dimensions || {}
    baseline = report.baseline_dimensions || {}
    context.any? { |name, dim| baseline[name] && dim }
  end

  # Walks the context dimensions and collects the three feedback buckets.
  #
  # @return [Array<Array<String>>] [well, wrong, advice]
  private_class_method def self.categorize_dimensions(context_dims, baseline_dims, report)
    buckets = { well: [], wrong: [], advice: [] }

    context_dims.each do |name, dim|
      counterpart = baseline_dims[name]
      next if !counterpart || !dim

      verdict = categorize_dimension(name, dim, counterpart, report)
      %i[well wrong advice].each { |bucket| buckets[bucket].concat(verdict[bucket]) }
    end

    buckets.values_at(:well, :wrong, :advice)
  end

  # Categorizes a single dimension from its context and baseline entries.
  private_class_method def self.categorize_dimension(name, dim, baseline_dim, report)
    score, max_score, baseline_score, reasoning =
      extract_values(dim, baseline_dim).values_at(:score, :max_score, :baseline_score, :reasoning)

    pct = compute_percentage(score, max_score)
    dim_obj = report.criteria.dimensions.find { |candidate| candidate.name == name }
    humanized = humanize(name)
    label = "#{humanized} (#{score}/#{max_score}, baseline: #{baseline_score}/#{max_score})"

    build_categorization(pct, label, reasoning, humanized, dim_obj)
  end

  # Reads score data from entries keyed by symbol or string, with defaults
  # for anything missing.
  private_class_method def self.extract_values(dim, baseline_dim)
    pick = ->(entry, key, fallback) { entry[key] || entry[key.to_s] || fallback }

    {
      score: pick.call(dim, :score, 0),
      max_score: pick.call(dim, :max_score, 1),
      reasoning: pick.call(dim, :reasoning, ''),
      baseline_score: pick.call(baseline_dim, :score, 0)
    }
  end

  # Rounded percentage of score over max_score; 0 when max_score is not positive.
  private_class_method def self.compute_percentage(score, max_score)
    return 0 unless max_score.positive?

    (score.to_f / max_score * 100).round
  end

  # Places the dimension under "well" at 80% or above, otherwise under
  # "wrong", adding advice from the dimension description when one exists.
  private_class_method def self.build_categorization(pct, label, reasoning, humanized, dim_obj)
    entries = [" #{label}"]
    entries << " #{reasoning}" unless reasoning.empty?

    return { well: entries, wrong: [], advice: [] } if pct >= 80

    hint = dim_obj&.description.to_s
    advice = hint.empty? ? [] : [" #{humanized}: #{hint}"]
    { well: [], wrong: entries, advice: advice }
  end

  # Joins the non-empty sections into the final feedback text.
  private_class_method def self.assemble_feedback_lines(well, wrong, advice)
    sections = [['WHAT WENT WELL', well], ['WHAT WENT WRONG', wrong], ['ADVICE', advice]]
    sections.each_with_object([]) { |(title, items), lines| append_section(lines, title, items) }.join("\n")
  end

  # Appends a blank line, a titled header and the items; no-op for empty items.
  private_class_method def self.append_section(lines, title, items)
    return if items.empty?

    lines.push('', " === #{title} ===", *items)
  end
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Shared string-formatting utilities used across output formatters.
|
|
6
|
+
module FormattingHelpers
  # Arrow icons keyed by trend direction; built once instead of on every call.
  TREND_ICONS = { improved: '↑', regressed: '↓', unchanged: '→' }.freeze

  module_function

  # Converts a snake_case name to Title Case.
  #
  # @param name [String, Symbol] The dimension name.
  # @return [String] Human-readable name.
  def humanize(name)
    name.to_s.split('_').map(&:capitalize).join(' ')
  end

  # Formats a numeric delta with a +/- sign. Zero is rendered as "+0".
  #
  # @param delta [Numeric] The delta value.
  # @return [String] Formatted delta string.
  def delta_str(delta)
    delta >= 0 ? "+#{delta}" : delta.to_s
  end

  # Truncates a string to its first max_length characters and appends an
  # ellipsis. Text that already fits is returned unchanged; truncated text
  # is therefore max_length + 3 characters long.
  #
  # @param text [String] The text to truncate.
  # @param max_length [Integer] Number of characters kept before the ellipsis.
  # @return [String] Truncated text.
  def truncate(text, max_length)
    return text if text.length <= max_length

    "#{text[0...max_length]}..."
  end

  # Returns the Unicode arrow icon for a trend direction.
  #
  # @param direction [Symbol, String] :improved, :regressed, or :unchanged
  #   (the String forms are accepted too).
  # @return [String] Arrow icon, or '?' for any other value.
  def trend_icon(direction)
    key = direction.respond_to?(:to_sym) ? direction.to_sym : direction
    TREND_ICONS.fetch(key, '?')
  end
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'formatting_helpers'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Formats ReAct loop iteration timelines for human-readable output.
|
|
8
|
+
class IterationFormatter
  extend FormattingHelpers

  # Renders an iteration timeline: a title header followed by one line per step.
  #
  # @param title [String] Section title.
  # @param iterations [Array<Hash>] Iteration metadata with keys :step_number,
  #   :thought, :tools_used, :observation_summary.
  # @return [String] Formatted section.
  def self.format(title, iterations)
    step_lines = iterations.map do |step|
      used = step[:tools_used] || []
      summary = step[:observation_summary].to_s

      # Tool and observation suffixes are only added when there is something to show.
      tool_part = used.empty? ? '' : " → Tool: #{used.join(', ')}"
      observation_part = summary.empty? ? '' : " → Observation: #{truncate(summary, 60)}"

      " Step #{step[:step_number]}: #{step[:thought]}#{tool_part}#{observation_part}"
    end

    [" === #{title} ===", *step_lines].join("\n")
  end
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Formats evaluation results as JSON.
|
|
8
|
+
class JsonFormatter
  # Serializes an evaluation result as pretty-printed JSON.
  #
  # @param result [Hash] Eval result.
  # @return [String] JSON-formatted string.
  def self.format(result) = JSON.pretty_generate(result)
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Service object for parsing judge score responses from evaluation results.
|
|
8
|
+
# Handles JSON strings with optional code blocks, Hash inputs, and provides
|
|
9
|
+
# standardized error handling for malformed data.
|
|
10
|
+
# @deprecated Scoring is now handled internally by {SkillBench::Judge} and {SkillBench::DeltaReport}.
|
|
11
|
+
class JudgeScoreParserService
  PARSE_ERROR = 'Failed to parse judge score'

  # Matches an opening Markdown fence (with or without a `json` language tag,
  # any letter case) at the start of the text, or a closing fence at the end.
  CODE_FENCE = /\A```(?:json)?\s*|\s*```\z/i

  # Parses a judge score response into a standardized format.
  #
  # Invalid JSON never raises to the caller; it produces the error envelope.
  #
  # @param judge_score [String, Hash, nil] Raw judge score response. Can be:
  #   - A JSON string (with or without markdown code blocks)
  #   - A Hash (with string or symbol keys)
  #   - nil (which will result in an error response)
  # @return [Hash] Standardized response hash with format:
  #   - { success: true, response: Hash } on success
  #   - { success: false, response: { error: { message: String } } } on failure
  def self.call(judge_score)
    new(judge_score).call
  end

  # @param judge_score [String, Hash, nil] Raw judge score response
  def initialize(judge_score)
    @judge_score = judge_score
  end

  # Dispatches on the input type. Hash input has its top-level keys converted
  # to strings (nested hashes are left untouched); any input that is neither
  # a String nor a Hash yields the error envelope.
  #
  # @return [Hash] { success: Boolean, response: Hash }
  def call
    case @judge_score
    when String
      parsed = parse_string_input
      parsed ? { success: true, response: parsed } : error_response
    when Hash
      { success: true, response: @judge_score.transform_keys(&:to_s) }
    else
      error_response
    end
  end

  private

  # @return [Hash] the shared failure envelope
  def error_response
    { success: false, response: { error: { message: PARSE_ERROR } } }
  end

  # Strips surrounding whitespace and Markdown code fences, then parses JSON.
  #
  # @return [Hash, nil] Parsed JSON hash or nil if parsing fails or not a Hash
  def parse_string_input
    cleaned_score = @judge_score.strip.gsub(CODE_FENCE, '').strip

    parsed = JSON.parse(cleaned_score)
    parsed.is_a?(Hash) ? parsed : nil
  rescue JSON::ParserError
    nil
  end
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Formats evaluation results as JUnit XML.
|
|
8
|
+
class JUnitFormatter
  # Renders an eval result as a single-testcase JUnit XML document.
  #
  # The verdict comes from the report under result[:response][:report] when
  # that object responds to #verdict, otherwise from the legacy result[:pass]
  # key. On failure the score is read the same way (#context_total on the
  # report, else result[:score]).
  #
  # @param result [Hash] Eval result.
  # @return [String] JUnit XML-formatted string.
  def self.format(result)
    report = result.dig(:response, :report)
    passed = report.respond_to?(:verdict) ? report.verdict : result[:pass]
    eval_name = CGI.escapeHTML(result[:eval_name].to_s)
    return passing_document(eval_name) if passed

    score = report.respond_to?(:context_total) ? report.context_total : result[:score]
    failing_document(eval_name, CGI.escapeHTML(score.to_s))
  end

  # XML for a passing eval; eval_name must already be escaped.
  private_class_method def self.passing_document(eval_name)
    <<~XML
      <?xml version="1.0"?>
      <testsuite name="SkillBench" tests="1" failures="0">
      <testcase name="#{eval_name}" classname="SkillBench"/>
      </testsuite>
    XML
  end

  # XML for a failing eval; both arguments must already be escaped.
  private_class_method def self.failing_document(eval_name, escaped_score)
    <<~XML
      <?xml version="1.0"?>
      <testsuite name="SkillBench" tests="1" failures="1">
      <testcase name="#{eval_name}" classname="SkillBench">
      <failure message="Score: #{escaped_score}">Eval failed</failure>
      </testcase>
      </testsuite>
    XML
  end
end
|
|
41
|
+
end
|
|
42
|
+
end
|