ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  module Services
    # Turns the raw argument list of the EvaluateCommand into an options hash
    # by delegating to Ruby's OptionParser. Parse failures (unknown flags,
    # missing flag values) are reported through the result envelope instead of
    # being raised.
    # @deprecated Use {SkillBench::Cli::RunCommand} option parsing instead.
    class OptionParserService
      # Switch definitions for every flag that carries a value, keyed by the
      # entry each one fills in the options hash. Iteration order is the order
      # in which the flags are registered (and therefore listed in --help).
      VALUE_SWITCHES = {
        eval: ['-e', '--eval FOLDER', 'Path to the eval folder'],
        skill: ['-s', '--skill FOLDER', 'Optional override for the source skill folder'],
        output: ['-o', '--output FILE', 'Path to save the JSON report']
      }.freeze
      private_constant :VALUE_SWITCHES

      # Convenience entry point: builds an instance and parses immediately.
      #
      # @param argv [Array<String>] Raw CLI arguments.
      # @return [Hash] Result envelope with parsed options or error message.
      def self.call(argv)
        new(argv).call
      end

      # @param argv [Array<String>] Raw CLI arguments.
      def initialize(argv)
        @argv = argv
      end

      # Parses the stored arguments.
      #
      # Parsing is destructive (`parse!`): recognised switches and their values
      # are removed from the array that was handed to the constructor.
      #
      # @return [Hash] `{ success: true, response: options }` when parsing
      #   succeeds, `{ success: false, response: { error: { message: } } }`
      #   when OptionParser rejects the input.
      def call
        parsed = {}
        parser(parsed).parse!(@argv)
        { success: true, response: parsed }
      rescue OptionParser::ParseError => e
        { success: false, response: { error: { message: e.message } } }
      end

      private

      # Builds the OptionParser whose handlers write into +options+.
      #
      # @param options [Hash] Collector that receives the parsed values.
      # @return [OptionParser] Configured parser.
      def parser(options)
        OptionParser.new do |opts|
          opts.banner = 'Usage: skill-bench [options]'

          VALUE_SWITCHES.each do |key, switch|
            opts.on(*switch) { |value| options[key] = value }
          end

          # Help prints the usage text and then aborts parsing by raising.
          opts.on('-h', '--help', 'Prints this help') do
            puts opts
            raise SkillBench::HelpRequested
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  module Services
    # Service object for persisting evaluation results to JSON files.
    # Creates the target directory when needed, serializes the result as
    # pretty-printed JSON and reports filesystem/serialization failures through
    # the standard result envelope instead of raising.
    # @deprecated Use {SkillBench::Cli::RunCommand} output handling instead.
    class OutputPersistenceService
      # Prefix used for every failure message returned by {#call}.
      WRITE_ERROR = 'Failed to write output file'

      # Persists evaluation results to a pretty-printed JSON file.
      #
      # @param result [Hash] Evaluation result hash containing all evaluation data
      # @param output_path [String, nil] Path to save the JSON report. If nil or empty, no action is taken
      # @return [Hash] Standardized response hash with format:
      #   - { success: true, response: { message: String } } on success
      #   - { success: true, response: {} } when no output path is provided
      #   - { success: false, response: { error: { message: String } } } on failure
      # @example Save to file
      #   result = OutputPersistenceService.call(evaluation_result, output_path: 'output.json')
      #   # => { success: true, response: { message: 'Report saved to output.json' } }
      # @example No output path
      #   result = OutputPersistenceService.call(evaluation_result, output_path: nil)
      #   # => { success: true, response: {} }
      def self.call(result, output_path:)
        new(result, output_path: output_path).call
      end

      # Initializes a new persistence service instance.
      #
      # @param result [Hash] Evaluation result hash containing all evaluation data
      # @param output_path [String, nil] Path to save the JSON report
      def initialize(result, output_path:)
        @result = result
        @output_path = output_path
      end

      # Persists the evaluation result to the specified output path.
      #
      # SystemCallError and JSON::GeneratorError are rescued here and converted
      # into a failure envelope; they do not propagate to the caller.
      #
      # @return [Hash] Standardized response hash with format:
      #   - { success: true, response: { message: String } } on success
      #   - { success: true, response: {} } when no output path is provided
      #   - { success: false, response: { error: { message: String } } } on failure
      def call
        return { success: true, response: {} } if @output_path.to_s.empty?

        ensure_directory_exists
        write_json_file

        { success: true, response: { message: "Report saved to #{@output_path}" } }
      rescue SystemCallError, JSON::GeneratorError => e
        { success: false, response: { error: { message: "#{WRITE_ERROR}: #{e.message}" } } }
      end

      private

      # Ensures the parent directory for the output file exists.
      # Creates the directory structure if it doesn't exist.
      #
      # @raise [SystemCallError] when the directory cannot be created
      def ensure_directory_exists
        directory = File.dirname(@output_path)
        FileUtils.mkdir_p(directory) unless File.directory?(directory)
      end

      # Writes the evaluation result as a formatted JSON file.
      #
      # Uses JSON.pretty_generate: JSON.generate has no `pretty` option, so
      # passing `pretty: true` to it produced single-line output.
      # The JSON text is built before the file is opened, so a serialization
      # error never leaves a truncated report behind.
      #
      # @raise [SystemCallError] when file write operation fails
      # @raise [JSON::GeneratorError] when the result cannot be serialized
      def write_json_file
        File.write(@output_path, JSON.pretty_generate(@result))
      end
    end
  end
end
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'judge_score_parser_service'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  module Services
    # Renders an evaluation result as human-readable text on an output stream.
    # Failed evaluations are reduced to a single error line; successful ones
    # are printed task by task with judge scores, reasoning and both diffs.
    # @deprecated Use {SkillBench::Cli::ResultPrinter} instead.
    class ResultPrinterService
      # Header written once at the top of every report.
      RESULTS_BANNER = "\n=========================================\n RESULTS \n=========================================\n"

      # Prints formatted evaluation results to the specified output stream.
      #
      # @param result [Hash] Evaluation result hash containing success status and task data
      # @param stdout [#puts] Output stream for user-visible messages. Defaults to $stdout
      # @return [Hash] Always `{ success: true, response: {} }` once printing is done
      # @example Print successful results
      #   result = ResultPrinterService.call(evaluation_result)
      #   # => { success: true, response: {} }
      # @example Print to custom stream
      #   result = ResultPrinterService.call(evaluation_result, stdout: string_io)
      #   # => { success: true, response: {} }
      def self.call(result, stdout: $stdout)
        new(result, stdout: stdout).call
      end

      # @param result [Hash] Evaluation result hash containing success status and task data
      # @param stdout [#puts] Output stream for user-visible messages. Defaults to $stdout
      def initialize(result, stdout: $stdout)
        @result = result
        @stdout = stdout
      end

      # Writes the report. A failed evaluation yields the banner plus one
      # "Evaluation failed" line; otherwise each entry of `result[:tasks]`
      # (if the key is present) gets its own section.
      #
      # @return [Hash] `{ success: true, response: {} }` in every case
      def call
        @stdout.puts RESULTS_BANNER

        if @result[:success]
          tasks = @result[:tasks]
          tasks.each { |task| print_task(task) } unless tasks.nil?
        else
          reason = @result.dig(:response, :error, :message) || 'Unknown error'
          @stdout.puts "Evaluation failed: #{reason}"
        end

        { success: true, response: {} }
      end

      private

      # Prints the per-task header followed by that task's details.
      #
      # @param task [Hash] Individual task result
      def print_task(task)
        print_banner("RESULTS: #{task[:path]}")
        print_task_result(task)
      end

      # Prints the result for a single task, including scores and diffs.
      # When the judge payload cannot be parsed, the raw payload is echoed and
      # the diffs are skipped.
      #
      # @param task_result [Hash] Individual task result containing judge scores and diffs
      def print_task_result(task_result)
        raw_score = task_result[:judge_score]
        parsed = SkillBench::Services::JudgeScoreParserService.call(raw_score)

        if parsed[:success]
          print_judge_summary(parsed[:response])
          print_task_diffs(task_result[:path], task_result[:baseline_diff], task_result[:context_diff])
        else
          print_parse_error
          @stdout.puts(raw_score || 'nil')
        end
      end

      # Prints an error message when judge score parsing fails.
      def print_parse_error
        @stdout.puts 'Could not parse judge JSON response. Raw output:'
      end

      # Prints the judge score summary including baseline and context scores.
      #
      # @param parsed_judge [Hash] Parsed judge data, read via the string keys
      #   'baseline_score', 'context_score' and 'reasoning'
      def print_judge_summary(parsed_judge)
        baseline = parsed_judge['baseline_score']
        context = parsed_judge['context_score']

        @stdout.puts "Baseline Score: #{baseline}/100"
        @stdout.puts "Context Score: #{context}/100"
        @stdout.puts "\nReasoning:"
        @stdout.puts parsed_judge['reasoning']
      end

      # Prints the baseline and context diffs for a task, in that order.
      #
      # @param path [String] The file path associated with the diff
      # @param baseline_diff [String] The diff content for the baseline
      # @param context_diff [String] The diff content for the context
      def print_task_diffs(path, baseline_diff, context_diff)
        { 'BASELINE CHANGES' => baseline_diff, 'CONTEXT CHANGES' => context_diff }.each do |title, diff|
          print_diff_section(title, path, diff)
        end
      end

      # Prints a formatted diff section with a banner.
      #
      # @param title [String] The title for the diff section (e.g., 'BASELINE CHANGES')
      # @param path [String] The file path associated with the diff
      # @param diff [String] The diff content to print
      def print_diff_section(title, path, diff)
        print_banner("#{title}: #{path}")
        @stdout.puts diff
      end

      # Writes a three-line banner: blank line + rule, padded label, rule.
      #
      # @param label [String] Text shown between the two rules
      def print_banner(label)
        @stdout.puts "\n========================================="
        @stdout.puts " #{label} "
        @stdout.puts "=========================================\n"
      end
    end
  end
end
|
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
require_relative '../models/eval'
|
|
6
|
+
require_relative '../models/skill'
|
|
7
|
+
require_relative '../models/config'
|
|
8
|
+
require_relative '../models/provider'
|
|
9
|
+
require_relative '../clients/all'
|
|
10
|
+
require_relative 'skill_resolver'
|
|
11
|
+
require_relative '../trend_tracker'
|
|
12
|
+
require_relative '../execution/sandbox'
|
|
13
|
+
require_relative '../execution/context_hydrator'
|
|
14
|
+
require_relative '../execution/source_path_resolver'
|
|
15
|
+
require_relative '../agent/react_agent'
|
|
16
|
+
|
|
17
|
+
module SkillBench
  module Services
    # Orchestrates the execution of an eval with baseline and context runs.
    #
    # The same task is given to the agent twice: once with a generic system
    # prompt (baseline) and once with the skill content in the system prompt
    # (context). Both outputs are then handed to Evaluation::Runner, and the
    # outcome is recorded through TrendTracker.
    # rubocop:disable Metrics/ClassLength
    class RunnerService
      # Stand-in provider when no LLM config is available.
      MOCK_PROVIDER = Struct.new(:name, :runtime, :llm, :merged_config)
      private_constant :MOCK_PROVIDER

      # Runs an eval with the given parameters.
      #
      # @param eval_name [String] Name or path of the eval to run
      # @param skill_names [Array<String>] Names of the skills to use
      # @return [Hash] Result from EvaluationRunner
      def self.call(eval_name:, skill_names:)
        new(eval_name: eval_name, skill_names: skill_names).call
      end

      # @param eval_name [String] Name or path of the eval
      # @param skill_names [Array<String>] Names of the skills
      def initialize(eval_name:, skill_names:)
        @eval_name = eval_name
        @skill_names = skill_names
      end

      # Executes the eval: resolves entities, runs baseline and context, evaluates.
      #
      # Every failure after entity resolution is returned as a
      # `{ success: false, ... }` hash carrying eval/skill/provider names.
      #
      # @return [Hash] Evaluation result with deltas and verdict.
      # @raise [Errno::ENOENT] when the eval directory does not exist.
      # @raise [ArgumentError] when a skill cannot be resolved.
      def call
        evaluation = resolve_eval
        skills = resolve_skills
        provider = resolve_provider

        config_result = resolve_provider_config(provider)
        return config_error_result(config_result[:error], evaluation, provider) unless config_result[:success]

        config = config_result[:config]

        baseline_output = spawn_agent(evaluation, build_baseline_system_prompt, provider, config)
        return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error

        # NOTE(review): the empty-context check runs only after the baseline
        # agent has finished; kept in this order so error precedence is unchanged.
        skill_context = load_combined_skill_context(skills)
        return empty_context_error_result(evaluation, provider) if skill_context.strip.empty?

        # Reuse the SKILL.md text loaded above so the agent prompt and the
        # judge see exactly the same content and the files are read once.
        context_prompt = build_context_system_prompt(evaluation, skills, skill_context)
        context_output = spawn_agent(evaluation, context_prompt, provider, config)
        return agent_error_result(context_output, 'context', evaluation, provider) if context_output[:status] == :error

        result = Evaluation::Runner.call(
          task: evaluation.task,
          criteria: evaluation.criteria,
          skill_context: skill_context,
          baseline_output: format_output(baseline_output),
          context_output: format_output(context_output),
          judge_params: build_judge_params(provider, config)
        )
        return enrich_error_result(result, evaluation, provider) unless result[:success]

        trend_result = record_and_compute_trend(result)
        return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]

        {
          success: true,
          eval_name: eval_name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name,
          response: result[:response].merge(
            trend: trend_result[:trend],
            baseline_iterations: baseline_output[:iterations] || [],
            context_iterations: context_output[:iterations] || []
          )
        }
      end

      private

      attr_reader :eval_name, :skill_names

      # Loads the eval model. A name without a slash is looked up under
      # `evals/`; anything containing a slash is used as a path as given.
      #
      # @return [SkillBench::Models::Eval] The loaded eval.
      def resolve_eval
        eval_path = eval_name.include?('/') ? eval_name : "evals/#{eval_name}"
        SkillBench::Models::Eval.load(eval_path)
      end

      # Resolves every requested skill name through SkillResolver.
      #
      # @return [Array] One resolved skill per entry of +skill_names+.
      def resolve_skills
        skill_names.map { |name| Services::SkillResolver.call(name) }
      end

      # Reads the provider's merged config, converting an ArgumentError into
      # a failure envelope.
      #
      # @param provider [Object] The resolved provider.
      # @return [Hash] `{ success: true, config: }` or `{ success: false, error: }`.
      def resolve_provider_config(provider)
        { success: true, config: provider.merged_config }
      rescue ArgumentError => e
        { success: false, error: e }
      end

      # Safely calls merged_config, returning nil on any error.
      #
      # @param provider [Object] The provider to query.
      # @return [Hash, nil] The merged config or nil.
      def safe_merged_config(provider)
        provider.merged_config
      rescue StandardError
        nil
      end

      # Builds the provider from the loaded config, falling back to the mock
      # provider (with a warning on stderr) when the config yields none.
      #
      # @return [Object] The configured provider or a MOCK_PROVIDER instance.
      def resolve_provider
        provider = SkillBench::Models::Config.load.to_provider
        return provider if provider

        warn 'Config load failed, using mock provider'
        MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
      end

      # Reads a config entry that may be stored under a symbol or a string key.
      #
      # @param config [Hash, nil] Provider config.
      # @param key [Symbol, String] Entry name.
      # @return [Object, nil] The symbol-keyed value if truthy, else the string-keyed one.
      def config_value(config, key)
        return nil if config.nil?

        config[key.to_sym] || config[key.to_s]
      end

      # Spawns the LLM agent with the given system prompt.
      #
      # The mock provider short-circuits with a canned result. Otherwise the
      # agent runs inside Execution::Sandbox and its final answer is combined
      # with the diff captured from the sandbox path.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param system_prompt [String] The system prompt for the agent.
      # @param provider [Object] The resolved provider.
      # @param config [Hash, nil] Provider config.
      # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations.
      #   (presumably Sandbox.run returns the block's value; confirm against Sandbox)
      def spawn_agent(evaluation, system_prompt, provider, config)
        return { result: 'mock result', status: :success, iterations: [] } if provider.name == 'mock'

        client_params = build_client_params(provider, config)
        max_iterations = config_value(config, :max_iterations) || 25

        Execution::Sandbox.run(evaluation.path) do |sandbox|
          agent_result = Agent::ReactAgent.call(
            system_prompt: system_prompt,
            initial_prompt: evaluation.task,
            working_dir: sandbox.path,
            container_id: sandbox.container_id,
            client_params: client_params,
            max_iterations: max_iterations
          )

          final_answer = agent_result.dig(:response, :content) || ''
          # to_s keeps the join below safe should capture_diff yield nil.
          diff = Execution::Sandbox.capture_diff(sandbox.path).to_s

          {
            result: [final_answer, diff].reject(&:empty?).join("\n\n"),
            status: agent_result[:success] ? :success : :error,
            runtime: provider.runtime,
            usage: {},
            raw_response: agent_result,
            iterations: agent_result.dig(:response, :iterations) || []
          }
        end
      end

      # Builds client parameters for the ReactAgent.
      #
      # Starts from a copy of the config, fills in `:model` when it is missing
      # (honouring a string-keyed 'model' entry before falling back to the
      # provider's default) and always sets `:provider` from the runtime.
      # Any error while building yields an empty hash.
      #
      # @param provider [Object] The resolved provider.
      # @param config [Hash, nil] Provider config.
      # @return [Hash] Client parameters.
      def build_client_params(provider, config)
        config ||= safe_merged_config(provider)
        return {} unless config

        params = config.dup
        params[:model] ||= config_value(config, :model) || provider.llm
        params[:provider] = provider.runtime.to_sym
        params
      rescue StandardError
        {}
      end

      # Builds the baseline system prompt (no skill context).
      #
      # @return [String] The baseline system prompt.
      def build_baseline_system_prompt
        <<~PROMPT
          You are an expert Ruby on Rails developer. Your job is to read the task,
          modify the codebase using the tools provided to meet the requirements,
          and then explain what you did.
        PROMPT
      end

      # Builds the context-aware system prompt based on eval metadata.
      #
      # For `skill_bundle_xml` context mode, combines SKILL.md with source code
      # via ContextHydrator. Falls back to SKILL.md-only if source is
      # unavailable, the hydrator fails, or it returns no usable context.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param skills [Array<SkillBench::Models::Skill>] Resolved skills.
      # @param skill_md_content [String] Combined SKILL.md text; loaded from
      #   +skills+ when the caller does not supply it.
      # @return [String] The context system prompt.
      def build_context_system_prompt(evaluation, skills, skill_md_content = load_combined_skill_context(skills))
        return skill_md_content unless evaluation.metadata['context_mode'] == 'skill_bundle_xml'

        source_path = resolve_source_path(evaluation)
        return skill_md_content unless source_path

        xml_result = Execution::ContextHydrator.call(source_path: source_path, base_path: Pathname.new(Dir.pwd))
        # dig tolerates a missing :response (e.g. on a failure result) and a
        # nil :context, both of which must fall back rather than raise.
        xml_context = xml_result.dig(:response, :context)
        return skill_md_content unless xml_result[:success] && xml_context && !xml_context.empty?

        <<~PROMPT
          You are an expert Ruby on Rails developer.
          You have access to a skill file and source code wrapped in <agent_context> tags.
          Use the skill instructions and the provided source code to solve the task.

          ## Skill Instructions
          #{skill_md_content}

          ## Source Code
          #{xml_context}
        PROMPT
      end

      # Resolves the source path for context hydration.
      #
      # Tries the eval's `source/` subdirectory first, then falls back to
      # SourcePathResolver inference.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @return [String, nil] The resolved source path, or nil if not found.
      def resolve_source_path(evaluation)
        eval_path = evaluation.path
        eval_source = File.join(eval_path, 'source')
        return eval_source if Dir.exist?(eval_source)

        inferred = Execution::SourcePathResolver.call(eval_folder_path: eval_path.to_s)
        inferred if inferred && Dir.exist?(inferred)
      end

      # Returns an error result when skill context is empty.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] Error result with metadata.
      def empty_context_error_result(evaluation, provider)
        {
          success: false,
          response: {
            error: {
              message: 'Skill context is empty. Ensure SKILL.md exists and has content.'
            }
          },
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        }
      end

      # Concatenates the SKILL.md content of every skill, skipping skills with
      # no content and separating the rest with a 40-character `=` rule.
      #
      # @param skills [Array, nil] Resolved skills.
      # @return [String] Combined content, or '' when there is none.
      def load_combined_skill_context(skills)
        return '' if skills.nil? || skills.empty?

        contexts = skills.map { |skill| load_skill_context(skill) }
        contexts.reject(&:empty?).join("\n\n#{'=' * 40}\n\n")
      end

      # Reads `SKILL.md` from the skill's directory.
      #
      # @param skill [#path] Resolved skill.
      # @return [String] File content, or '' when the file does not exist.
      def load_skill_context(skill)
        skill_md = File.join(skill.path, 'SKILL.md')
        File.exist?(skill_md) ? File.read(skill_md) : ''
      end

      # Builds the parameters handed to the judge. The mock provider gets an
      # empty hash. Config entries are read under symbol or string keys, the
      # same way `max_iterations` is read in {#spawn_agent}.
      #
      # @param provider [Object] The resolved provider.
      # @param config [Hash, nil] Provider config.
      # @return [Hash] `{ api_key:, model:, provider: }`, or {} on any error.
      def build_judge_params(provider, config)
        return {} if provider.name == 'mock'

        config ||= safe_merged_config(provider)
        return {} unless config

        {
          api_key: config_value(config, :api_key),
          model: config_value(config, :model) || provider.llm,
          provider: provider.runtime.to_sym
        }
      rescue StandardError
        {}
      end

      # @param agent_result [Hash] Value returned by {#spawn_agent}.
      # @return [String] The agent's textual output.
      def format_output(agent_result)
        agent_result[:result].to_s
      end

      # Builds the failure result for an agent run that ended in error.
      #
      # @param result [Hash] Value returned by {#spawn_agent}.
      # @param phase [String] 'baseline' or 'context'; capitalized in the message.
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] Error result with metadata.
      def agent_error_result(result, phase, evaluation, provider)
        raw = result[:raw_response]
        error_msg = raw&.dig(:response, :error, :message) || raw&.dig(:error, :message) || 'unknown error'
        {
          success: false,
          response: {
            error: {
              message: "#{phase.capitalize} agent failed: #{error_msg}"
            }
          },
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        }
      end

      # Builds the failure result for a provider configuration error.
      #
      # @param error [Exception] The error raised while reading the config.
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] Error result with metadata.
      def config_error_result(error, evaluation, provider)
        {
          success: false,
          response: {
            error: {
              message: "Configuration error: #{error.message}"
            }
          },
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        }
      end

      # Adds eval/skill/provider names to an existing failure result.
      #
      # @param result [Hash] Failure envelope from a collaborator.
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] The result merged with the metadata keys.
      def enrich_error_result(result, evaluation, provider)
        result.merge(
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        )
      end

      # Computes the trend for this result and then records the result.
      #
      # The trend is computed before recording. A record call that does not
      # return `{ success: true }`, or any StandardError raised along the way,
      # is logged through ErrorLogger and returned as a failure envelope.
      #
      # @param result [Hash] Successful result from Evaluation::Runner.
      # @return [Hash] `{ success: true, trend: }` or a failure envelope.
      def record_and_compute_trend(result)
        tracker = TrendTracker.new
        enriched = result.merge(eval_name: eval_name, skill_names: skill_names)
        trend = tracker.trend_for(enriched)
        record_result = tracker.record(enriched)

        record_success = record_result.is_a?(Hash) && record_result[:success]
        unless record_success
          message = if record_result.is_a?(Hash)
                      record_result.dig(:response, :error, :message) ||
                        record_result.dig(:error, :message) ||
                        'Unknown error'
                    else
                      'Unexpected record response'
                    end
          SkillBench::ErrorLogger.log_error(
            StandardError.new(message),
            "Trend tracking record failed for eval #{eval_name}"
          )
          return {
            success: false,
            response: {
              error: {
                message: "Trend tracking record failed: #{message}",
                record_result: record_result
              }
            }
          }
        end
        { success: true, trend: trend }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
        { success: false, response: { error: { message: e.message } } }
      end
      # rubocop:enable Metrics/ClassLength
    end
  end
end
|