ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+
5
module SkillBench
  module Services
    # Turns the raw argument list given to the EvaluateCommand into an options
    # hash, using Ruby's OptionParser. Parse failures (unknown flag, missing
    # argument) are reported through the result envelope rather than raised.
    # @deprecated Use {SkillBench::Cli::RunCommand} option parsing instead.
    class OptionParserService
      # Convenience entry point: builds a service instance and runs it.
      #
      # @param argv [Array<String>] Raw CLI arguments.
      # @return [Hash] Result envelope with parsed options or error message.
      def self.call(argv)
        new(argv).call
      end

      # @param argv [Array<String>] Raw CLI arguments.
      def initialize(argv)
        @argv = argv
      end

      # Runs the parser over the stored arguments.
      #
      # Recognised switches are removed from the array passed to the
      # constructor (the parser is invoked with `parse!`).
      #
      # @return [Hash] `{ success: true, response: options }` when parsing
      #   succeeds, `{ success: false, response: { error: { message: } } }`
      #   when OptionParser rejects the input.
      def call
        collected = {}
        build_parser(collected).parse!(@argv)
        { success: true, response: collected }
      rescue OptionParser::ParseError => e
        { success: false, response: { error: { message: e.message } } }
      end

      private

      # Assembles the OptionParser that writes recognised values into +sink+.
      #
      # @param sink [Hash] Hash that receives :eval, :skill and :output entries.
      # @return [OptionParser] Configured parser.
      def build_parser(sink)
        cli = OptionParser.new
        cli.banner = 'Usage: skill-bench [options]'

        # Each row: target key, short switch, long switch with argument, help text.
        [
          [:eval, '-e', '--eval FOLDER', 'Path to the eval folder'],
          [:skill, '-s', '--skill FOLDER', 'Optional override for the source skill folder'],
          [:output, '-o', '--output FILE', 'Path to save the JSON report']
        ].each do |key, short, long, description|
          cli.on(short, long, description) { |value| sink[key] = value }
        end

        # Help prints the usage text and aborts parsing via HelpRequested,
        # which is not a ParseError and therefore propagates to the caller.
        cli.on('-h', '--help', 'Prints this help') do
          puts cli
          raise SkillBench::HelpRequested
        end

        cli
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'fileutils'
5
+
6
module SkillBench
  module Services
    # Service object for persisting evaluation results to JSON files.
    # Handles file I/O, JSON serialization, and provides standardized error responses
    # for filesystem operations.
    # @deprecated Use {SkillBench::Cli::RunCommand} output handling instead.
    class OutputPersistenceService
      # Prefix of the error message returned when the report cannot be written.
      WRITE_ERROR = 'Failed to write output file'

      # Persists evaluation results to a pretty-printed JSON file.
      #
      # @param result [Hash] Evaluation result hash containing all evaluation data
      # @param output_path [String, nil] Path to save the JSON report. If nil or empty, no action is taken
      # @return [Hash] Standardized response hash with format:
      #   - { success: true, response: { message: String } } on success
      #   - { success: true, response: {} } when no output path is provided
      #   - { success: false, response: { error: { message: String } } } on failure
      # @example Save to file
      #   result = OutputPersistenceService.call(evaluation_result, output_path: 'output.json')
      #   # => { success: true, response: { message: 'Report saved to output.json' } }
      # @example No output path
      #   result = OutputPersistenceService.call(evaluation_result, output_path: nil)
      #   # => { success: true, response: {} }
      def self.call(result, output_path:)
        new(result, output_path: output_path).call
      end

      # Initializes a new persistence service instance.
      #
      # @param result [Hash] Evaluation result hash containing all evaluation data
      # @param output_path [String, nil] Path to save the JSON report
      def initialize(result, output_path:)
        @result = result
        @output_path = output_path
      end

      # Persists the evaluation result to the specified output path.
      #
      # Filesystem errors (SystemCallError) and JSON serialization errors
      # (JSON::GeneratorError, JSON::NestingError) are rescued here and
      # reported through the failure envelope; they are not raised to the caller.
      #
      # @return [Hash] Standardized response hash with format:
      #   - { success: true, response: { message: String } } on success
      #   - { success: true, response: {} } when no output path is provided
      #   - { success: false, response: { error: { message: String } } } on failure
      def call
        # nil.to_s is "", so this covers both a missing and a blank path.
        return { success: true, response: {} } if @output_path.to_s.empty?

        ensure_directory_exists
        write_json_file

        { success: true, response: { message: "Report saved to #{@output_path}" } }
      rescue SystemCallError, JSON::GeneratorError, JSON::NestingError => e
        { success: false, response: { error: { message: "#{WRITE_ERROR}: #{e.message}" } } }
      end

      private

      # Ensures the parent directory for the output file exists.
      # Creates the directory structure if it doesn't exist.
      #
      # @raise [SystemCallError] when the directory cannot be created
      def ensure_directory_exists
        directory = File.dirname(@output_path)
        FileUtils.mkdir_p(directory) unless File.directory?(directory)
      end

      # Writes the evaluation result as an indented, multi-line JSON file.
      #
      # JSON.pretty_generate is used because JSON.generate has no `pretty`
      # option; passing `pretty: true` to it yields compact single-line output.
      #
      # @raise [SystemCallError] when file write operation fails
      # @raise [JSON::GeneratorError] when the result cannot be serialized
      def write_json_file
        File.write(@output_path, JSON.pretty_generate(@result))
      end
    end
  end
end
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'judge_score_parser_service'
4
+
5
module SkillBench
  module Services
    # Writes a human-readable report of an evaluation result to an output
    # stream: an overall banner, then either the failure message or, for each
    # task, the judge scores followed by the baseline and context diffs.
    # @deprecated Use {SkillBench::Cli::ResultPrinter} instead.
    class ResultPrinterService
      RESULTS_BANNER = "\n=========================================\n " \
                       "RESULTS \n" \
                       "=========================================\n"

      # Builds a printer for +result+ and runs it.
      #
      # @param result [Hash] Evaluation result hash containing success status and task data
      # @param stdout [#puts] Stream that receives the report. Defaults to $stdout
      # @return [Hash] Always { success: true, response: {} }
      # @example Print to a custom stream
      #   ResultPrinterService.call(evaluation_result, stdout: string_io)
      #   # => { success: true, response: {} }
      def self.call(result, stdout: $stdout)
        new(result, stdout: stdout).call
      end

      # @param result [Hash] Evaluation result hash containing success status and task data
      # @param stdout [#puts] Stream that receives the report. Defaults to $stdout
      def initialize(result, stdout: $stdout)
        @result = result
        @stdout = stdout
      end

      # Prints the report. A failed evaluation is reported as a single
      # "Evaluation failed" line; printing itself still counts as a success.
      #
      # @return [Hash] Always { success: true, response: {} }
      def call
        @stdout.puts RESULTS_BANNER

        if @result[:success]
          tasks = @result[:tasks]
          # A result without a :tasks entry prints the banner only.
          tasks.each { |task| print_task(task) } unless tasks.nil?
        else
          reason = @result.dig(:response, :error, :message) || 'Unknown error'
          @stdout.puts "Evaluation failed: #{reason}"
        end

        { success: true, response: {} }
      end

      private

      # Prints one task: its header, then the parsed judge verdict and both
      # diffs, or the raw judge payload when it cannot be parsed.
      #
      # @param task [Hash] Task result with :path, :judge_score, :baseline_diff, :context_diff
      def print_task(task)
        path = task[:path]
        print_section_header('RESULTS', path)

        raw_score = task[:judge_score]
        verdict = SkillBench::Services::JudgeScoreParserService.call(raw_score)

        if verdict[:success]
          print_judge_summary(verdict[:response])
          print_diff_section('BASELINE CHANGES', path, task[:baseline_diff])
          print_diff_section('CONTEXT CHANGES', path, task[:context_diff])
        else
          # Diffs are skipped in this case; only the unparsed payload is shown.
          @stdout.puts 'Could not parse judge JSON response. Raw output:'
          @stdout.puts(raw_score || 'nil')
        end
      end

      # Prints the baseline and context scores followed by the judge's reasoning.
      #
      # @param scores [Hash] Parsed judge data with string keys
      #   'baseline_score', 'context_score' and 'reasoning'
      def print_judge_summary(scores)
        baseline = scores['baseline_score']
        context = scores['context_score']

        @stdout.puts "Baseline Score: #{baseline}/100"
        @stdout.puts "Context Score: #{context}/100"
        @stdout.puts "\nReasoning:"
        @stdout.puts scores['reasoning']
      end

      # Prints a titled section header followed by the diff text.
      #
      # @param title [String] Section title (e.g., 'BASELINE CHANGES')
      # @param path [String] The file path associated with the diff
      # @param diff [String] The diff content to print
      def print_diff_section(title, path, diff)
        print_section_header(title, path)
        @stdout.puts diff
      end

      # Prints the three-line separator block that introduces a section.
      #
      # @param title [String] Section title
      # @param path [String] Path shown after the title
      def print_section_header(title, path)
        @stdout.puts "\n========================================="
        @stdout.puts " #{title}: #{path} "
        @stdout.puts "=========================================\n"
      end
    end
  end
end
@@ -0,0 +1,381 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'pathname'
5
+ require_relative '../models/eval'
6
+ require_relative '../models/skill'
7
+ require_relative '../models/config'
8
+ require_relative '../models/provider'
9
+ require_relative '../clients/all'
10
+ require_relative 'skill_resolver'
11
+ require_relative '../trend_tracker'
12
+ require_relative '../execution/sandbox'
13
+ require_relative '../execution/context_hydrator'
14
+ require_relative '../execution/source_path_resolver'
15
+ require_relative '../agent/react_agent'
16
+
17
module SkillBench
  module Services
    # Orchestrates the execution of an eval with baseline and context runs.
    #
    # The baseline run uses a generic system prompt; the context run uses a
    # prompt built from the skills' SKILL.md files (optionally combined with
    # hydrated source code). Both outputs are then handed to Evaluation::Runner
    # and the outcome is recorded with TrendTracker.
    # rubocop:disable Metrics/ClassLength
    class RunnerService
      # Stand-in provider when no LLM config is available.
      MOCK_PROVIDER = Struct.new(:name, :runtime, :llm, :merged_config)
      private_constant :MOCK_PROVIDER

      # Runs an eval with the given parameters.
      #
      # @param eval_name [String] Name or path of the eval to run
      # @param skill_names [Array<String>] Names of the skills to use
      # @return [Hash] Result from EvaluationRunner
      def self.call(eval_name:, skill_names:)
        new(eval_name: eval_name, skill_names: skill_names).call
      end

      # @param eval_name [String] Name or path of the eval
      # @param skill_names [Array<String>] Names of the skills
      def initialize(eval_name:, skill_names:)
        @eval_name = eval_name
        @skill_names = skill_names
      end

      # Executes the eval: resolves entities, runs baseline and context, evaluates.
      #
      # Every failure past entity resolution is returned as a
      # `{ success: false, response: { error: { message: } } }` hash enriched
      # with :eval_name, :skill_name and :provider_name.
      #
      # @return [Hash] Evaluation result with deltas and verdict.
      # @raise [Errno::ENOENT] when the eval directory does not exist.
      # @raise [ArgumentError] when a skill cannot be resolved.
      def call
        evaluation = resolve_eval
        skills = resolve_skills
        provider = resolve_provider

        config_result = resolve_provider_config(provider)
        return config_error_result(config_result[:error], evaluation, provider) unless config_result[:success]

        config = config_result[:config]
        baseline_prompt = build_baseline_system_prompt

        baseline_output = spawn_agent(evaluation, baseline_prompt, provider, config)
        return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error

        skill_context = load_combined_skill_context(skills)
        return empty_context_error_result(evaluation, provider) if skill_context.strip.empty?

        context_prompt = build_context_system_prompt(evaluation, skills)
        context_output = spawn_agent(evaluation, context_prompt, provider, config)
        return agent_error_result(context_output, 'context', evaluation, provider) if context_output[:status] == :error

        criteria = evaluation.criteria

        judge_params = build_judge_params(provider, config)

        result = Evaluation::Runner.call(
          task: evaluation.task,
          criteria: criteria,
          skill_context: skill_context,
          baseline_output: format_output(baseline_output),
          context_output: format_output(context_output),
          judge_params: judge_params
        )

        return enrich_error_result(result, evaluation, provider) unless result[:success]

        trend_result = record_and_compute_trend(result)
        return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]

        {
          success: true,
          eval_name: eval_name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name,
          response: result[:response].merge(
            trend: trend_result[:trend],
            baseline_iterations: baseline_output[:iterations] || [],
            context_iterations: context_output[:iterations] || []
          )
        }
      end

      private

      attr_reader :eval_name, :skill_names

      # Loads the eval. A name containing "/" is used as a path as-is;
      # anything else is looked up under the "evals/" directory.
      #
      # @return [SkillBench::Models::Eval] The loaded eval.
      def resolve_eval
        eval_path = eval_name.include?('/') ? eval_name : "evals/#{eval_name}"
        SkillBench::Models::Eval.load(eval_path)
      end

      # Resolves every requested skill name through SkillResolver.
      #
      # @return [Array] One resolved skill per entry in skill_names.
      def resolve_skills
        skill_names.map { |name| Services::SkillResolver.call(name) }
      end

      # Reads the provider's merged config, converting an ArgumentError into a
      # failure hash so the caller can report it as a configuration error.
      #
      # @param provider [Object] The resolved provider.
      # @return [Hash] `{ success: true, config: }` or `{ success: false, error: }`.
      def resolve_provider_config(provider)
        { success: true, config: provider.merged_config }
      rescue ArgumentError => e
        { success: false, error: e }
      end

      # Safely calls merged_config, returning nil on any error.
      #
      # @param provider [Object] The provider to query.
      # @return [Hash, nil] The merged config or nil.
      def safe_merged_config(provider)
        provider.merged_config
      rescue StandardError
        nil
      end

      # Builds the provider from the loaded config, falling back to the mock
      # provider (with a warning on stderr) when the config yields none.
      #
      # @return [Object] The configured provider or a MOCK_PROVIDER instance.
      def resolve_provider
        config = SkillBench::Models::Config.load
        provider = config.to_provider
        return provider if provider

        warn 'Config load failed, using mock provider'
        MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
      end

      # Spawns the LLM agent with the given system prompt.
      #
      # For the mock provider no agent is started and a canned success result
      # is returned. Otherwise the agent runs inside an Execution::Sandbox and
      # its final answer is combined with the diff captured from the sandbox.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param system_prompt [String] The system prompt for the agent.
      # @param provider [Object] The resolved provider.
      # @param config [Hash, nil] Provider config.
      # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations.
      def spawn_agent(evaluation, system_prompt, provider, config)
        return { result: 'mock result', status: :success, iterations: [] } if provider.name == 'mock'

        client_params = build_client_params(provider, config)

        # Accept both symbol and string keys; default to 25 iterations.
        max_iterations = config&.[](:max_iterations) || config&.[]('max_iterations') || 25

        Execution::Sandbox.run(evaluation.path) do |sandbox|
          agent_result = Agent::ReactAgent.call(
            system_prompt: system_prompt,
            initial_prompt: evaluation.task,
            working_dir: sandbox.path,
            container_id: sandbox.container_id,
            client_params: client_params,
            max_iterations: max_iterations
          )

          status = agent_result[:success] ? :success : :error
          final_answer = agent_result.dig(:response, :content) || ''
          diff = Execution::Sandbox.capture_diff(sandbox.path)
          iterations = agent_result.dig(:response, :iterations) || []

          # Empty parts are dropped so the output never starts or ends with
          # the blank-line separator.
          output = [final_answer, diff].reject(&:empty?).join("\n\n")

          {
            result: output,
            status: status,
            runtime: provider.runtime,
            usage: {},
            raw_response: agent_result,
            iterations: iterations
          }
        end
      end

      # Builds client parameters for the ReactAgent.
      #
      # Works on a copy of the config, fills in :model from the provider when
      # the config has none, and always sets :provider from the runtime.
      # Any error while building yields an empty hash.
      #
      # @param provider [Object] The resolved provider.
      # @param config [Hash, nil] Provider config.
      # @return [Hash] Client parameters.
      def build_client_params(provider, config)
        config ||= safe_merged_config(provider)
        return {} unless config

        params = config.dup
        params[:model] ||= provider.llm
        params[:provider] = provider.runtime.to_sym
        params
      rescue StandardError
        {}
      end

      # Builds the baseline system prompt (no skill context).
      #
      # @return [String] The baseline system prompt.
      def build_baseline_system_prompt
        <<~PROMPT
          You are an expert Ruby on Rails developer. Your job is to read the task,
          modify the codebase using the tools provided to meet the requirements,
          and then explain what you did.
        PROMPT
      end

      # Builds the context-aware system prompt based on eval metadata.
      #
      # For `skill_bundle_xml` context mode, combines SKILL.md with source code
      # via ContextHydrator. Falls back to SKILL.md-only if source is unavailable,
      # if hydration fails, or if hydration yields no context.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param skills [Array<SkillBench::Models::Skill>] Resolved skills.
      # @return [String] The context system prompt.
      def build_context_system_prompt(evaluation, skills)
        skill_md_content = load_combined_skill_context(skills)
        return skill_md_content unless evaluation.metadata['context_mode'] == 'skill_bundle_xml'

        source_path = resolve_source_path(evaluation)
        return skill_md_content unless source_path

        xml_result = Execution::ContextHydrator.call(source_path: source_path, base_path: Pathname.new(Dir.pwd))
        xml_context = hydrated_context(xml_result)
        return skill_md_content if xml_context.empty?

        <<~PROMPT
          You are an expert Ruby on Rails developer.
          You have access to a skill file and source code wrapped in <agent_context> tags.
          Use the skill instructions and the provided source code to solve the task.

          ## Skill Instructions
          #{skill_md_content}

          ## Source Code
          #{xml_context}
        PROMPT
      end

      # Extracts the hydrated context from a ContextHydrator result without
      # assuming the envelope is fully populated: a failed result, a missing
      # or non-hash :response, or a missing :context all yield an empty string.
      #
      # @param xml_result [Hash, nil] Result envelope returned by ContextHydrator.
      # @return [String] The hydrated context, or '' when none is usable.
      def hydrated_context(xml_result)
        return '' unless xml_result.is_a?(Hash) && xml_result[:success]

        response = xml_result[:response]
        return '' unless response.is_a?(Hash)

        response[:context].to_s
      end

      # Resolves the source path for context hydration.
      #
      # Tries the eval's `source/` subdirectory first, then falls back to
      # SourcePathResolver inference.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @return [String, nil] The resolved source path, or nil if not found.
      def resolve_source_path(evaluation)
        eval_path = evaluation.path
        eval_source = File.join(eval_path, 'source')
        return eval_source if Dir.exist?(eval_source)

        inferred = Execution::SourcePathResolver.call(eval_folder_path: eval_path.to_s)
        inferred if inferred && Dir.exist?(inferred)
      end

      # Returns an error result when skill context is empty.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] Error result with metadata.
      def empty_context_error_result(evaluation, provider)
        {
          success: false,
          response: {
            error: {
              message: 'Skill context is empty. Ensure SKILL.md exists and has content.'
            }
          },
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        }
      end

      # Concatenates the SKILL.md content of all skills, separated by a line
      # of 40 "=" characters. Skills without content are skipped.
      #
      # @param skills [Array, nil] Resolved skills.
      # @return [String] Combined skill context, or '' when there is none.
      def load_combined_skill_context(skills)
        return '' if skills.nil? || skills.empty?

        contexts = skills.map { |skill| load_skill_context(skill) }
        contexts.reject(&:empty?).join("\n\n#{'=' * 40}\n\n")
      end

      # Reads SKILL.md from the skill's directory.
      #
      # @param skill [Object] Skill responding to #path.
      # @return [String] File content, or '' when SKILL.md does not exist.
      def load_skill_context(skill)
        skill_md = File.join(skill.path, 'SKILL.md')
        File.exist?(skill_md) ? File.read(skill_md) : ''
      end

      # Builds the parameters handed to the judge. The mock provider, a
      # missing config, and any error while building all yield an empty hash.
      #
      # @param provider [Object] The resolved provider.
      # @param config [Hash, nil] Provider config.
      # @return [Hash] Hash with :api_key, :model and :provider, or {}.
      def build_judge_params(provider, config)
        return {} if provider.name == 'mock'

        config ||= safe_merged_config(provider)
        return {} unless config

        {
          api_key: config[:api_key],
          model: config[:model] || provider.llm,
          provider: provider.runtime.to_sym
        }
      rescue StandardError
        {}
      end

      # @param agent_result [Hash] Hash returned by spawn_agent.
      # @return [String] The agent's :result as a string ('' when nil).
      def format_output(agent_result)
        agent_result[:result].to_s
      end

      # Builds the error result for a failed agent run.
      #
      # @param result [Hash] Hash returned by spawn_agent.
      # @param phase [String] 'baseline' or 'context'; capitalized in the message.
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] Error result with metadata.
      def agent_error_result(result, phase, evaluation, provider)
        raw = result[:raw_response]
        # The message may sit under :response or directly at the top level.
        error_msg = raw&.dig(:response, :error, :message) || raw&.dig(:error, :message) || 'unknown error'
        {
          success: false,
          response: {
            error: {
              message: "#{phase.capitalize} agent failed: #{error_msg}"
            }
          },
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        }
      end

      # Builds the error result for a provider configuration failure.
      #
      # @param error [Exception] The error raised while reading the config.
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] Error result with metadata.
      def config_error_result(error, evaluation, provider)
        {
          success: false,
          response: {
            error: {
              message: "Configuration error: #{error.message}"
            }
          },
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        }
      end

      # Adds eval, skill and provider names to an existing failure hash.
      #
      # @param result [Hash] Failure hash to enrich.
      # @param evaluation [SkillBench::Models::Eval] The eval being run.
      # @param provider [Object] The resolved provider.
      # @return [Hash] A new hash; +result+ is not modified.
      def enrich_error_result(result, evaluation, provider)
        result.merge(
          eval_name: evaluation.name,
          skill_name: skill_names.join(', '),
          provider_name: provider.name
        )
      end

      # Computes the trend for this result and then records the result.
      #
      # The trend is computed before recording. A failed or malformed record
      # response, and any StandardError raised along the way, are logged via
      # ErrorLogger and returned as a failure hash.
      #
      # @param result [Hash] Successful result from Evaluation::Runner.
      # @return [Hash] `{ success: true, trend: }` or a failure hash.
      def record_and_compute_trend(result)
        tracker = TrendTracker.new
        enriched = result.merge(eval_name: eval_name, skill_names: skill_names)
        trend = tracker.trend_for(enriched)
        record_result = tracker.record(enriched)

        record_success = record_result.is_a?(Hash) && record_result[:success]
        unless record_success
          message = if record_result.is_a?(Hash)
                      record_result.dig(:response, :error, :message) ||
                        record_result.dig(:error, :message) ||
                        'Unknown error'
                    else
                      'Unexpected record response'
                    end
          SkillBench::ErrorLogger.log_error(
            StandardError.new(message),
            "Trend tracking record failed for eval #{eval_name}"
          )
          return {
            success: false,
            response: {
              error: {
                message: "Trend tracking record failed: #{message}",
                record_result: record_result
              }
            }
          }
        end
        { success: true, trend: trend }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
        { success: false, response: { error: { message: e.message } } }
      end
      # rubocop:enable Metrics/ClassLength
    end
  end
end