ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'fileutils'
5
+
6
module SkillBench
  class HistoryRecorder
    # Handles read/write of benchmark history JSON files.
    #
    # Reads are best-effort: failures are logged through
    # SkillBench::ErrorLogger and reported as an empty history. Writes go to a
    # sibling temp file that is then renamed over the target path.
    class HistoryFile
      # Loads history from the given path.
      #
      # @param path [String] path to the JSON history file
      # @return [Array<Hash>] parsed history entries
      def self.load(path)
        new.load(path)
      end

      # Writes history data atomically to the given path.
      #
      # @param path [String] target file path
      # @param data [Array<Hash>] history entries to serialize
      # @return [void]
      def self.write(path, data)
        new.write(path, data)
      end

      # Loads history from the given path.
      #
      # Keys of the parsed JSON are symbolized. A missing file, unreadable
      # file, or invalid JSON all produce an empty array; the latter two are
      # logged first.
      #
      # @param path [String] path to the JSON history file
      # @return [Array<Hash>] parsed history entries
      def load(path)
        return [] unless File.exist?(path)

        JSON.parse(File.read(path), symbolize_names: true)
      rescue JSON::ParserError => e
        SkillBench::ErrorLogger.log_error(e, 'corrupted benchmarks.json')
        []
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'HistoryRecorder')
        []
      end

      # Writes history data atomically using a temp file and rename.
      #
      # The payload is serialized before any file is created, and the temp
      # file is removed if writing or renaming fails, so a failed write never
      # leaves a stray "<path>.tmp.<pid>" file next to the history file.
      #
      # @param path [String] target file path
      # @param data [Array<Hash>] history entries to serialize
      # @return [void]
      # @raise [JSON::GeneratorError] when data cannot be serialized
      # @raise [SystemCallError] when the directory or file cannot be written
      def write(path, data)
        payload = JSON.pretty_generate(data)
        FileUtils.mkpath(File.dirname(path))

        temp_path = "#{path}.tmp.#{Process.pid}"
        begin
          File.open(temp_path, File::WRONLY | File::CREAT | File::TRUNC, 0o644) do |file|
            file.flock(File::LOCK_EX)
            file.write(payload)
            # Flush to disk before the rename makes the new content visible.
            file.fsync
          end
          File.rename(temp_path, path)
        ensure
          # No-op after a successful rename; cleans up after a failure.
          FileUtils.rm_f(temp_path)
        end

        logger&.info("History recorded to #{path}")
      end

      private

      # Returns the Rails logger when a top-level Rails constant exposing
      # +logger+ is loaded, otherwise nil.
      #
      # @return [Object, nil]
      def logger
        return nil unless defined?(::Rails) && ::Rails.respond_to?(:logger)

        ::Rails.logger
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+
5
module SkillBench
  class HistoryRecorder
    # Resolves the best writable path for storing benchmark history.
    # Checks env var, cwd, local share, and XDG data home in order.
    class HistoryPathResolver
      # Finds the best writable path for the history file.
      #
      # @return [String, nil] writable path or nil if none found
      def self.resolve
        new.resolve
      end

      # Finds the best writable path for the history file.
      #
      # Candidates are tried in order and the first usable one wins. A warning
      # is printed when every candidate is rejected.
      #
      # @return [String, nil] writable path or nil if none found
      def resolve
        env_path || cwd_path || local_path || xdg_path || begin
          warn('Warning: Could not find writable location for benchmarks.json')
          nil
        end
      end

      private

      # Path taken from SKILL_BENCH_HISTORY_FILE. It is only honoured when it
      # expands to a location under one of the allowed prefixes and its
      # directory can be created and written.
      #
      # @return [String, nil]
      def env_path
        raw = ENV.fetch('SKILL_BENCH_HISTORY_FILE', '').to_s.strip
        return nil if raw.empty?

        expanded = File.expand_path(raw)
        unless contained?(expanded)
          warn "Warning: SKILL_BENCH_HISTORY_FILE '#{raw}' rejected (outside allowed directories or not writable)."
          return nil
        end
        return nil unless prepare_and_writable?(expanded)

        expanded
      end

      # benchmarks.json in the current working directory, when that directory
      # is writable.
      #
      # @return [String, nil]
      def cwd_path
        path = File.join(Dir.pwd, 'benchmarks.json')
        return nil unless File.writable?(File.dirname(path))

        path
      end

      # benchmarks.json under ~/.local/share/skill_bench.
      #
      # @return [String, nil]
      def local_path
        path = File.join(Dir.home, '.local', 'share', 'skill_bench', 'benchmarks.json')
        return nil unless prepare_and_writable?(path)

        path
      end

      # benchmarks.json under the XDG data home.
      #
      # @return [String, nil]
      def xdg_path
        path = File.join(xdg_data_home, 'skill_bench', 'benchmarks.json')
        return nil unless prepare_and_writable?(path)

        path
      end

      # Value of XDG_DATA_HOME, falling back to ~/.local/share when the
      # variable is unset or blank. A blank value must not be joined as-is:
      # it would resolve to "/skill_bench" at the filesystem root.
      #
      # @return [String]
      def xdg_data_home
        configured = ENV['XDG_DATA_HOME']
        return File.join(Dir.home, '.local', 'share') if configured.nil? || configured.strip.empty?

        configured
      end

      # Whether +path+ is one of the allowed prefixes or lives beneath one.
      # The separator is appended to both sides so that "/a/bc" is not treated
      # as being inside "/a/b".
      #
      # @param path [String] expanded absolute path
      # @return [Boolean]
      def contained?(path)
        candidate = path + File::SEPARATOR
        allowed_prefixes.any? do |prefix|
          candidate.start_with?(File.expand_path(prefix) + File::SEPARATOR)
        end
      end

      # Directories that SKILL_BENCH_HISTORY_FILE is allowed to point into.
      #
      # @return [Array<String>]
      def allowed_prefixes
        [Dir.pwd, File.join(Dir.home, '.local', 'share', 'skill_bench')]
      end

      # Creates the parent directory of +path+ if needed and reports whether
      # it is writable. Failures are logged and reported as false.
      #
      # @param path [String] candidate history file path
      # @return [Boolean]
      def prepare_and_writable?(path)
        dir_name = File.dirname(path)
        FileUtils.mkpath(dir_name)
        File.writable?(dir_name)
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'HistoryRecorder')
        false
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
# Time#iso8601 is provided by the stdlib 'time' extension on Rubies where it
# is not built in. Load it explicitly: without it the timestamp call raises
# NoMethodError, which the rescue in .record would turn into a silent `false`.
require 'time'

module SkillBench
  class HistoryRecorder
    # Orchestrates recording evaluation results to the history file.
    # Thin service that delegates path resolution and file I/O to
    # HistoryPathResolver and HistoryFile respectively.
    class PersistenceService
      # Records evaluation results into a historical benchmarks file.
      #
      # Nothing is written when the run was unsuccessful or when no writable
      # history location can be resolved. Any error raised while loading,
      # summarizing or writing is logged and reported as false.
      #
      # @param results [Hash] The results from a Runner.call.
      # @param source_path [String] The resolved source path used for the evaluation.
      # @param model [String] The model name used for the evaluation.
      # @return [Boolean] true if recorded successfully, false otherwise.
      def self.record(results, source_path:, model:)
        return false unless results[:success]

        history_file = HistoryPathResolver.resolve
        return false unless history_file

        history = HistoryFile.load(history_file)
        history << {
          timestamp: Time.now.iso8601,
          source_path: source_path,
          model: model,
          summary: SummaryService.summarize(results[:tasks])
        }

        HistoryFile.write(history_file, history)
        true
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'HistoryRecorder')
        false
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module SkillBench
  class HistoryRecorder
    # Service object for summarizing evaluation results.
    # Handles score normalization and statistical calculations.
    class SummaryService
      # Score summary reported when there is nothing to average.
      EMPTY_SUMMARY = {
        task_count: 0,
        average_baseline: 0.0,
        average_context: 0.0,
        improvement: 0.0
      }.freeze
      private_constant :EMPTY_SUMMARY

      # Summarizes the results of multiple tasks.
      #
      # @param tasks [Array<Hash>, nil] The list of task results; each is read via +:judge_score+.
      # @return [Hash] A summary of scores including averages and improvement,
      #   or an empty Hash when there are no tasks.
      def self.summarize(tasks)
        task_list = Array(tasks)
        return {} if task_list.empty?

        calculate_summary(task_list.map { |task| normalize_score(task[:judge_score]) })
      end

      # Normalizes the raw judge score into a standardized Hash.
      #
      # A Hash is returned with its top-level keys symbolized. A String is
      # parsed as JSON and used only when it decodes to an object. Anything
      # else (nil, invalid JSON, JSON scalars or arrays, other types) yields
      # an empty Hash so that summary arithmetic never sees a non-Hash score.
      #
      # @param raw_score [String, Hash, nil] The raw score from the judge.
      # @return [Hash] The normalized score with :baseline_score and :context_score when present.
      def self.normalize_score(raw_score)
        case raw_score
        when Hash then symbolize_keys(raw_score)
        when String then parse_score(raw_score)
        else {}
        end
      end

      # Calculates statistical summary from a list of normalized scores.
      #
      # Missing scores count as 0. An empty list yields zeros rather than NaN.
      #
      # @param scores [Array<Hash>] List of normalized scores.
      # @return [Hash] Summary statistics rounded to two decimals.
      def self.calculate_summary(scores)
        count = scores.size
        return EMPTY_SUMMARY.dup if count.zero?

        baseline_total = scores.sum(0.0) { |score| (score[:baseline_score] || 0).to_f }
        context_total = scores.sum(0.0) { |score| (score[:context_score] || 0).to_f }

        {
          task_count: count,
          average_baseline: (baseline_total / count).round(2),
          average_context: (context_total / count).round(2),
          improvement: ((context_total - baseline_total) / count).round(2)
        }
      end

      # Parses a JSON score string, returning {} unless it decodes to a Hash.
      def self.parse_score(text)
        parsed = JSON.parse(text, symbolize_names: true)
        parsed.is_a?(Hash) ? parsed : {}
      rescue JSON::ParserError
        {}
      end
      private_class_method :parse_score

      # Returns a copy of +hash+ whose symbolizable top-level keys are Symbols.
      def self.symbolize_keys(hash)
        hash.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
      end
      private_class_method :symbolize_keys
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'history_recorder/persistence_service'
4
+ require_relative 'history_recorder/summary_service'
5
+ require_relative 'history_recorder/history_path_resolver'
6
+ require_relative 'history_recorder/history_file'
7
+
8
# Root namespace shared by the skill-bench components.
module SkillBench
  # Facade for benchmark history. Every method here forwards to one of the
  # specialised collaborators (PersistenceService, HistoryFile,
  # SummaryService, ErrorLogger) and returns whatever that collaborator returns.
  class HistoryRecorder
    # File name used by .load_history when no path is supplied.
    HISTORY_FILE = 'benchmarks.json'

    class << self
      # Appends the given evaluation results to the history.
      #
      # @param results [Hash] results produced by a runner
      # @param source_path [String] source path used for the evaluation
      # @param model [String] model name used for the evaluation
      # @return [Object] the value returned by PersistenceService.record
      def record(results, source_path:, model:)
        PersistenceService.record(results, source_path: source_path, model: model)
      end

      # Reads previously recorded history.
      #
      # @param path [String] history file to read; defaults to HISTORY_FILE
      # @return [Object] the value returned by HistoryFile.load
      def load_history(path = HISTORY_FILE)
        HistoryFile.load(path)
      end

      # Builds a score summary for a list of task results.
      #
      # @param tasks [Array<Hash>] task results
      # @return [Object] the value returned by SummaryService.summarize
      def summarize(tasks)
        SummaryService.summarize(tasks)
      end

      # Reports an exception through the shared error logger, tagged with
      # the 'HistoryRecorder' context.
      #
      # @param exception [Exception] the error to log
      # @return [Object] the value returned by ErrorLogger.log_error
      def log_error(exception)
        SkillBench::ErrorLogger.log_error(exception, 'HistoryRecorder')
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'skill_bench/commands/run'
4
+
5
# Provides interactive CLI mode using gum-like menu system
module SkillBench
  # Interactive CLI module for skill-bench.
  #
  # The selection helpers are placeholders that raise NotImplementedError.
  # Note that NotImplementedError descends from ScriptError, not
  # StandardError, so a plain `rescue => e` in a caller will not catch it.
  module Interactive
    # Run the interactive CLI mode.
    #
    # Shows the main menu and acts on the choice. 'Run Eval' prompts for an
    # eval, a skill and a provider and hands them to Commands::Run; 'Exit'
    # terminates the process with status 0; any other choice does nothing.
    #
    # @return [Hash, nil] Result from Run.run, or nil when nothing was run
    # @raise [NotImplementedError] while the gum-backed selectors are not implemented
    def self.run
      case gum_choose
      when 'Run Eval' then run_eval
      when 'Exit' then exit 0
      end
    end

    # Display main menu using gum
    # @return [String, nil] User's choice or nil
    # @raise [NotImplementedError] Raised when gum integration is not enabled
    def self.gum_choose
      raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
    end

    # Select an eval from available evals
    # @return [String, nil] Eval name or nil
    # @raise [NotImplementedError] Raised when gum integration is not enabled
    def self.select_eval
      raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
    end

    # Select a skill from available skills
    # @return [String, nil] Skill name or nil
    # @raise [NotImplementedError] Raised when gum integration is not enabled
    def self.select_skill
      raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
    end

    # Select a provider from available providers
    # @return [String, nil] Provider name or nil
    # @raise [NotImplementedError] Raised when gum integration is not enabled
    def self.select_provider
      raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
    end

    # Collects the three selections and runs the eval. Stops at the first
    # selection that comes back empty, so the user is not asked further
    # questions after backing out of one.
    #
    # @return [Hash, nil] Result from Run.run, or nil when a selection was skipped
    def self.run_eval
      eval_name = select_eval
      return nil unless eval_name

      skill_name = select_skill
      return nil unless skill_name

      provider_name = select_provider
      return nil unless provider_name

      SkillBench::Commands::Run.run(
        eval_name: eval_name,
        skill_name: skill_name,
        provider_name: provider_name
      )
    end
    private_class_method :run_eval
  end
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'securerandom'
4
+ require_relative '../client'
5
+
6
module SkillBench
  module Judge
    # Responsible for evaluating AI-generated code modifications.
    #
    # Accepts a structured judge prompt, calls the LLM client,
    # and parses the response into a Judge::Response with per-dimension scores.
    class Judge
      # System prompt sent to the LLM judge defining its role and output format.
      SYSTEM_PROMPT = 'You are an objective judge evaluating AI coding models. ' \
                      'Your goal is to score responses based strictly on the provided criteria. ' \
                      'Return only valid JSON.'

      # Evaluates agent output via the LLM judge.
      #
      # @param prompt [String] The structured judge prompt.
      # @param client_params [Hash] Optional parameters to pass to the client.
      # @return [Hash] with :success [Boolean] and :response containing Judge::Response or error.
      def self.call(prompt:, client_params: {})
        new(prompt: prompt, client_params: client_params).call
      end

      # @param prompt [String] The structured judge prompt.
      # @param client_params [Hash] Optional client parameters.
      def initialize(prompt:, client_params:)
        @prompt = prompt
        @client_params = client_params
      end

      # Executes the evaluation process via the LLM client.
      #
      # An unsuccessful client result is returned untouched. A successful
      # result with no usable message content (missing response, missing
      # message, nil or blank content) is reported as an empty response
      # instead of being handed to the parser. Any StandardError raised along
      # the way is logged and converted into a failure result.
      #
      # @return [Hash] Service response with Judge::Response or error.
      def call
        judge_result = Client.call(
          system_prompt: SYSTEM_PROMPT,
          messages: [{ role: 'user', content: prompt }],
          **client_params
        )
        return judge_result unless judge_result[:success]

        content = extract_content(judge_result)
        return empty_response_result if blank?(content)

        Response.call(json: content)
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Judge Evaluation Error')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      attr_reader :prompt, :client_params

      # Pulls the message content out of a client result, accepting either
      # symbol or string keys. Returns nil when the response or its message
      # is not a Hash.
      def extract_content(judge_result)
        response = judge_result[:response]
        return nil unless response.is_a?(Hash)

        message = response[:message] || response['message']
        return nil unless message.is_a?(Hash)

        message[:content] || message['content']
      end

      # True for nil and for strings containing only whitespace. Non-string
      # content is passed through to the parser unchanged.
      def blank?(content)
        return true if content.nil?

        content.is_a?(String) && content.strip.empty?
      end

      def empty_response_result
        { success: false, response: { error: { message: 'Empty response from judge' } } }
      end
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Judge
    # Produces the text prompt handed to the LLM judge.
    #
    # The prompt is a sequence of markdown sections: the task, the scoring
    # criteria, the skill context, the agent output and the response
    # instructions, separated by blank lines.
    class Prompt
      # Convenience entry point: builds an instance and assembles the prompt.
      #
      # @param task [String] The task description from task.md.
      # @param criteria [SkillBench::Criteria] The eval criteria with dimensions.
      # @param skill_context [String, nil] XML-wrapped skill context (nil for baseline runs).
      # @param agent_output [String] Git diff and agent summary.
      # @return [Hash] Service response with prompt or error.
      def self.call(task:, criteria:, skill_context:, agent_output:)
        new(
          task: task,
          criteria: criteria,
          skill_context: skill_context,
          agent_output: agent_output
        ).call
      end

      # @param task [String] The task description.
      # @param criteria [SkillBench::Criteria] The eval criteria.
      # @param skill_context [String, nil] The skill context XML (nil for baseline runs).
      # @param agent_output [String] The agent output.
      def initialize(task:, criteria:, skill_context:, agent_output:)
        @task = task
        @criteria = criteria
        @skill_context = skill_context
        @agent_output = agent_output
      end

      # Validates the inputs and, when they pass, returns the assembled prompt.
      #
      # Inputs are checked in a fixed order (task, criteria, agent output,
      # skill context) and the first problem found is reported. Unexpected
      # errors are logged and returned as a failure carrying the exception
      # message.
      #
      # @return [Hash] Service response with prompt or error.
      def call
        problem = validation_error
        return failure(problem) if problem

        { success: true, response: { prompt: assemble_prompt } }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Judge::Prompt Build Error')
        failure(e.message)
      end

      private

      attr_reader :task, :criteria, :skill_context, :agent_output

      # Returns the message for the first failed validation, or nil when
      # every input is acceptable.
      def validation_error
        return 'Task is required' if task.nil? || task.strip.empty?
        return 'Criteria is required' if criteria.nil?
        return 'Agent output is required' if agent_output.nil? || agent_output.to_s.strip.empty?

        'Skill context is required' unless valid_skill_context?
      end

      # Wraps an error message in the service failure envelope.
      def failure(message)
        { success: false, response: { error: { message: message } } }
      end

      # A nil skill context is accepted; otherwise it must be a non-blank String.
      def valid_skill_context?
        skill_context.nil? || (skill_context.is_a?(String) && !skill_context.strip.empty?)
      end

      # Joins all sections with a blank line between them.
      def assemble_prompt
        [
          "## Task\n\n#{task}",
          criteria_section,
          "## Skill Context\n\n#{skill_context}",
          "## Agent Output\n\n#{agent_output}",
          instructions_section
        ].join("\n\n")
      end

      # Renders the criteria context followed by one bullet per dimension.
      def criteria_section
        heading = ['## Criteria', "\nContext: #{criteria.context}", "\nDimensions:"]
        bullets = criteria.dimensions.map do |dimension|
          "- #{dimension.name}: max_score=#{dimension.max_score}, description=#{dimension.description}"
        end

        (heading + bullets).join("\n")
      end

      # Fixed instructions describing the JSON shape the judge must return.
      def instructions_section
        <<~INSTRUCTIONS
          ## Instructions

          Score each dimension independently. Return JSON with:
          - "dimensions": object mapping each dimension name to { "score": number, "max_score": number, "reasoning": string }
          - "overall_reasoning": string summarizing the evaluation
        INSTRUCTIONS
      end
    end
  end
end