ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  class HistoryRecorder
    # Reads and writes the benchmark history JSON file.
    #
    # Writes go to a sibling temp file that is renamed over the target, so the
    # target path is only ever replaced by a fully written file.
    class HistoryFile
      # Loads history from the given path.
      #
      # @param path [String] path to the JSON history file
      # @return [Array<Hash>] parsed history entries
      def self.load(path)
        new.load(path)
      end

      # Writes history data to the given path via a temp file and rename.
      #
      # @param path [String] target file path
      # @param data [Array<Hash>] history entries to serialize
      # @return [void]
      def self.write(path, data)
        new.write(path, data)
      end

      # Loads history from the given path.
      #
      # Returns an empty list when the file is missing, is not valid JSON, or
      # does not hold a top-level JSON array; the latter two cases are logged.
      #
      # @param path [String] path to the JSON history file
      # @return [Array<Hash>] parsed history entries (symbolized keys)
      def load(path)
        return [] unless File.exist?(path)

        parsed = JSON.parse(File.read(path), symbolize_names: true)
        # The history is appended to with <<, so any other top-level JSON value
        # is treated like a corrupted file instead of being handed back.
        raise TypeError, "expected a JSON array in #{path}, got #{parsed.class}" unless parsed.is_a?(Array)

        parsed
      rescue JSON::ParserError => e
        SkillBench::ErrorLogger.log_error(e, 'corrupted benchmarks.json')
        []
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'HistoryRecorder')
        []
      end

      # Writes history data using a temp file and rename.
      #
      # The temp file is removed again if serialization, writing or the rename
      # fails, so failed writes do not leave "*.tmp.<pid>" files behind.
      #
      # @param path [String] target file path
      # @param data [Array<Hash>] history entries to serialize
      # @return [void]
      # @raise [StandardError] re-raises any I/O or JSON generation error
      def write(path, data)
        FileUtils.mkpath(File.dirname(path))

        temp_path = "#{path}.tmp.#{Process.pid}"
        File.open(temp_path, File::WRONLY | File::CREAT | File::TRUNC, 0o644) do |file|
          # Exclusive lock is held on the temp file while it is being written.
          file.flock(File::LOCK_EX)
          file.write(JSON.pretty_generate(data))
          file.fsync
        end
        File.rename(temp_path, path)
        renamed = true
        logger&.info("History recorded to #{path}")
      ensure
        # Only clean up when the temp file was created but never renamed.
        FileUtils.rm_f(temp_path) if temp_path && !renamed
      end

      private

      # @return [Object, nil] ::Rails.logger, or nil when Rails is not loaded
      def logger
        ::Rails.logger
      rescue NameError
        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  class HistoryRecorder
    # Resolves the best writable path for storing benchmark history.
    # Checks env var, cwd, local share, and XDG data home in order.
    class HistoryPathResolver
      # Finds the best writable path for the history file.
      #
      # @return [String, nil] writable path or nil if none found
      def self.resolve
        new.resolve
      end

      # Finds the best writable path for the history file.
      #
      # @return [String, nil] writable path or nil if none found
      def resolve
        env_path || cwd_path || local_path || xdg_path || begin
          warn('Warning: Could not find writable location for benchmarks.json')
          nil
        end
      end

      private

      # Path taken from SKILL_BENCH_HISTORY_FILE, if set, contained in an
      # allowed directory, and writable. A warning is printed for either kind
      # of rejection so the message matches what actually happened.
      #
      # @return [String, nil]
      def env_path
        raw = ENV.fetch('SKILL_BENCH_HISTORY_FILE', '').to_s.strip
        return nil if raw.empty?

        expanded = File.expand_path(raw)
        return expanded if contained?(expanded) && prepare_and_writable?(expanded)

        warn "Warning: SKILL_BENCH_HISTORY_FILE '#{raw}' rejected (outside allowed directories or not writable)."
        nil
      end

      # @return [String, nil] benchmarks.json in the current directory, when writable
      def cwd_path
        path = File.join(Dir.pwd, 'benchmarks.json')
        return nil unless File.writable?(File.dirname(path))

        path
      end

      # @return [String, nil] path under ~/.local/share/skill_bench, when writable
      def local_path
        path = File.join(default_data_home, 'skill_bench', 'benchmarks.json')
        return nil unless prepare_and_writable?(path)

        path
      end

      # @return [String, nil] path under the XDG data home, when writable
      def xdg_path
        path = File.join(xdg_data_home, 'skill_bench', 'benchmarks.json')
        return nil unless prepare_and_writable?(path)

        path
      end

      # XDG data home directory. An unset, empty, or relative XDG_DATA_HOME is
      # ignored in favour of ~/.local/share; joining an empty value would
      # otherwise produce a path directly under the filesystem root.
      #
      # @return [String]
      def xdg_data_home
        configured = ENV.fetch('XDG_DATA_HOME', '').to_s
        return configured if !configured.empty? && File.absolute_path?(configured)

        default_data_home
      end

      # @return [String] ~/.local/share for the current user
      def default_data_home
        File.join(Dir.home, '.local', 'share')
      end

      # Whether +path+ equals, or lies below, one of the allowed prefixes.
      #
      # @param path [String] expanded candidate path
      # @return [Boolean]
      def contained?(path)
        allowed_prefixes.any? do |prefix|
          root = File.expand_path(prefix)
          next true if path == root

          # A root prefix ("/") already ends with the separator; appending a
          # second one would make every comparison fail.
          root += File::SEPARATOR unless root.end_with?(File::SEPARATOR)
          path.start_with?(root)
        end
      end

      # @return [Array<String>] directories SKILL_BENCH_HISTORY_FILE may point into
      def allowed_prefixes
        [Dir.pwd, File.join(Dir.home, '.local', 'share', 'skill_bench')]
      end

      # Creates the parent directory of +path+ and reports whether it is writable.
      # Any error is logged and treated as "not writable".
      #
      # @param path [String]
      # @return [Boolean]
      def prepare_and_writable?(path)
        dir_name = File.dirname(path)
        FileUtils.mkpath(dir_name)
        File.writable?(dir_name)
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'HistoryRecorder')
        false
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Time#iso8601 is provided by the 'time' stdlib on Rubies where it is not core.
# Without it the call below raises NoMethodError, which the rescue in .record
# would turn into a silent `false`.
require 'time'

module SkillBench
  class HistoryRecorder
    # Orchestrates recording evaluation results to the history file.
    # Thin service that delegates path resolution and file I/O to
    # HistoryPathResolver and HistoryFile respectively.
    class PersistenceService
      # Appends one entry describing +results+ to the history file.
      #
      # Nothing is written when the run was not successful or when no history
      # path can be resolved. Any error raised along the way is logged and
      # reported as +false+.
      #
      # @param results [Hash] The results from a Runner.call; reads :success and :tasks.
      # @param source_path [String] The resolved source path used for the evaluation.
      # @param model [String] The model name used for the evaluation.
      # @return [Boolean] true if recorded successfully, false otherwise.
      def self.record(results, source_path:, model:)
        return false unless results[:success]

        history_file = HistoryPathResolver.resolve
        return false unless history_file

        history = HistoryFile.load(history_file)
        history << {
          timestamp: Time.now.iso8601,
          source_path: source_path,
          model: model,
          summary: SummaryService.summarize(results[:tasks])
        }
        HistoryFile.write(history_file, history)
        true
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'HistoryRecorder')
        false
      end
    end
  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  class HistoryRecorder
    # Service object for summarizing evaluation results.
    # Handles score normalization and statistical calculations.
    class SummaryService
      # Summarizes the results of multiple tasks.
      #
      # @param tasks [Array<Hash>, nil] The list of task results; each is read via [:judge_score].
      # @return [Hash] A summary of scores including averages and improvement,
      #   or {} when there are no tasks.
      def self.summarize(tasks)
        return {} if Array(tasks).empty?

        scores = tasks.map { |task| normalize_score(task[:judge_score]) }
        calculate_summary(scores)
      end

      # Normalizes the raw judge score into a Hash with symbol keys.
      #
      # Strings are parsed as JSON. Anything that does not end up as a Hash
      # (nil, invalid JSON, or valid JSON such as "null", "5" or "[1]") yields
      # {} so that the summary math never indexes into a non-Hash value.
      #
      # @param raw_score [String, Hash, nil] The raw score from the judge.
      # @return [Hash] The normalized score, expected to carry :baseline_score and :context_score.
      def self.normalize_score(raw_score)
        parsed = raw_score.is_a?(String) ? JSON.parse(raw_score, symbolize_names: true) : raw_score
        return {} unless parsed.is_a?(Hash)

        # Accept string-keyed hashes too; the summary reads symbol keys only.
        parsed.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
      rescue JSON::ParserError
        {}
      end

      # Calculates statistical summary from a list of normalized scores.
      # Missing scores count as 0.
      #
      # @param scores [Array<Hash>] List of normalized scores.
      # @return [Hash] :task_count, :average_baseline, :average_context and
      #   :improvement (all rounded to 2 decimals), or {} for an empty list.
      def self.calculate_summary(scores)
        count = scores.size
        # Avoids 0.0 / 0 (NaN), which cannot be serialized to JSON.
        return {} if count.zero?

        baseline_total = scores.sum(0.0) { |score| (score[:baseline_score] || 0).to_f }
        context_total = scores.sum(0.0) { |score| (score[:context_score] || 0).to_f }

        {
          task_count: count,
          average_baseline: (baseline_total / count).round(2),
          average_context: (context_total / count).round(2),
          improvement: ((context_total - baseline_total) / count).round(2)
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'history_recorder/persistence_service'
|
|
4
|
+
require_relative 'history_recorder/summary_service'
|
|
5
|
+
require_relative 'history_recorder/history_path_resolver'
|
|
6
|
+
require_relative 'history_recorder/history_file'
|
|
7
|
+
|
|
8
|
+
# Top-level namespace for SkillBench.
module SkillBench
  # Facade for benchmark history: every class method forwards to one of the
  # services nested under this class (or to ErrorLogger).
  class HistoryRecorder
    # File name used by .load_history when no path is given.
    HISTORY_FILE = 'benchmarks.json'

    class << self
      # Records evaluation results; forwards to PersistenceService.record.
      #
      # @param results [Hash] evaluation results
      # @param source_path [String] source path used for the evaluation
      # @param model [String] model name used for the evaluation
      # @return [Object] whatever PersistenceService.record returns
      def record(results, source_path:, model:)
        PersistenceService.record(results, source_path: source_path, model: model)
      end

      # Loads stored history; forwards to HistoryFile.load.
      #
      # @param path [String] history file path, HISTORY_FILE by default
      # @return [Object] whatever HistoryFile.load returns
      def load_history(path = HISTORY_FILE)
        HistoryFile.load(path)
      end

      # Summarizes task results; forwards to SummaryService.summarize.
      #
      # @param tasks [Array<Hash>] task results
      # @return [Object] whatever SummaryService.summarize returns
      def summarize(tasks)
        SummaryService.summarize(tasks)
      end

      # Logs an exception under the 'HistoryRecorder' context via ErrorLogger.
      #
      # @param exception [Exception] the error to log
      # @return [Object] whatever ErrorLogger.log_error returns
      def log_error(exception)
        SkillBench::ErrorLogger.log_error(exception, 'HistoryRecorder')
      end
    end
  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'skill_bench/commands/run'
|
|
4
|
+
|
|
5
|
+
# Interactive, menu-driven CLI mode for skill-bench.
module SkillBench
  # Entry point and selection prompts for the interactive mode. The prompt
  # methods are placeholders that raise until gum integration is enabled.
  module Interactive
    class << self
      # Shows the main menu and acts on the selection.
      #
      # 'Exit' terminates the process with status 0; 'Run Eval' collects an
      # eval, a skill and a provider and runs them; anything else yields nil.
      #
      # @return [Hash, nil] Result from Commands::Run.run, or nil when nothing was run
      def run
        case gum_choose
        when 'Exit' then exit 0
        when 'Run Eval' then run_selected_eval
        end
      end

      # Main menu prompt.
      #
      # @return [String, nil] User's choice or nil
      # @raise [NotImplementedError] Raised when gum integration is not enabled
      def gum_choose
        raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
      end

      # Eval selection prompt.
      #
      # @return [String, nil] Eval name or nil
      # @raise [NotImplementedError] Raised when gum integration is not enabled
      def select_eval
        raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
      end

      # Skill selection prompt.
      #
      # @return [String, nil] Skill name or nil
      # @raise [NotImplementedError] Raised when gum integration is not enabled
      def select_skill
        raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
      end

      # Provider selection prompt.
      #
      # @return [String, nil] Provider name or nil
      # @raise [NotImplementedError] Raised when gum integration is not enabled
      def select_provider
        raise NotImplementedError, 'Interactive selection not implemented; enable gum integration'
      end

      private

      # Asks for all three selections, then runs the eval only if none is missing.
      #
      # @return [Hash, nil] Result from Commands::Run.run, or nil on a missing selection
      def run_selected_eval
        chosen_eval = select_eval
        chosen_skill = select_skill
        chosen_provider = select_provider
        return nil unless chosen_eval && chosen_skill && chosen_provider

        SkillBench::Commands::Run.run(
          eval_name: chosen_eval,
          skill_name: chosen_skill,
          provider_name: chosen_provider
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'securerandom'
|
|
4
|
+
require_relative '../client'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  module Judge
    # Responsible for evaluating AI-generated code modifications.
    #
    # Accepts a structured judge prompt, calls the LLM client,
    # and hands the returned content to Judge::Response for parsing.
    class Judge
      # System prompt sent to the LLM judge defining its role and output format.
      SYSTEM_PROMPT = 'You are an objective judge evaluating AI coding models. ' \
                      'Your goal is to score responses based strictly on the provided criteria. ' \
                      'Return only valid JSON.'

      # Evaluates agent output via the LLM judge.
      #
      # @param prompt [String] The structured judge prompt.
      # @param client_params [Hash] Optional parameters to pass to the client.
      # @return [Hash] with :success [Boolean] and :response containing Judge::Response or error.
      def self.call(prompt:, client_params: {})
        new(prompt: prompt, client_params: client_params).call
      end

      # @param prompt [String] The structured judge prompt.
      # @param client_params [Hash] Optional client parameters.
      def initialize(prompt:, client_params:)
        @prompt = prompt
        @client_params = client_params
      end

      # Executes the evaluation process via the LLM client.
      #
      # A failed client result is returned unchanged. A missing or blank
      # message content yields the "Empty response from judge" error instead
      # of being passed on to the response parser. Any raised error is logged
      # and converted into a failure result.
      #
      # @return [Hash] Service response with Judge::Response or error.
      def call
        judge_result = Client.call(
          system_prompt: SYSTEM_PROMPT,
          messages: [{ role: 'user', content: prompt }],
          **client_params
        )

        return judge_result unless judge_result[:success]

        content = extract_content(judge_result)
        return empty_response_result unless content

        Response.call(json: content)
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Judge Evaluation Error')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      attr_reader :prompt, :client_params

      # Pulls the message content out of a successful client result, accepting
      # both symbol and string keys.
      #
      # @param judge_result [Hash] successful client result
      # @return [Object, nil] the content, or nil when the response, message or
      #   content is missing or the content is blank
      def extract_content(judge_result)
        response = judge_result[:response]
        return nil if response.nil?

        message = response[:message] || response['message']
        return nil unless message.is_a?(Hash)

        content = message[:content] || message['content']
        # "" and whitespace-only content carry nothing to parse.
        content.to_s.strip.empty? ? nil : content
      end

      # @return [Hash] failure result used when the judge returned no content
      def empty_response_result
        { success: false, response: { error: { message: 'Empty response from judge' } } }
      end
    end
  end
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  module Judge
    # Builds the prompt handed to the LLM judge.
    #
    # The prompt is made of five markdown sections, in this order: task,
    # criteria, skill context, agent output, and scoring instructions.
    class Prompt
      # Builds the judge prompt.
      #
      # @param task [String] The task description.
      # @param criteria [SkillBench::Criteria] The eval criteria; read via #context and #dimensions.
      # @param skill_context [String, nil] Skill context text (nil is accepted).
      # @param agent_output [String] The agent output.
      # @return [Hash] Service response with prompt or error.
      def self.call(task:, criteria:, skill_context:, agent_output:)
        new(task: task, criteria: criteria, skill_context: skill_context, agent_output: agent_output).call
      end

      # @param task [String] The task description.
      # @param criteria [SkillBench::Criteria] The eval criteria.
      # @param skill_context [String, nil] The skill context (nil is accepted).
      # @param agent_output [String] The agent output.
      def initialize(task:, criteria:, skill_context:, agent_output:)
        @task = task
        @criteria = criteria
        @skill_context = skill_context
        @agent_output = agent_output
      end

      # Validates the inputs and assembles the judge prompt.
      #
      # Errors raised while validating or assembling are logged and returned
      # as a failure result.
      #
      # @return [Hash] Service response with prompt or error.
      def call
        invalid = validation_failure
        return invalid if invalid

        { success: true, response: { prompt: assemble_prompt } }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Judge::Prompt Build Error')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      attr_reader :task, :criteria, :skill_context, :agent_output

      # Checks the inputs in a fixed order and reports the first problem.
      #
      # @return [Hash, nil] failure result, or nil when every input is acceptable
      def validation_failure
        return failure('Task is required') if task.nil? || task.strip.empty?
        return failure('Criteria is required') if criteria.nil?
        return failure('Agent output is required') if agent_output.nil? || agent_output.to_s.strip.empty?
        return failure('Skill context is required') unless valid_skill_context?

        nil
      end

      # @param message [String] error message
      # @return [Hash] failure result wrapping the message
      def failure(message)
        { success: false, response: { error: { message: message } } }
      end

      # nil is acceptable; otherwise the context must be a non-blank String.
      #
      # @return [Boolean]
      def valid_skill_context?
        skill_context.nil? || (skill_context.is_a?(String) && !skill_context.strip.empty?)
      end

      # @return [String] all sections separated by blank lines
      def assemble_prompt
        [
          section('Task', task),
          criteria_section,
          section('Skill Context', skill_context),
          section('Agent Output', agent_output),
          instructions_section
        ].join("\n\n")
      end

      # @param title [String] section heading text
      # @param body [Object] section content, interpolated as-is
      # @return [String] a "## title" heading followed by the body
      def section(title, body)
        "## #{title}\n\n#{body}"
      end

      # @return [String] criteria context plus one bullet per dimension
      def criteria_section
        heading = ['## Criteria', "\nContext: #{criteria.context}", "\nDimensions:"]
        bullets = criteria.dimensions.map do |dim|
          "- #{dim.name}: max_score=#{dim.max_score}, description=#{dim.description}"
        end

        (heading + bullets).join("\n")
      end

      # @return [String] instructions describing the JSON the judge must return
      def instructions_section
        <<~INSTRUCTIONS
          ## Instructions

          Score each dimension independently. Return JSON with:
          - "dimensions": object mapping each dimension name to { "score": number, "max_score": number, "reasoning": string }
          - "overall_reasoning": string summarizing the evaluation
        INSTRUCTIONS
      end
    end
  end
end
|