ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../models/skill'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Resolves a skill identifier to a Skill model instance.
#
# Two identifier forms are accepted:
# * a path (any identifier containing '/') that points either at a skill
#   directory or at the SKILL.md file inside it;
# * a bare skill name, matched against the skills discovered under +base_path+.
class SkillResolver
  # Resolves a skill identifier to a Skill instance.
  #
  # @param identifier [String] Skill path or name
  # @param base_path [String] Base directory for skill discovery (default: 'skills/')
  # @return [SkillBench::Models::Skill] The resolved skill
  # @raise [ArgumentError] if skill not found
  def self.call(identifier, base_path = 'skills/')
    new(identifier, base_path).call
  end

  # @param identifier [String] Skill path or name
  # @param base_path [String] Base directory for skill discovery
  def initialize(identifier, base_path = 'skills/')
    @identifier = identifier
    @base_path = base_path
  end

  # Resolves the skill identifier, treating it as a path when it contains a
  # '/' and as a skill name otherwise.
  #
  # @return [SkillBench::Models::Skill] The resolved skill
  # @raise [ArgumentError] if skill not found
  def call
    return resolve_by_path if identifier.include?('/')

    resolve_by_name
  end

  private

  attr_reader :identifier, :base_path

  # Resolves a skill by direct file path.
  #
  # @return [SkillBench::Models::Skill] The resolved skill
  # @raise [ArgumentError] if skill file not found at path or path escapes project boundary
  def resolve_by_path
    normalized_path = skill_directory
    absolute_path = File.expand_path(normalized_path)
    cwd = File.expand_path(Dir.pwd)
    cwd_with_sep = cwd + File::SEPARATOR

    # Comparing against "cwd/" (not just "cwd") keeps sibling directories such
    # as "/project-evil" from passing a prefix check against "/project".
    raise(ArgumentError, "Skill path escapes project boundary: #{identifier}") unless absolute_path == cwd || absolute_path.start_with?(cwd_with_sep)

    skill_md = File.join(normalized_path, 'SKILL.md')

    return Models::Skill.new(name: File.basename(normalized_path), path: normalized_path) if File.exist?(skill_md)

    raise(ArgumentError, "Skill not found: #{identifier}")
  end

  # Directory that should contain SKILL.md for a path identifier.
  #
  # Only a trailing "/SKILL.md" path component is stripped. A plain suffix
  # check would also match names like "skills/fooSKILL.md" and silently
  # resolve the parent directory's skill instead.
  #
  # @return [String] The skill directory path
  def skill_directory
    identifier.end_with?('/SKILL.md') ? File.dirname(identifier) : identifier
  end

  # Resolves a skill by name using recursive discovery.
  #
  # @return [SkillBench::Models::Skill] The resolved skill
  # @raise [ArgumentError] if no skill with matching name found, or if the name is ambiguous
  def resolve_by_name
    skills = Models::Skill.discover(base_path)
    matches = skills.select { |skill| skill.name == identifier }

    if matches.empty?
      raise(ArgumentError, "Skill not found: #{identifier}")
    elsif matches.size > 1
      raise(ArgumentError, "Multiple skills found with name '#{identifier}': #{matches.map(&:path).join(', ')}")
    end

    matches.first
  end
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
class TemplateRegistry
  # Immutable value object bundling the template material for one category:
  # the requirements bullet list, the category-specific criteria hash, the
  # one-line pattern description and the starter code template.
  CategoryData = Data.define(:requirements, :criteria, :pattern, :code_template)

  # Category name => CategoryData. Declared as plain attribute hashes and
  # converted in one pass so every entry has the same shape.
  REGISTRY = {
    crud: {
      requirements: "- Implement Create, Read, Update, Delete operations\n- Use Service Object pattern with `.call`\n- Include input validation",
      criteria: { focus: 'data integrity', required_tests: %w[create read update delete] },
      pattern: 'Service Object implementing Create, Read, Update, Delete operations.',
      code_template: "class {{skill_name}}\n def self.call(params)\n new(params).call\n end\nend"
    },
    api: {
      requirements: "- Implement API client with proper error handling\n- Use Faraday or Net::HTTP\n- Handle authentication and retries",
      criteria: { focus: 'error handling', required_tests: %w[success failure timeout] },
      pattern: 'Layered API client with Auth, Client, Fetcher, Builder, and Entity layers.',
      code_template: "class {{skill_name}}\n def self.call(endpoint, params = {})\n new(endpoint, params).call\n end\nend"
    },
    background_job: {
      requirements: "- Implement as an ActiveJob or Sidekiq worker\n- Include retry logic and error handling\n- Ensure idempotency",
      criteria: { focus: 'reliability', required_tests: %w[perform retry failure] },
      pattern: 'Background job with retry logic, error handling, and idempotency.',
      code_template: "class {{skill_name}} < ApplicationJob\n def perform(*args)\n # job logic\n end\nend"
    },
    controller: {
      requirements: "- Follow RESTful conventions\n- Use strong parameters\n- Include proper error responses",
      criteria: { focus: 'REST compliance', required_tests: %w[index show create update destroy] },
      pattern: 'RESTful controller with strong parameters and proper error responses.',
      code_template: "class {{skill_name}}Controller < ApplicationController\n def index; end\n def show; end\nend"
    },
    model: {
      requirements: "- Define validations and associations\n- Add scopes for common queries\n- Include callback hooks where appropriate",
      criteria: { focus: 'data modeling', required_tests: %w[validations associations scopes] },
      pattern: 'ActiveRecord model with validations, associations, and scopes.',
      code_template: "class {{skill_name}} < ApplicationRecord\n validates :name, presence: true\nend"
    },
    migration: {
      requirements: "- Write reversible migration\n- Include index definitions\n- Handle data migration if needed",
      criteria: { focus: 'reversibility', required_tests: %w[up down] },
      pattern: 'Reversible database migration with indexes and data handling.',
      code_template: "class {{skill_name}} < ActiveRecord::Migration[7.1]\n def change\n # migration logic\n end\nend"
    },
    concern: {
      requirements: "- Extract shared behavior into a module\n- Use ActiveSupport::Concern\n- Keep interface minimal",
      criteria: { focus: 'reusability', required_tests: %w[inclusion behavior] },
      pattern: 'ActiveSupport::Concern extracting shared behavior.',
      code_template: "module {{skill_name}}\n extend ActiveSupport::Concern\nend"
    },
    policy: {
      requirements: "- Implement authorization checks\n- Follow Pundit or similar patterns\n- Cover all CRUD actions",
      criteria: { focus: 'authorization', required_tests: %w[permitted denied] },
      pattern: 'Authorization policy covering all CRUD actions.',
      code_template: "class {{skill_name}}Policy\n def initialize(user, record)\n @user = user\n @record = record\n end\nend"
    },
    form_object: {
      requirements: "- Encapsulate form logic outside the model\n- Include ActiveModel validations\n- Handle nested attributes",
      criteria: { focus: 'validation', required_tests: %w[valid invalid submit] },
      pattern: 'Form object encapsulating validation and persistence logic.',
      code_template: "class {{skill_name}}\n include ActiveModel::Model\n include ActiveModel::Attributes\nend"
    },
    view_component: {
      requirements: "- Create a reusable view component\n- Include preview support\n- Add unit tests for rendering",
      criteria: { focus: 'rendering', required_tests: %w[render slots preview] },
      pattern: 'Reusable view component with previews and unit tests.',
      code_template: "class {{skill_name}} < ViewComponent::Base\n def initialize(title:)\n @title = title\n end\nend"
    }
  }.transform_values { |attributes| CategoryData.new(**attributes) }.freeze
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative 'template_registry/category_data'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Resolves and renders evaluation templates by type and category.
#
# Provides a registry of template strings for generating eval scaffolding
# (task descriptions, scoring criteria, and skill instructions) across
# supported Rails pattern categories. Supports variable interpolation
# using +{{variable_name}}+ syntax; placeholders without a matching
# variable are left in the output untouched.
#
# @example Resolve a task template with variables
#   TemplateRegistry.call(:task_md, :crud, skill_name: "UserCreator")
#
# @example Resolve criteria JSON
#   TemplateRegistry.call(:criteria_json, :api)
class TemplateRegistry
  TEMPLATE_TYPES = %i[task_md criteria_json skill_md].freeze
  CATEGORIES = REGISTRY.keys.freeze

  # @param template_type [Symbol, String] Template type (:task_md, :criteria_json, :skill_md)
  # @param category [Symbol, String] Category (:crud, :api, :background_job, etc.)
  # @param variables [Hash{Symbol, String => String}] Variables for interpolation
  # @return [String] The rendered template content
  # @raise [ArgumentError] if template_type or category is invalid
  def self.call(template_type, category, variables = {})
    new(template_type, category, variables).call
  end

  # @param template_type [Symbol, String] Template type
  # @param category [Symbol, String] Category
  # @param variables [Hash{Symbol, String => String}] Variables for interpolation
  def initialize(template_type, category, variables = {})
    @template_type = template_type.to_sym
    @category = category.to_sym
    @variables = variables
  end

  # Resolves the template and applies variable interpolation.
  #
  # @return [String] The rendered template content
  # @raise [ArgumentError] if template_type or category is invalid
  def call
    validate_template_type!
    validate_category!

    interpolate(build_template)
  end

  private

  attr_reader :template_type, :category, :variables

  # @raise [ArgumentError] if the template type is not one of TEMPLATE_TYPES
  def validate_template_type!
    return if TEMPLATE_TYPES.include?(template_type)

    raise ArgumentError, "Invalid template type: #{template_type}. Valid types: #{TEMPLATE_TYPES.join(', ')}"
  end

  # @raise [ArgumentError] if the category is not one of CATEGORIES
  def validate_category!
    return if CATEGORIES.include?(category)

    raise ArgumentError, "Invalid category: #{category}. Valid categories: #{CATEGORIES.join(', ')}"
  end

  # @return [CategoryData] Template data registered for the current category
  def category_data
    REGISTRY.fetch(category)
  end

  # Builds the raw (not yet interpolated) template for the current type.
  #
  # @return [String] The raw template
  def build_template
    case template_type
    when :task_md then build_task_md
    when :criteria_json then build_criteria_json
    when :skill_md then build_skill_md
    end
  end

  # Replaces every +{{key}}+ placeholder with the matching variable value.
  #
  # The block form of +gsub+ is used on purpose: a replacement *string* is
  # scanned for back-references, so a value containing "\\0", "\\1" or "\\\\"
  # would be rewritten instead of inserted literally.
  #
  # @param template [String] The raw template
  # @return [String] The template with known placeholders substituted
  def interpolate(template)
    variables.reduce(template.dup) do |result, (key, value)|
      result.gsub("{{#{key}}}") { value.to_s }
    end
  end

  # @return [String] Markdown task description for the current category
  def build_task_md
    <<~MARKDOWN
      # Task: Implement {{skill_name}} (#{category})

      ## Objective

      Implement a #{category.to_s.tr('_', ' ')} following Rails best practices and the project's established patterns.

      ## Requirements

      #{category_data.requirements}

      ## Acceptance Criteria

      - All tests pass (`bundle exec rake test`)
      - Code follows project conventions
      - YARD documentation for all public methods
      - No rubocop or reek offenses
    MARKDOWN
  end

  # @return [String] Pretty-printed JSON with the scoring dimensions and the category-specific criteria
  def build_criteria_json
    JSON.pretty_generate(
      category: category.to_s,
      dimensions: [
        { name: 'correctness', weight: 30, pass_threshold: 70 },
        { name: 'adherence', weight: 25, pass_threshold: 60 },
        { name: 'quality', weight: 20, pass_threshold: 60 },
        { name: 'tests', weight: 15, pass_threshold: 80 },
        { name: 'docs', weight: 10, pass_threshold: 50 }
      ],
      minimum_delta: 5,
      category_specific: category_data.criteria
    )
  end

  # @return [String] Markdown skill instructions, including the category's code template
  def build_skill_md
    <<~MARKDOWN
      # Skill: {{skill_name}} (#{category})

      ## Pattern

      #{category_data.pattern}

      ## Hard Rules

      1. Follow TDD — write failing test first, then implement.
      2. Use `.call` class method as entry point (Service Object pattern).
      3. Each class has one responsibility (SRP).
      4. YARD documentation on all public methods.
      5. `rubocop -A` and `reek` must pass.

      ## Template

      ```ruby
      #{category_data.code_template}
      ```
    MARKDOWN
  end
end
|
|
147
|
+
end
|
|
148
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'file_reader'
|
|
4
|
+
require_relative '../execution/context_hydrator'
|
|
5
|
+
require_relative '../agent/react_agent'
|
|
6
|
+
require_relative '../execution/sandbox'
|
|
7
|
+
require_relative '../judge/judge'
|
|
8
|
+
require_relative '../agent/runner'
|
|
9
|
+
require_relative '../execution/source_path_resolver'
|
|
10
|
+
require_relative '../error_logger'
|
|
11
|
+
|
|
12
|
+
module SkillBench
|
|
13
|
+
module Task
|
|
14
|
+
# Runs one evaluation task twice through Agent::Runner (once in :baseline
# mode, once in :context mode) and hands both code diffs to Judge::Judge
# for scoring.
# @deprecated Use {SkillBench::Evaluation::Runner} instead.
class Evaluator
  # Evaluates a single task.
  #
  # @param full_eval_path [Pathname] The path to the evaluation directory.
  # @param base_path [Pathname] The base path for relative file resolution.
  # @param skill_path [String, nil] Optional override for the source directory.
  # @param client_params [Hash] Parameters to pass to the LLM client.
  # @return [Hash] The result of the task evaluation.
  def self.call(full_eval_path:, base_path:, skill_path: nil, client_params: {})
    evaluator = new(
      full_eval_path: full_eval_path,
      base_path: base_path,
      skill_path: skill_path,
      client_params: client_params
    )
    evaluator.call
  end

  # @param full_eval_path [Pathname] The path to the evaluation directory.
  # @param base_path [Pathname] The base path for relative file resolution.
  # @param skill_path [String, nil] Optional override for the source directory.
  # @param client_params [Hash] Parameters to pass to the LLM client.
  def initialize(full_eval_path:, base_path:, skill_path:, client_params:)
    @full_eval_path = full_eval_path
    @base_path = base_path
    @skill_path = skill_path
    @client_params = client_params
  end

  # Executes the task evaluation.
  #
  # Returns early with the collaborator's own failure hash when the task
  # files cannot be read or the judge reports failure. Any StandardError is
  # logged and converted into a failure hash.
  #
  # @return [Hash] The result of the task evaluation.
  def call
    eval_label = @full_eval_path.relative_path_from(@base_path).to_s

    loaded = FileReader.call(@full_eval_path)
    return loaded unless loaded[:success]

    contents = loaded[:response]
    task_content = contents[:task]
    criteria_content = contents[:criteria]

    source_path = Execution::SourcePathResolver.call(eval_folder_path: eval_label, skill_path: @skill_path)
    return failure('No source path inferred') unless source_path

    # Arguments common to both agent runs.
    shared = { full_eval_path: @full_eval_path, task_content: task_content, client_params: @client_params }

    baseline_result, baseline_code_diff = Agent::Runner.call(mode: :baseline, **shared)
    context_result, context_code_diff = Agent::Runner.call(
      mode: :context, **shared, source_path: source_path, base_path: @base_path
    )

    judge_score = Judge::Judge.call(task_content, criteria_content, baseline_code_diff, context_code_diff, @client_params)
    return judge_score unless judge_score[:success]

    {
      path: eval_label,
      baseline: baseline_result,
      baseline_diff: baseline_code_diff,
      with_context: context_result,
      context_diff: context_code_diff,
      judge_score: judge_score
    }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'Task::Evaluator Error')
    failure("Error evaluating task: #{e.message}")
  end

  private

  # Builds the failure hash shape used by this service.
  #
  # @param message [String] Human-readable error description.
  # @return [Hash] Failure response.
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require_relative '../error_logger'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Task
|
|
8
|
+
# Reads task.md and criteria.json files for an evaluation task.
# Returns structured responses following service object contract:
# +{ success: true, response: {...} }+ on success and
# +{ success: false, response: { error: { message: ... } } }+ on failure.
class FileReader
  # Reads the task and criteria files from the given evaluation path.
  #
  # @param full_eval_path [Pathname, String] The path to the evaluation directory.
  # @return [Hash] with :success [Boolean] and :response containing file contents or error.
  def self.call(full_eval_path)
    new(full_eval_path).call
  end

  # @param full_eval_path [Pathname, String] The path to the evaluation directory.
  def initialize(full_eval_path)
    @full_eval_path = full_eval_path
  end

  # Reads task.md and criteria.json files. Stops at the first file that
  # cannot be read and returns that file's failure response.
  #
  # @return [Hash] with :success [Boolean] and :response containing file contents or error.
  def call
    task_content = read_file('task.md')
    return task_content unless task_content[:success]

    criteria_content = read_file('criteria.json')
    return criteria_content unless criteria_content[:success]

    {
      success: true,
      response: {
        task: task_content[:response][:content],
        criteria: criteria_content[:response][:content]
      }
    }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'Task::FileReader Error')
    { success: false, response: { error: { message: "Error reading task files: #{e.message}" } } }
  end

  private

  # Reads a single file from the evaluation path.
  #
  # @param filename [String] The name of the file to read.
  # @return [Hash] with :success [Boolean] and :response containing content or error.
  def read_file(filename)
    # Coerce here (inside the rescued section) so a String path works like a
    # Pathname, and an unusable value still becomes a failure response.
    file_path = Pathname(@full_eval_path).join(filename)
    unless file_path.exist?
      return {
        success: false,
        response: { error: { message: "File not found: #{file_path}" } }
      }
    end

    content = File.read(file_path)
    { success: true, response: { content: content } }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, "Task::FileReader##{filename} Error")
    { success: false, response: { error: { message: "Error reading #{filename}: #{e.message}" } } }
  end
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Tools
|
|
7
|
+
# Turns the raw JSON argument string of a tool call into Ruby data. A
# malformed document is reported as an error message string instead of an
# exception.
class ArgumentParser
  class << self
    # Parses a JSON string of arguments.
    #
    # @param arguments [String] The JSON string to parse.
    # @return [Hash, String] The parsed arguments hash, or an error message string.
    def call(arguments)
      begin
        JSON.parse(arguments)
      rescue JSON::ParserError => parse_error
        invalid_json_message(parse_error)
      end
    end

    private

    # @param parse_error [JSON::ParserError] The parser failure.
    # @return [String] Error text that includes the parser's message.
    def invalid_json_message(parse_error)
      "Error executing tool: Invalid JSON format for arguments. Please correct it. Details: #{parse_error.message}"
    end
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Tools
|
|
7
|
+
# Shared foundation for tools. Its one utility resolves a relative path
# against a working directory and rejects anything that would end up
# outside of it, lexically or through symlinks.
class Base
  class << self
    protected

    # Sanitizes and resolves a relative path against the working directory.
    # Ensures the resulting path stays within the boundaries of the working directory.
    #
    # @param path [String] The relative path to resolve.
    # @param working_dir_path [Pathname, String] The pathname of the working directory.
    # @return [Pathname] The fully expanded and secure path.
    # @raise [ArgumentError] If path is invalid or attempts traversal.
    def secure_path(path, working_dir_path)
      validate_input!(path, working_dir_path)

      root = Pathname(working_dir_path).realpath
      root_str = root.to_s
      # cleanpath collapses "." and ".." lexically, without touching the filesystem.
      candidate = root.join(path).cleanpath

      unless inside_dir?(candidate.to_s, root_str)
        raise ArgumentError, "Path traversal attempt: #{path}"
      end

      verify_symlink_safety!(candidate, root, root_str, path)

      candidate
    end

    private

    # Rejects arguments that can never describe a path inside the sandbox.
    def validate_input!(path, working_dir_path)
      raise ArgumentError, 'Path must be a string' unless path.is_a?(String)
      raise ArgumentError, 'Working directory must be provided' unless working_dir_path
      raise ArgumentError, 'Path cannot be empty' if path.strip.empty?
      raise ArgumentError, 'Absolute paths are not allowed' if path.start_with?('/')
    end

    # True when path_str is dir_str itself or lies below it. The separator is
    # appended so "/work-other" is not accepted as being inside "/work".
    def inside_dir?(path_str, dir_str)
      return true if path_str == dir_str

      path_str.start_with?("#{dir_str}#{File::SEPARATOR}")
    end

    # Walks from the resolved path up towards the working directory and checks
    # each component, so an intermediate symlink cannot lead out of the sandbox.
    def verify_symlink_safety!(full_path, working_dir, working_dir_str, original_path)
      component = full_path
      until component == working_dir || component.to_s.length <= working_dir_str.length
        verify_component_safety!(component, working_dir_str, original_path)
        component = component.dirname
      end
    end

    # Checks a single path component; components that do not exist yet are skipped.
    def verify_component_safety!(component, working_dir_str, original_path)
      symlink = component.symlink?
      present = component.exist?
      return unless present || symlink

      begin
        resolved = component.realpath.to_s
        return if inside_dir?(resolved, working_dir_str)

        raise ArgumentError, "Symlink escapes sandbox: #{original_path}"
      rescue Errno::ENOENT
        # Re-check symlink status to avoid TOCTOU if the file was replaced between initial check and realpath
        raise ArgumentError, "Dangling symlink: #{original_path}" if component.symlink?
      end
    end
  end
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require_relative 'read_file'
|
|
5
|
+
require_relative 'write_file'
|
|
6
|
+
require_relative 'run_command'
|
|
7
|
+
require_relative 'argument_parser'
|
|
8
|
+
|
|
9
|
+
module SkillBench
|
|
10
|
+
module Tools
|
|
11
|
+
# Dispatches tool execution based on the tool name, coordinating parsing and invocation.
class Dispatcher
  # Executes a specified tool with the given arguments within a working directory.
  #
  # When the arguments cannot be parsed, the parser's error message is
  # returned as the result and no tool is run.
  #
  # @param name [String] The name of the tool to execute (e.g., 'read_file').
  # @param arguments [String] A JSON string containing the arguments for the tool.
  # @param working_dir [String] The base directory in which the tool should operate.
  # @param container_id [String, nil] The Docker container ID for isolated execution.
  # @return tool execution result, or the argument parser's error message.
  # @raise [StandardError] when execution fails or the tool name is unknown
  def self.call(name, arguments, working_dir, container_id = nil)
    args = ArgumentParser.call(arguments)
    return args if parse_failure?(args)

    working_dir_path = Pathname.new(working_dir).expand_path

    execute_tool(name, args, working_dir_path, container_id)
  rescue StandardError => e
    log_error(e)
    raise
  end

  class << self
    private

    # ArgumentParser reports malformed JSON by returning an error message
    # String. Without this check the string would be indexed like a hash
    # (String#[] yields nil) and the tool would run with nil arguments.
    # The hash form with success: false is still honoured.
    #
    # @param args [Object] Value returned by ArgumentParser.
    # @return [Boolean] true when args describes a parsing failure.
    def parse_failure?(args)
      args.is_a?(String) || (args.is_a?(Hash) && args[:success] == false)
    end

    # Logs the message and the first five backtrace lines. Uses Rails.logger
    # when available; otherwise warns on stderr unless Minitest is loaded.
    #
    # @param exception [StandardError] The error to report.
    def log_error(exception)
      msg = "#{exception.message}\n#{Array(exception.backtrace).first(5).join("\n")}"
      if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
        Rails.logger.error(msg)
      elsif !defined?(Minitest)
        warn("Dispatcher Error: #{msg}")
      end
    end

    # Routes the parsed arguments to the tool class registered under +name+.
    #
    # @param name [String] Tool name.
    # @param args [Hash] Parsed arguments with string keys.
    # @param working_dir_path [Pathname] Expanded working directory.
    # @param container_id [String, nil] Container used by run_command.
    # @raise [StandardError] for an unknown tool name.
    def execute_tool(name, args, working_dir_path, container_id)
      path = args['path']
      case name
      when 'read_file'
        ReadFile.call(path, working_dir_path)
      when 'write_file'
        WriteFile.call(path, args['content'], working_dir_path)
      when 'run_command'
        RunCommand.call(args['command'], working_dir_path, container_id)
      else
        raise StandardError, "Unknown tool '#{name}'"
      end
    end
  end
end
|
|
60
|
+
end
|
|
61
|
+
end
|