ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../models/skill'
4
+
5
module SkillBench
  module Services
    # Resolves a skill identifier to a Skill model instance.
    # Supports both direct paths (containing '/') and skill names (searched recursively).
    class SkillResolver
      # File name that marks a directory as a skill.
      SKILL_FILENAME = 'SKILL.md'.freeze

      # Resolves a skill identifier to a Skill instance.
      #
      # @param identifier [String] Skill path or name
      # @param base_path [String] Base directory for skill discovery (default: 'skills/')
      # @return [SkillBench::Models::Skill] The resolved skill
      # @raise [ArgumentError] if skill not found
      def self.call(identifier, base_path = 'skills/')
        new(identifier, base_path).call
      end

      # @param identifier [String] Skill path or name
      # @param base_path [String] Base directory for skill discovery
      def initialize(identifier, base_path = 'skills/')
        @identifier = identifier
        @base_path = base_path
      end

      # Resolves the skill identifier.
      #
      # Identifiers containing a '/' are treated as paths; anything else is
      # treated as a skill name and looked up through discovery.
      #
      # @return [SkillBench::Models::Skill] The resolved skill
      # @raise [ArgumentError] if skill not found
      def call
        return resolve_by_path if identifier.include?('/')

        resolve_by_name
      end

      private

      attr_reader :identifier, :base_path

      # Resolves a skill by direct file path.
      #
      # The identifier may point either at the skill directory or at the
      # SKILL.md file inside it.
      #
      # @return [SkillBench::Models::Skill] The resolved skill
      # @raise [ArgumentError] if skill file not found at path or path escapes project boundary
      def resolve_by_path
        normalized_path = skill_directory
        absolute_path = File.expand_path(normalized_path)
        ensure_within_project!(absolute_path)

        skill_md = File.join(normalized_path, SKILL_FILENAME)
        raise(ArgumentError, "Skill not found: #{identifier}") unless File.exist?(skill_md)

        # Name the skill after the expanded directory so identifiers such as
        # './SKILL.md' or 'skills/foo/..' do not yield '.' or '..' as the name.
        Models::Skill.new(name: File.basename(absolute_path), path: normalized_path)
      end

      # Directory the identifier refers to.
      #
      # Only an exact SKILL.md file name is stripped; a suffix match would
      # also strip unrelated files such as 'MY_SKILL.md' and resolve them to
      # their parent directory.
      #
      # @return [String]
      def skill_directory
        File.basename(identifier) == SKILL_FILENAME ? File.dirname(identifier) : identifier
      end

      # Rejects paths that expand to a location outside the current working directory.
      #
      # @param absolute_path [String] Expanded candidate path
      # @return [void]
      # @raise [ArgumentError] if the path is outside the project
      def ensure_within_project!(absolute_path)
        cwd = File.expand_path(Dir.pwd)
        # The trailing separator prevents '/proj-evil' from matching '/proj'.
        return if absolute_path == cwd || absolute_path.start_with?(cwd + File::SEPARATOR)

        raise(ArgumentError, "Skill path escapes project boundary: #{identifier}")
      end

      # Resolves a skill by name using recursive discovery.
      #
      # @return [SkillBench::Models::Skill] The resolved skill
      # @raise [ArgumentError] if no skill, or more than one skill, has the given name
      def resolve_by_name
        matches = Models::Skill.discover(base_path).select { |skill| skill.name == identifier }

        raise(ArgumentError, "Skill not found: #{identifier}") if matches.empty?

        if matches.size > 1
          raise(ArgumentError, "Multiple skills found with name '#{identifier}': #{matches.map(&:path).join(', ')}")
        end

        matches.first
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Services
    class TemplateRegistry
      # Value object holding all template data for a single category.
      #
      # Members are deep-frozen on construction so the shared REGISTRY
      # constant cannot be mutated through a nested hash or array.
      CategoryData = Data.define(:requirements, :criteria, :pattern, :code_template) do
        # @param requirements [String] Markdown bullet list of task requirements
        # @param criteria [Hash] Category-specific criteria (focus, required tests)
        # @param pattern [String] One-line description of the pattern
        # @param code_template [String] Code skeleton, may contain {{placeholders}}
        def initialize(requirements:, criteria:, pattern:, code_template:)
          super(
            requirements: deep_freeze(requirements),
            criteria: deep_freeze(criteria),
            pattern: deep_freeze(pattern),
            code_template: deep_freeze(code_template)
          )
        end

        private

        # Returns a frozen copy of the value, recursing into hashes and arrays.
        # Copies are used so the caller's own objects are never frozen in place.
        #
        # @param value [Object]
        # @return [Object] frozen equivalent of +value+
        def deep_freeze(value)
          case value
          when Hash then value.to_h { |key, item| [key, deep_freeze(item)] }.freeze
          when Array then value.map { |item| deep_freeze(item) }.freeze
          when String then value.frozen? ? value : value.dup.freeze
          else value
          end
        end
      end

      # Template data for every supported category, keyed by category symbol.
      REGISTRY = {
        crud: CategoryData.new(
          requirements: "- Implement Create, Read, Update, Delete operations\n- Use Service Object pattern with `.call`\n- Include input validation",
          criteria: { focus: 'data integrity', required_tests: %w[create read update delete] },
          pattern: 'Service Object implementing Create, Read, Update, Delete operations.',
          code_template: "class {{skill_name}}\n def self.call(params)\n new(params).call\n end\nend"
        ),
        api: CategoryData.new(
          requirements: "- Implement API client with proper error handling\n- Use Faraday or Net::HTTP\n- Handle authentication and retries",
          criteria: { focus: 'error handling', required_tests: %w[success failure timeout] },
          pattern: 'Layered API client with Auth, Client, Fetcher, Builder, and Entity layers.',
          code_template: "class {{skill_name}}\n def self.call(endpoint, params = {})\n new(endpoint, params).call\n end\nend"
        ),
        background_job: CategoryData.new(
          requirements: "- Implement as an ActiveJob or Sidekiq worker\n- Include retry logic and error handling\n- Ensure idempotency",
          criteria: { focus: 'reliability', required_tests: %w[perform retry failure] },
          pattern: 'Background job with retry logic, error handling, and idempotency.',
          code_template: "class {{skill_name}} < ApplicationJob\n def perform(*args)\n # job logic\n end\nend"
        ),
        controller: CategoryData.new(
          requirements: "- Follow RESTful conventions\n- Use strong parameters\n- Include proper error responses",
          criteria: { focus: 'REST compliance', required_tests: %w[index show create update destroy] },
          pattern: 'RESTful controller with strong parameters and proper error responses.',
          code_template: "class {{skill_name}}Controller < ApplicationController\n def index; end\n def show; end\nend"
        ),
        model: CategoryData.new(
          requirements: "- Define validations and associations\n- Add scopes for common queries\n- Include callback hooks where appropriate",
          criteria: { focus: 'data modeling', required_tests: %w[validations associations scopes] },
          pattern: 'ActiveRecord model with validations, associations, and scopes.',
          code_template: "class {{skill_name}} < ApplicationRecord\n validates :name, presence: true\nend"
        ),
        migration: CategoryData.new(
          requirements: "- Write reversible migration\n- Include index definitions\n- Handle data migration if needed",
          criteria: { focus: 'reversibility', required_tests: %w[up down] },
          pattern: 'Reversible database migration with indexes and data handling.',
          code_template: "class {{skill_name}} < ActiveRecord::Migration[7.1]\n def change\n # migration logic\n end\nend"
        ),
        concern: CategoryData.new(
          requirements: "- Extract shared behavior into a module\n- Use ActiveSupport::Concern\n- Keep interface minimal",
          criteria: { focus: 'reusability', required_tests: %w[inclusion behavior] },
          pattern: 'ActiveSupport::Concern extracting shared behavior.',
          code_template: "module {{skill_name}}\n extend ActiveSupport::Concern\nend"
        ),
        policy: CategoryData.new(
          requirements: "- Implement authorization checks\n- Follow Pundit or similar patterns\n- Cover all CRUD actions",
          criteria: { focus: 'authorization', required_tests: %w[permitted denied] },
          pattern: 'Authorization policy covering all CRUD actions.',
          code_template: "class {{skill_name}}Policy\n def initialize(user, record)\n @user = user\n @record = record\n end\nend"
        ),
        form_object: CategoryData.new(
          requirements: "- Encapsulate form logic outside the model\n- Include ActiveModel validations\n- Handle nested attributes",
          criteria: { focus: 'validation', required_tests: %w[valid invalid submit] },
          pattern: 'Form object encapsulating validation and persistence logic.',
          code_template: "class {{skill_name}}\n include ActiveModel::Model\n include ActiveModel::Attributes\nend"
        ),
        view_component: CategoryData.new(
          requirements: "- Create a reusable view component\n- Include preview support\n- Add unit tests for rendering",
          criteria: { focus: 'rendering', required_tests: %w[render slots preview] },
          pattern: 'Reusable view component with previews and unit tests.',
          code_template: "class {{skill_name}} < ViewComponent::Base\n def initialize(title:)\n @title = title\n end\nend"
        )
      }.freeze
    end
  end
end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative 'template_registry/category_data'
5
+
6
module SkillBench
  module Services
    # Resolves and renders evaluation templates by type and category.
    #
    # Provides a registry of template strings for generating eval scaffolding
    # (task descriptions, scoring criteria, and skill instructions) across
    # supported Rails pattern categories. Supports variable interpolation
    # using +{{variable_name}}+ syntax.
    #
    # @example Resolve a task template with variables
    #   TemplateRegistry.call(:task_md, :crud, skill_name: "UserCreator")
    #
    # @example Resolve criteria JSON
    #   TemplateRegistry.call(:criteria_json, :api)
    class TemplateRegistry
      TEMPLATE_TYPES = %i[task_md criteria_json skill_md].freeze
      CATEGORIES = REGISTRY.keys.freeze

      # @param template_type [Symbol, String] Template type (:task_md, :criteria_json, :skill_md)
      # @param category [Symbol, String] Category (:crud, :api, :background_job, etc.)
      # @param variables [Hash{Symbol, String => String}] Variables for interpolation
      # @return [String] The rendered template content
      # @raise [ArgumentError] if template_type or category is invalid
      def self.call(template_type, category, variables = {})
        new(template_type, category, variables).call
      end

      # @param template_type [Symbol, String] Template type
      # @param category [Symbol, String] Category
      # @param variables [Hash{Symbol, String => String}, nil] Variables for interpolation
      def initialize(template_type, category, variables = {})
        @template_type = normalize_key(template_type)
        @category = normalize_key(category)
        @variables = variables || {}
      end

      # Resolves the template and applies variable interpolation.
      #
      # Placeholders without a matching variable are left in the output as-is.
      #
      # @return [String] The rendered template content
      # @raise [ArgumentError] if template_type or category is invalid
      def call
        validate_template_type!
        validate_category!

        interpolate(build_template)
      end

      private

      attr_reader :template_type, :category, :variables

      # Converts a type/category argument to a Symbol when possible.
      # Values that cannot be converted (e.g. nil) are passed through so the
      # validators reject them with ArgumentError instead of NoMethodError.
      #
      # @param value [Object]
      # @return [Symbol, Object]
      def normalize_key(value)
        value.respond_to?(:to_sym) ? value.to_sym : value
      end

      # @raise [ArgumentError] if the template type is not one of TEMPLATE_TYPES
      def validate_template_type!
        return if TEMPLATE_TYPES.include?(template_type)

        raise ArgumentError, "Invalid template type: #{template_type}. Valid types: #{TEMPLATE_TYPES.join(', ')}"
      end

      # @raise [ArgumentError] if the category is not one of CATEGORIES
      def validate_category!
        return if CATEGORIES.include?(category)

        raise ArgumentError, "Invalid category: #{category}. Valid categories: #{CATEGORIES.join(', ')}"
      end

      # @return [CategoryData] Template data for the requested category
      def category_data
        REGISTRY.fetch(category)
      end

      # @return [String] The un-interpolated template for the requested type
      def build_template
        case template_type
        when :task_md then build_task_md
        when :criteria_json then build_criteria_json
        when :skill_md then build_skill_md
        end
      end

      # Replaces every +{{key}}+ placeholder with its variable value in a
      # single pass.
      #
      # A single pass with a Hash replacement is used on purpose: substituted
      # values are never re-scanned (so a value containing +{{other}}+ is not
      # expanded again), and backslashes in values are inserted literally
      # rather than being interpreted as back-references.
      #
      # @param template [String]
      # @return [String] A new string with placeholders substituted
      def interpolate(template)
        replacements = placeholder_map
        return template.dup if replacements.empty?

        template.gsub(Regexp.union(replacements.keys), replacements)
      end

      # Maps each placeholder token to its replacement text. When a Symbol and
      # a String key name the same placeholder, the first one given wins.
      #
      # @return [Hash{String => String}]
      def placeholder_map
        variables.each_with_object({}) do |(key, value), map|
          placeholder = "{{#{key}}}"
          map[placeholder] = value.to_s unless map.key?(placeholder)
        end
      end

      # @return [String] Markdown task description for the category
      def build_task_md
        <<~MARKDOWN
          # Task: Implement {{skill_name}} (#{category})

          ## Objective

          Implement a #{category.to_s.tr('_', ' ')} following Rails best practices and the project's established patterns.

          ## Requirements

          #{category_data.requirements}

          ## Acceptance Criteria

          - All tests pass (`bundle exec rake test`)
          - Code follows project conventions
          - YARD documentation for all public methods
          - No rubocop or reek offenses
        MARKDOWN
      end

      # @return [String] Pretty-printed JSON with scoring dimensions and category criteria
      def build_criteria_json
        JSON.pretty_generate(
          category: category.to_s,
          dimensions: [
            { name: 'correctness', weight: 30, pass_threshold: 70 },
            { name: 'adherence', weight: 25, pass_threshold: 60 },
            { name: 'quality', weight: 20, pass_threshold: 60 },
            { name: 'tests', weight: 15, pass_threshold: 80 },
            { name: 'docs', weight: 10, pass_threshold: 50 }
          ],
          minimum_delta: 5,
          category_specific: category_data.criteria
        )
      end

      # @return [String] Markdown skill instructions including the code template
      def build_skill_md
        <<~MARKDOWN
          # Skill: {{skill_name}} (#{category})

          ## Pattern

          #{category_data.pattern}

          ## Hard Rules

          1. Follow TDD — write failing test first, then implement.
          2. Use `.call` class method as entry point (Service Object pattern).
          3. Each class has one responsibility (SRP).
          4. YARD documentation on all public methods.
          5. `rubocop -A` and `reek` must pass.

          ## Template

          ```ruby
          #{category_data.code_template}
          ```
        MARKDOWN
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'file_reader'
4
+ require_relative '../execution/context_hydrator'
5
+ require_relative '../agent/react_agent'
6
+ require_relative '../execution/sandbox'
7
+ require_relative '../judge/judge'
8
+ require_relative '../agent/runner'
9
+ require_relative '../execution/source_path_resolver'
10
+ require_relative '../error_logger'
11
+
12
module SkillBench
  module Task
    # Runs one evaluation task twice (baseline and context-hydrated) through
    # Agent::Runner and has Judge::Judge score the two resulting diffs.
    # @deprecated Use {SkillBench::Evaluation::Runner} instead.
    class Evaluator
      # Convenience entry point: builds an evaluator and runs it.
      #
      # @param full_eval_path [Pathname] The path to the evaluation directory.
      # @param base_path [Pathname] The base path for relative file resolution.
      # @param skill_path [String, nil] Optional override for the source directory.
      # @param client_params [Hash] Parameters to pass to the LLM client.
      # @return [Hash] The result of the task evaluation.
      def self.call(full_eval_path:, base_path:, skill_path: nil, client_params: {})
        new(full_eval_path:, base_path:, skill_path:, client_params:).call
      end

      # @param full_eval_path [Pathname] The path to the evaluation directory.
      # @param base_path [Pathname] The base path for relative file resolution.
      # @param skill_path [String, nil] Optional override for the source directory.
      # @param client_params [Hash] Parameters to pass to the LLM client.
      def initialize(full_eval_path:, base_path:, skill_path:, client_params:)
        @full_eval_path = full_eval_path
        @base_path = base_path
        @skill_path = skill_path
        @client_params = client_params
      end

      # Executes the task evaluation.
      #
      # Failure hashes from the file reader or the judge are returned
      # unchanged; any raised StandardError is logged and converted into a
      # failure hash.
      #
      # @return [Hash] The result of the task evaluation.
      def call
        eval_label = @full_eval_path.relative_path_from(@base_path).to_s

        loaded = FileReader.call(@full_eval_path)
        return loaded unless loaded[:success]

        task_content, criteria_content = loaded[:response].values_at(:task, :criteria)

        source_path = Execution::SourcePathResolver.call(eval_folder_path: eval_label, skill_path: @skill_path)
        return failure('No source path inferred') unless source_path

        baseline_result, baseline_code_diff = run_agent(:baseline, task_content)
        context_result, context_code_diff =
          run_agent(:context, task_content, source_path: source_path, base_path: @base_path)

        judge_score = Judge::Judge.call(task_content, criteria_content, baseline_code_diff, context_code_diff, @client_params)
        return judge_score unless judge_score[:success]

        {
          path: eval_label,
          baseline: baseline_result,
          baseline_diff: baseline_code_diff,
          with_context: context_result,
          context_diff: context_code_diff,
          judge_score: judge_score
        }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Task::Evaluator Error')
        failure("Error evaluating task: #{e.message}")
      end

      private

      # Invokes Agent::Runner for one mode with the arguments shared by both runs.
      #
      # @param mode [Symbol] :baseline or :context
      # @param task_content [String] Contents of task.md
      # @param extra [Hash] Additional keyword arguments forwarded to the runner
      # @return [Object] Whatever Agent::Runner.call returns (destructured by the caller as result and diff)
      def run_agent(mode, task_content, **extra)
        Agent::Runner.call(
          mode: mode,
          full_eval_path: @full_eval_path,
          task_content: task_content,
          client_params: @client_params,
          **extra
        )
      end

      # Builds the failure hash shape used throughout this class.
      #
      # @param message [String]
      # @return [Hash]
      def failure(message)
        { success: false, response: { error: { message: message } } }
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require_relative '../error_logger'
5
+
6
module SkillBench
  module Task
    # Reads task.md and criteria.json files for an evaluation task.
    # Returns structured responses following service object contract.
    class FileReader
      # Reads the task and criteria files from the given evaluation path.
      #
      # @param full_eval_path [Pathname, String] The path to the evaluation directory.
      # @return [Hash] with :success [Boolean] and :response containing file contents or error.
      def self.call(full_eval_path)
        new(full_eval_path).call
      end

      # @param full_eval_path [Pathname, String] The path to the evaluation directory.
      def initialize(full_eval_path)
        @full_eval_path = full_eval_path
      end

      # Reads task.md and criteria.json files.
      #
      # Stops at the first file that cannot be read and returns that file's
      # failure hash unchanged.
      #
      # @return [Hash] with :success [Boolean] and :response containing file contents or error.
      def call
        task_content = read_file('task.md')
        return task_content unless task_content[:success]

        criteria_content = read_file('criteria.json')
        return criteria_content unless criteria_content[:success]

        {
          success: true,
          response: {
            task: task_content[:response][:content],
            criteria: criteria_content[:response][:content]
          }
        }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Task::FileReader Error')
        { success: false, response: { error: { message: "Error reading task files: #{e.message}" } } }
      end

      private

      # Reads a single file from the evaluation path.
      #
      # @param filename [String] The name of the file to read.
      # @return [Hash] with :success [Boolean] and :response containing content or error.
      def read_file(filename)
        # Coerce here (inside the rescue scope) so a String directory works
        # like a Pathname and an unusable value becomes a failure hash.
        file_path = Pathname(@full_eval_path).join(filename)
        unless file_path.exist?
          return {
            success: false,
            response: { error: { message: "File not found: #{file_path}" } }
          }
        end

        { success: true, response: { content: File.read(file_path) } }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, "Task::FileReader##{filename} Error")
        { success: false, response: { error: { message: "Error reading #{filename}: #{e.message}" } } }
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Namespace for the task subsystem.
  #
  # The task subsystem manages individual evaluation tasks,
  # including file reading and task evaluation orchestration.
  #
  # This file only declares the namespace; it defines no constants or
  # methods of its own.
  module Task
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module SkillBench
  module Tools
    # Parses JSON arguments for tools, handling format errors gracefully.
    class ArgumentParser
      # Common prefix of every error message returned by {.call}.
      ERROR_PREFIX = 'Error executing tool: Invalid JSON format for arguments. Please correct it. Details: '.freeze

      # Parses a JSON string of arguments.
      #
      # Tool arguments must be a JSON object. Malformed JSON, non-String
      # input (e.g. nil) and valid JSON that is not an object (arrays,
      # scalars) all produce an error message rather than an exception or a
      # non-Hash value.
      #
      # @param arguments [String] The JSON string to parse.
      # @return [Hash, String] The parsed arguments hash, or an error message string.
      def self.call(arguments)
        parsed = JSON.parse(arguments)
        return parsed if parsed.is_a?(Hash)

        "#{ERROR_PREFIX}expected a JSON object, got #{parsed.class}"
      rescue JSON::ParserError, TypeError => e
        # TypeError covers nil and other non-String input to JSON.parse.
        "#{ERROR_PREFIX}#{e.message}"
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
5
module SkillBench
  module Tools
    # Base functionality for tools, providing common utilities like secure path resolution.
    class Base
      class << self
        protected

        # Sanitizes and resolves a relative path against the working directory.
        # Ensures the resulting path stays within the boundaries of the working directory.
        #
        # @param path [String] The relative path to resolve.
        # @param working_dir_path [Pathname, String] The pathname of the working directory.
        # @return [Pathname] The fully expanded and secure path.
        # @raise [ArgumentError] If path is invalid, attempts traversal, or passes
        #   through a symlink that escapes, dangles, or loops.
        def secure_path(path, working_dir_path)
          validate_input!(path, working_dir_path)

          working_dir = Pathname(working_dir_path).realpath
          full_path = working_dir.join(path).cleanpath
          working_dir_str = working_dir.to_s

          # Lexical check first: after cleanpath, any remaining '..' shows up
          # as a path that no longer starts with the working directory.
          raise ArgumentError, "Path traversal attempt: #{path}" unless inside_dir?(full_path.to_s, working_dir_str)

          # Then check the filesystem, since an existing symlink can point
          # outside even when the lexical path looks contained.
          verify_symlink_safety!(full_path, working_dir, working_dir_str, path)

          full_path
        end

        private

        # Rejects inputs that cannot be a relative path inside the sandbox.
        #
        # @raise [ArgumentError] for non-String, empty, or absolute paths, or a missing working directory
        def validate_input!(path, working_dir_path)
          raise ArgumentError, 'Path must be a string' unless path.is_a?(String)
          raise ArgumentError, 'Working directory must be provided' unless working_dir_path
          raise ArgumentError, 'Path cannot be empty' if path.strip.empty?
          raise ArgumentError, 'Absolute paths are not allowed' if path.start_with?('/')
        end

        # True when +path_str+ is +dir_str+ itself or lies beneath it.
        # The separator is appended so '/work-evil' does not match '/work'.
        def inside_dir?(path_str, dir_str)
          path_str == dir_str || path_str.start_with?(dir_str + File::SEPARATOR)
        end

        # Walks from the target up to the working directory, checking every
        # component so an intermediate symlink cannot lead outside.
        def verify_symlink_safety!(full_path, working_dir, working_dir_str, original_path)
          current = full_path
          while current != working_dir && current.to_s.length > working_dir_str.length
            verify_component_safety!(current, working_dir_str, original_path)
            current = current.dirname
          end
        end

        # Verifies that one path component, if present on disk, resolves to a
        # location inside the working directory.
        #
        # @raise [ArgumentError] if the component escapes, is a dangling symlink, or is part of a symlink loop
        def verify_component_safety!(component, working_dir_str, original_path)
          is_symlink = component.symlink?
          return unless component.exist? || is_symlink

          begin
            real = component.realpath
            raise ArgumentError, "Symlink escapes sandbox: #{original_path}" unless inside_dir?(real.to_s, working_dir_str)
          rescue Errno::ENOENT
            # Re-check symlink status to avoid TOCTOU if the file was replaced between initial check and realpath
            raise ArgumentError, "Dangling symlink: #{original_path}" if component.symlink?
          rescue Errno::ELOOP
            # A symlink cycle cannot be resolved to a real location, so it
            # cannot be proven safe; report it as the documented error type.
            raise ArgumentError, "Symlink loop: #{original_path}"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require_relative 'read_file'
5
+ require_relative 'write_file'
6
+ require_relative 'run_command'
7
+ require_relative 'argument_parser'
8
+
9
+ module SkillBench
10
+ module Tools
11
+ # Dispatches tool execution based on the tool name, coordinating parsing and invocation.
12
+ class Dispatcher
13
+ # Executes a specified tool with the given arguments within a working directory.
14
+ #
15
+ # @param name [String] The name of the tool to execute (e.g., 'read_file').
16
+ # @param arguments [String] A JSON string containing the arguments for the tool.
17
+ # @param working_dir [String] The base directory in which the tool should operate.
18
+ # @param container_id [String, nil] The Docker container ID for isolated execution.
19
+ # @return tool execution result or raises exception.
20
+ # @raise [StandardError] when execution or argument parsing fails
21
+ def self.call(name, arguments, working_dir, container_id = nil)
22
+ args = ArgumentParser.call(arguments)
23
+ return args if args.is_a?(Hash) && args[:success] == false
24
+
25
+ working_dir_path = Pathname.new(working_dir).expand_path
26
+
27
+ execute_tool(name, args, working_dir_path, container_id)
28
+ rescue StandardError => e
29
+ log_error(e)
30
+ raise
31
+ end
32
+
33
+ class << self
34
+ private
35
+
36
+ def log_error(exception)
37
+ msg = "#{exception.message}\n#{exception.backtrace.first(5).join("\n")}"
38
+ if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
39
+ Rails.logger.error(msg)
40
+ elsif !defined?(Minitest)
41
+ warn("Dispatcher Error: #{msg}")
42
+ end
43
+ end
44
+
45
+ def execute_tool(name, args, working_dir_path, container_id)
46
+ path = args['path']
47
+ case name
48
+ when 'read_file'
49
+ ReadFile.call(path, working_dir_path)
50
+ when 'write_file'
51
+ WriteFile.call(path, args['content'], working_dir_path)
52
+ when 'run_command'
53
+ RunCommand.call(args['command'], working_dir_path, container_id)
54
+ else
55
+ raise StandardError, "Unknown tool '#{name}'"
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end