ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,191 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'fileutils'
5
+ require_relative '../services/skill_resolver'
6
+ require_relative '../error_logger'
7
+ require_relative '../models/config'
8
+ require_relative '../models/criteria_validator'
9
+
10
module SkillBench
  module Evaluation
    # Generates an eval (task.md + criteria.json) from a skill's documentation.
    #
    # The eval content is produced by the configured LLM provider. A built-in
    # canned response is used instead when loading the configuration raises
    # Errno::ENOENT or when the provider is named 'mock'.
    class Generator
      # Prompt template used to generate evals from skill documentation via LLM.
      GENERATION_PROMPT = <<~PROMPT
        You are an evaluation designer for a skill-benchmarking tool.

        Given a skill's documentation, create an eval scenario that tests whether an AI agent
        can apply the skill correctly. Output ONLY a JSON object with this exact structure:

        {
        "task": "A detailed task description for the agent to perform. Be specific about what the agent should build or do.",
        "context": "A brief description of what this eval measures.",
        "dimensions": [
        { "name": "correctness", "max_score": 30 },
        { "name": "skill_adherence", "max_score": 25 },
        { "name": "code_quality", "max_score": 20 },
        { "name": "test_coverage", "max_score": 15 },
        { "name": "documentation", "max_score": 10 }
        ],
        "pass_threshold": 70,
        "minimum_delta": 10
        }

        Rules:
        - dimension max_scores MUST sum to exactly 100
        - pass_threshold should be between 60 and 80
        - minimum_delta should be between 5 and 15
        - task should be specific enough that an agent can attempt it in under 5 minutes
        - the eval should test whether the agent follows the patterns from the skill

        Skill documentation:
      PROMPT

      # Directory (relative to the working directory) that holds generated evals.
      EVALS_DIR = 'evals'
      # Files this generator writes into an eval directory.
      GENERATED_FILES = %w[task.md criteria.json].freeze
      # Substrings/characters that must never appear in an eval name:
      # parent-directory hops, path separators, drive/stream colons and NUL.
      FORBIDDEN_NAME_PATTERN = %r{\.\.|[\\/:\x00]}
      # Matches a response that is entirely wrapped in a Markdown code fence
      # (``` or ```json); capture 1 is the fenced payload.
      MARKDOWN_FENCE = /\A\s*```(?:json)?[ \t]*\r?\n(.*?)\r?\n?[ \t]*```\s*\z/mi
      private_constant :EVALS_DIR, :GENERATED_FILES, :FORBIDDEN_NAME_PATTERN, :MARKDOWN_FENCE

      # @param skill_name [String] Name of the skill to base the eval on.
      # @param eval_name [String] Name for the new eval directory.
      def initialize(skill_name:, eval_name:)
        @skill_name = skill_name
        @eval_name = eval_name
      end

      # Generates the eval files under `evals/<eval_name>`.
      #
      # Any StandardError raised along the way is logged and converted into a
      # failure response instead of propagating.
      #
      # @return [Hash] `{ success: true, response: { eval_path: String } }` on
      #   success, otherwise `{ success: false, response: { error: { message: String } } }`
      #   (or the validator's own failure response when criteria validation fails).
      def call
        sanitized = sanitize_eval_name(eval_name)
        return invalid_name_result unless sanitized

        skill = resolve_skill
        return skill_not_found_result unless skill

        generated = generate_eval(read_skill_content(skill.path))
        return generated unless generated[:success]

        persist_and_validate(sanitized, generated[:response][:data])
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Evaluation::Generator Error')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      attr_reader :skill_name, :eval_name

      # Writes the eval files and validates the resulting criteria.json.
      # On validation failure only what this run produced is removed: the
      # whole directory when it was created here, otherwise just the
      # generated files, so unrelated content of an existing eval survives.
      #
      # @param sanitized [String] Validated eval directory name.
      # @param data [Hash] Parsed eval description.
      # @return [Hash] Service response.
      def persist_and_validate(sanitized, data)
        eval_dir = File.join(EVALS_DIR, sanitized)
        created_dir = !File.directory?(eval_dir)
        write_eval_files(sanitized, data)

        validation = SkillBench::Models::CriteriaValidator.call(path: File.join(eval_dir, 'criteria.json'))
        return { success: true, response: { eval_path: "evals/#{sanitized}" } } if validation[:success]

        discard_generated_files(eval_dir, remove_dir: created_dir)
        validation
      end

      # Removes the output of a failed generation.
      #
      # @param eval_dir [String] Eval directory path.
      # @param remove_dir [Boolean] Whether the directory itself was created by this run.
      # @return [void]
      def discard_generated_files(eval_dir, remove_dir:)
        return FileUtils.rm_rf(eval_dir) if remove_dir

        GENERATED_FILES.each { |file_name| FileUtils.rm_f(File.join(eval_dir, file_name)) }
      end

      # Strips the name and rejects anything that could escape the evals
      # directory (blank, '.', '..' sequences, path separators, ':' or NUL).
      #
      # @param name [String, nil] Raw eval name.
      # @return [String, nil] The stripped name, or nil when it is not acceptable.
      def sanitize_eval_name(name)
        stripped = name&.strip
        return nil if stripped.nil? || stripped.empty? || stripped == '.'
        return nil if stripped.match?(FORBIDDEN_NAME_PATTERN)

        stripped
      end

      def invalid_name_result
        { success: false, response: { error: { message: "Invalid eval name: #{eval_name}" } } }
      end

      # @return [Object, nil] The resolved skill, or nil when the resolver raises ArgumentError.
      def resolve_skill
        Services::SkillResolver.call(skill_name)
      rescue ArgumentError
        nil
      end

      def skill_not_found_result
        { success: false, response: { error: { message: "Skill not found: #{skill_name}" } } }
      end

      # @param skill_path [String] Directory of the skill.
      # @return [String] Contents of SKILL.md, or an empty string when the file is absent.
      def read_skill_content(skill_path)
        skill_md = File.join(skill_path, 'SKILL.md')
        File.exist?(skill_md) ? File.read(skill_md) : ''
      end

      # Asks the configured provider for an eval description.
      #
      # @param skill_content [String] Skill documentation appended to the prompt.
      # @return [Hash] Service response carrying the parsed data or an error.
      def generate_eval(skill_content)
        provider = load_provider
        return mock_generate if provider.nil? || provider.name == 'mock'

        # Built only on the real-provider path; the mock path never uses it.
        prompt = GENERATION_PROMPT + "\n\n#{skill_content}"
        client_class = SkillBench::Clients::ProviderRegistry.for(provider.runtime.to_sym)
        response = client_class.call(
          system_prompt: '',
          messages: [{ role: 'user', content: prompt }],
          model: provider.llm,
          **provider.merged_config
        )

        return { success: false, response: { error: { message: 'LLM generation failed' } } } unless response[:success]

        parse_generated_json(response[:result])
      end

      # @return [Object, nil] The provider from the loaded config, or nil when loading raises Errno::ENOENT.
      def load_provider
        config = SkillBench::Models::Config.load
        config.to_provider
      rescue Errno::ENOENT
        nil
      end

      # Canned eval description used when no real provider is available.
      #
      # @return [Hash] Service response carrying the parsed data.
      def mock_generate
        parse_generated_json(<<~JSON)
          {
            "task": "Apply the skill patterns to solve a representative task.",
            "context": "Evaluate skill application",
            "dimensions": [
              { "name": "correctness", "max_score": 30 },
              { "name": "skill_adherence", "max_score": 25 },
              { "name": "code_quality", "max_score": 20 },
              { "name": "test_coverage", "max_score": 15 },
              { "name": "documentation", "max_score": 10 }
            ],
            "pass_threshold": 70,
            "minimum_delta": 10
          }
        JSON
      end

      # Parses the generated text into a Hash. A surrounding Markdown code
      # fence is tolerated, nil is treated as empty input, and any JSON value
      # other than an object is rejected here rather than failing later while
      # the files are being written.
      #
      # @param json_text [String, nil] Raw generated text.
      # @return [Hash] Service response carrying `data` or a parse error.
      def parse_generated_json(json_text)
        data = JSON.parse(strip_markdown_fence(json_text.to_s))
        return parse_failure('expected a JSON object') unless data.is_a?(Hash)

        { success: true, response: { data: data } }
      rescue JSON::ParserError => e
        parse_failure(e.message)
      end

      def parse_failure(detail)
        { success: false, response: { error: { message: "Failed to parse generated eval: #{detail}" } } }
      end

      # @param text [String] Raw generated text.
      # @return [String] The fenced payload when the whole text is one code fence, else the text itself.
      def strip_markdown_fence(text)
        fenced = MARKDOWN_FENCE.match(text)
        fenced ? fenced[1] : text
      end

      # @param sanitized_name [String] Validated eval directory name.
      # @param data [Hash] Parsed eval description (string or symbol keys).
      # @return [void]
      def write_eval_files(sanitized_name, data)
        eval_dir = File.join(EVALS_DIR, sanitized_name)
        FileUtils.mkdir_p(eval_dir)

        File.write(File.join(eval_dir, 'task.md'), data['task'] || data[:task] || '')
        File.write(File.join(eval_dir, 'criteria.json'), JSON.pretty_generate(build_criteria_hash(data)))
      end

      # @param data [Hash] Parsed eval description.
      # @return [Hash] The structure serialized into criteria.json.
      def build_criteria_hash(data)
        {
          context: data.fetch('context', data[:context] || ''),
          dimensions: data.fetch('dimensions', data[:dimensions] || []),
          pass_threshold: extract_numeric(data, 'pass_threshold', 70),
          minimum_delta: extract_numeric(data, 'minimum_delta', 10)
        }
      end

      # Looks the value up under the string key first, then the symbol key.
      # A missing key and an explicit nil (JSON null) both yield the default;
      # any other value is returned untouched and left to the validator.
      #
      # @param data [Hash] Parsed eval description.
      # @param key [String] Key name.
      # @param default [Numeric] Fallback value.
      # @return [Object] The stored value or the default.
      def extract_numeric(data, key, default)
        value = data.key?(key) ? data[key] : data[key.to_sym]
        value.nil? ? default : value
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Evaluation
    # Drives one evaluation: both agent outputs are judged, then the two sets
    # of dimension scores are turned into a delta report.
    #
    # The baseline output is judged without any skill context; the context
    # output is judged together with the skill context.
    class Runner
      # Convenience entry point: builds a runner and executes it.
      #
      # @param task [String] The task description.
      # @param criteria [SkillBench::Criteria] The eval criteria.
      # @param skill_context [String] The skill context XML.
      # @param baseline_output [String] The baseline agent output.
      # @param context_output [String] The context agent output.
      # @param judge_params [Hash] Provider config passed to the Judge as client_params (api_key, model, provider).
      # @return [Hash] Service response with report or error.
      def self.call(task:, criteria:, skill_context:, baseline_output:, context_output:, judge_params: {})
        runner = new(
          task: task,
          criteria: criteria,
          skill_context: skill_context,
          baseline_output: baseline_output,
          context_output: context_output,
          judge_params: judge_params
        )
        runner.call
      end

      # @param task [String] The task description.
      # @param criteria [SkillBench::Criteria] The eval criteria.
      # @param skill_context [String] The skill context XML.
      # @param baseline_output [String] The baseline agent output.
      # @param context_output [String] The context agent output.
      # @param judge_params [Hash] Provider config passed to the Judge as client_params;
      #   anything that is not a Hash is replaced by an empty Hash.
      def initialize(task:, criteria:, skill_context:, baseline_output:, context_output:, judge_params: {})
        @task = task
        @criteria = criteria
        @skill_context = skill_context
        @baseline_output = baseline_output
        @context_output = context_output
        @judge_params = judge_params.is_a?(Hash) ? judge_params : {}
      end

      # Judges the baseline first and the context run second, stopping at the
      # first unsuccessful step, then computes the deltas. Any StandardError
      # is logged and reported as a failure response.
      #
      # @return [Hash] Service response with report or error.
      def call
        verdicts = []
        [[baseline_output, nil], [context_output, skill_context]].each do |agent_output, context|
          verdict = judge_run(agent_output, context)
          return verdict unless verdict[:success]

          verdicts << verdict
        end

        compute_deltas(*verdicts)
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Evaluation::Runner Error')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      attr_reader :task, :criteria, :skill_context, :baseline_output, :context_output, :judge_params

      # Builds the judge prompt for one output and submits it to the judge.
      #
      # @param output [String] Agent output under review.
      # @param context [String, nil] Skill context given to the prompt builder.
      # @return [Hash] The prompt builder's failure response, or the judge's response.
      def judge_run(output, context)
        built = Judge::Prompt.call(task: task, criteria: criteria, skill_context: context, agent_output: output)
        return built unless built[:success]

        judge_prompt = built[:response][:prompt]
        Judge::Judge.call(prompt: judge_prompt, client_params: judge_params)
      end

      # Feeds the dimensions of both judge responses into DeltaReport.
      #
      # @param baseline_judge [Hash] Successful judge response for the baseline output.
      # @param context_judge [Hash] Successful judge response for the context output.
      # @return [Hash] DeltaReport's failure response, or a success response wrapping the report.
      def compute_deltas(baseline_judge, context_judge)
        baseline_dims, context_dims = [baseline_judge, context_judge].map do |verdict|
          verdict[:response][:judge_response].dimensions
        end

        outcome = DeltaReport.call(baseline: baseline_dims, context: context_dims, criteria: criteria)
        return outcome unless outcome[:success]

        { success: true, response: { report: outcome[:response][:delta_report] } }
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Namespace for the evaluation orchestration subsystem: the classes that
  # coordinate evaluation workflows, including blind judging and delta
  # computation, are defined under it.
  module Evaluation; end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require 'cgi'
5
+
6
module SkillBench
  module Execution
    # Responsible for loading source context files from a given path
    # and wrapping them in XML tags for injection into the LLM system prompt.
    #
    # Only regular, non-symlinked files that sit directly in the source
    # directory, carry one of TEXT_EXTENSIONS and are at most MAX_FILE_SIZE
    # bytes are included.
    class ContextHydrator
      # Error message returned when context hydration fails.
      HYDRATION_FAILED = 'Failed to hydrate context from source path'
      # File extensions considered for context hydration.
      TEXT_EXTENSIONS = %w[.md .rb .json .yml .yaml .txt].freeze
      # Maximum file size (in bytes) for files included in context hydration.
      MAX_FILE_SIZE = 50_000

      # Loads and formats source context files.
      #
      # @param params [Hash] The configuration for context hydration.
      # @option params [String] :source_path The path to the source directory containing readable files.
      # @option params [String] :skill_path Deprecated alias for `:source_path`.
      # @option params [Pathname, String] :base_path (optional) The base path to resolve the source directory against.
      # @return [Hash] A result hash with :success, and :response containing the XML formatted context.
      # @raise [TypeError] when the provided base path cannot be converted into a pathname.
      def self.call(params)
        new(**params).call
      end

      # @param source_path [String] The path to the source directory containing readable files.
      # @param skill_path [String] Deprecated alias for source_path.
      # @param base_path [Pathname, String] The base path to resolve the source directory against;
      #   defaults to the current working directory.
      # @return [void]
      # @raise [TypeError] when the provided base path cannot be converted into a pathname.
      def initialize(source_path: nil, skill_path: nil, base_path: nil)
        @source_path = source_path || skill_path
        # Kernel#Pathname accepts both String and Pathname, so either
        # documented form of base_path works from here on.
        @base_path = Pathname(base_path || Dir.pwd)
      end

      # Performs the hydration process.
      #
      # The source directory must resolve to the base directory itself or to
      # a location underneath it. Any StandardError is logged and reported as
      # a failure response.
      #
      # @return [Hash] The standardized result hash indicating success or failure.
      def call
        return missing_path_result unless @source_path

        base_dir = @base_path.expand_path
        source_dir = base_dir.join(@source_path).expand_path
        return missing_path_result unless contained?(source_dir, base_dir) && source_dir.directory?

        xml_context = build_xml(collect_context_files(source_dir))
        { success: true, response: { context: xml_context } }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Hydration Error')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      def missing_path_result
        { success: false, response: { error: { message: "Source path #{@source_path} does not exist or is not a directory" } } }
      end

      # Tells whether candidate is base_dir or lies below it. The comparison
      # is made against base_dir plus a trailing separator so that a sibling
      # such as "/base-other" is not mistaken for a child of "/base".
      #
      # @param candidate [Pathname] Expanded path under test.
      # @param base_dir [Pathname] Expanded base directory.
      # @return [Boolean]
      def contained?(candidate, base_dir)
        return true if candidate == base_dir

        prefix = "#{base_dir.to_path.chomp(File::SEPARATOR)}#{File::SEPARATOR}"
        candidate.to_path.start_with?(prefix)
      end

      # Lists the files eligible for hydration, sorted by path.
      #
      # The glob runs with `base:` so the directory path itself is never
      # interpreted as a pattern, and directories whose name merely ends in a
      # text extension are skipped.
      #
      # @param full_path [Pathname] Expanded source directory.
      # @return [Array<String>] Absolute paths of the selected files.
      def collect_context_files(full_path)
        directory = full_path.to_s
        Dir.glob("*{#{TEXT_EXTENSIONS.join(',')}}", base: directory)
           .map { |file_name| File.join(directory, file_name) }
           .select { |f| !File.symlink?(f) && File.file?(f) && File.size(f) <= MAX_FILE_SIZE }
           .sort
      end

      # Builds the XML structure wrapping the contents of the context files.
      # File paths are reported relative to the expanded base path; paths and
      # contents are HTML-escaped.
      #
      # @param context_files [Array<String>] List of absolute paths to context files.
      # @return [String] The combined XML representation of the file contents ('' when there are no files).
      def build_xml(context_files)
        return '' if context_files.empty?

        # relative_path_from needs two paths of the same kind; the collected
        # files are absolute, so the base is expanded as well.
        base_dir = @base_path.expand_path
        file_nodes = context_files.flat_map do |file_path|
          relative_path = Pathname.new(file_path).relative_path_from(base_dir).to_s
          [
            " <file path=\"#{CGI.escapeHTML(relative_path)}\">",
            CGI.escapeHTML(File.read(file_path)).gsub(/^/, ' '),
            ' </file>'
          ]
        end

        ['<agent_context>', *file_nodes, '</agent_context>'].join("\n")
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'tmpdir'
5
+ require 'open3'
6
+
7
module SkillBench
  module Execution
    # Manages isolated sandbox environments for running agent evaluations.
    # Handles copying files, initializing git, and capturing diffs.
    # When Docker is usable, a container with the sandbox directory mounted
    # at /sandbox is started for the duration of the run.
    class Sandbox
      attr_reader :path, :container_id

      # Runs a block of code within a temporary, isolated sandbox directory.
      # The sandbox is initialized as a git repository and optionally wrapped in a Docker container.
      #
      # @param source_dir [String, Pathname] The directory to copy into the sandbox.
      # @yieldparam sandbox [SkillBench::Execution::Sandbox] The sandbox instance.
      # @return [Object] The result of the yielded block.
      # @raise [SystemCallError] when file operations or directory creation fails.
      # @raise [RuntimeError] when git or Docker commands fail.
      def self.run(source_dir, &block)
        new(source_dir).run(&block)
      end

      # @param source_dir [String, Pathname] The directory to copy into the sandbox.
      def initialize(source_dir)
        @source_dir = source_dir
        @path = nil
        @container_id = nil
      end

      # Executes the sandbox environment setup and yields the sandbox instance.
      # The temporary directory is removed when the block returns, and a
      # started container is stopped even when the block raises.
      #
      # @yieldparam sandbox [SkillBench::Execution::Sandbox] The sandbox instance.
      # @return [Object] The result of the yielded block.
      # @raise [SystemCallError] when file operations or directory creation fails.
      # @raise [RuntimeError] when git or Docker commands fail.
      def run
        Dir.mktmpdir('evaluator_sandbox_') do |sandbox_dir|
          @path = sandbox_dir
          copy_source_files(sandbox_dir)

          setup_git

          start_container if docker_available?
          begin
            yield self
          ensure
            stop_container
          end
        end
      end

      # Captures the git diff of changes made within the sandbox.
      # All changes are staged (`git add .`) as a side effect.
      #
      # @param sandbox_dir [String] The path to the sandbox directory.
      # @return [String] The git diff, or a message indicating no changes.
      # @raise [RuntimeError] when the directory is outside the temp directory or a git command fails.
      # @raise [SystemCallError] when the directory cannot be resolved.
      def self.capture_diff(sandbox_dir)
        sandbox_path = File.realpath(sandbox_dir)
        # Both sides are realpath'd so a symlinked temp directory compares equal.
        tmp_prefix = File.realpath(Dir.tmpdir) + File::SEPARATOR
        raise "Sandbox directory #{sandbox_dir} is outside temp directory" unless sandbox_path.start_with?(tmp_prefix)

        return 'No code changes made.' unless File.directory?(File.join(sandbox_path, '.git'))

        raise "Failed to stage changes in #{sandbox_path}" unless system('git', 'add', '.', chdir: sandbox_path)

        diff, status = Open3.capture2('git', 'diff', '--cached', chdir: sandbox_path)
        raise "Failed to capture diff in #{sandbox_path}" unless status.success?

        diff.strip.empty? ? 'No code changes made.' : diff
      end

      private

      # Turns the sandbox directory into a git repository with one initial commit.
      #
      # @raise [RuntimeError] when any git command fails.
      def setup_git
        cmds = [
          ['git', 'init', '--quiet'],
          ['git', 'config', 'user.email', 'evaluator@tessl.io'],
          ['git', 'config', 'user.name', 'Evaluator Sandbox'],
          ['git', 'add', '.'],
          # --allow-empty keeps the baseline commit from failing when there is
          # nothing to commit (empty source, or a copied .git that is already clean).
          ['git', 'commit', '--quiet', '--allow-empty', '-m', 'Initial commit']
        ]

        cmds.each do |argv|
          raise "Git command failed: #{argv.join(' ')}" unless system(*argv, chdir: @path)
        end
      end

      # Copies source files into the sandbox, including dotfiles.
      # Validates symlinks to prevent path traversal.
      #
      # @param sandbox_dir [String] The destination sandbox directory.
      # @raise [RuntimeError] when a symlink points outside the source directory or forms a directory cycle.
      def copy_source_files(sandbox_dir)
        source_real = File.realpath(@source_dir)
        copy_tree(@source_dir, sandbox_dir, source_real)
      end

      # Recursively copies the entries of src_dir into dst_dir. Symlinks are
      # replaced by a copy of what they point to.
      #
      # @param src_dir [String] Directory being copied.
      # @param dst_dir [String] Existing destination directory.
      # @param source_real [String] Real path of the sandbox source root.
      # @param ancestors [Array<String>] Real paths of the directories currently being copied,
      #   used to detect symlinks that lead back into their own ancestry.
      # @raise [RuntimeError] when a symlink points outside the source directory or forms a directory cycle.
      def copy_tree(src_dir, dst_dir, source_real, ancestors = [])
        chain = ancestors + [File.realpath(src_dir)]

        Dir.children(src_dir).each do |entry|
          src = File.join(src_dir, entry)
          dst = File.join(dst_dir, entry)

          if File.symlink?(src)
            real = File.realpath(src)
            raise "Symlink #{entry} points outside source directory" unless real.start_with?("#{source_real}/")
            # Following a link into a directory we are already inside would never terminate.
            raise "Symlink #{entry} creates a directory cycle" if chain.include?(real)

            copy_item(real, dst, source_real, chain)
          elsif File.directory?(src)
            copy_item(src, dst, source_real, chain)
          else
            FileUtils.cp(src, dst)
          end
        end
      end

      # Copies one directory (recursively) or one file to dst. The destination
      # directory is only created for directories, so a symlinked file is
      # copied as a file named dst.
      #
      # @param src [String] Resolved source path.
      # @param dst [String] Destination path.
      # @param source_real [String] Real path of the sandbox source root.
      # @param ancestors [Array<String>] Real paths of the directories currently being copied.
      def copy_item(src, dst, source_real, ancestors = [])
        if File.directory?(src)
          FileUtils.mkdir_p(dst)
          copy_tree(src, dst, source_real, ancestors)
        else
          FileUtils.cp(src, dst)
        end
      end

      # Checks if the sandbox docker directory exists and `docker info` succeeds.
      #
      # @return [Boolean] true if Docker is available, false otherwise
      #   (including when the docker executable is not installed).
      def docker_available?
        return false unless File.directory?(docker_dir)

        _stdout, _stderr, status = Open3.capture3('docker', 'info')
        status.success?
      rescue Errno::ENOENT
        false
      end

      # @return [String] Absolute path of the `docker` directory next to this file.
      def docker_dir
        File.expand_path('docker', __dir__)
      end

      # Starts a Docker container for isolated command execution.
      # `docker build` is invoked on every start.
      #
      # @raise [RuntimeError] when the Docker image cannot be built or the container fails to start.
      def start_container
        image_name = 'evaluator-sandbox'

        # Build image (Docker layer cache handles no-op builds)
        raise "Failed to build Docker image #{image_name}" unless system('docker', 'build', '-t', image_name, docker_dir, '--quiet')

        # Start a detached container mounting the sandbox dir to /sandbox
        stdout, stderr, status = Open3.capture3(
          'docker', 'run', '-d', '--rm', '-v', "#{@path}:/sandbox", image_name
        )

        raise "Failed to start Docker container: #{stderr}" unless status.success?

        @container_id = stdout.strip
      end

      # Stops the container if one was started.
      def stop_container
        return unless @container_id

        # Stop and remove the container (it's --rm so stopping also removes it)
        # We don't fail-fast on stop to avoid swallowing the original error if this is in an ensure block
        system('docker', 'stop', @container_id, out: File::NULL, err: File::NULL)
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Execution
    # Maps an eval directory to the skill or workflow directory it evaluates.
    class SourcePathResolver
      class << self
        # Returns the explicit override when one is given; otherwise derives
        # the source path from the segments of the eval directory path,
        # trying the `skills` convention first and `workflows` second.
        #
        # @param eval_folder_path [String] Relative path to the eval directory.
        # @param skill_path [String, nil] Optional explicit override for the source directory.
        # @return [String, nil] The resolved source path relative to the evaluator repo root, or nil if unmappable.
        # @example Two segments after `skills` (skill, eval):
        #   SkillBench::Execution::SourcePathResolver.call(
        #     eval_folder_path: 'evals/skills/rails-code-review/review-order'
        #   )
        #   # => "skills/rails-code-review"
        # @example Three or more segments after `skills` (category, skill, eval):
        #   SkillBench::Execution::SourcePathResolver.call(
        #     eval_folder_path: 'evals/skills/code-quality/rails-code-review/review-order'
        #   )
        #   # => "skills/code-quality/rails-code-review"
        def call(eval_folder_path:, skill_path: nil)
          override = skill_path
          return override if override && !override.empty?

          parts = eval_folder_path.to_s.split('/').reject(&:empty?)
          from_skills = resolve_skills_path(parts)
          from_skills || resolve_workflows_path(parts)
        end

        private

        # Resolves against the last `skills` segment, if any.
        #
        # @param segments [Array<String>] Non-empty path segments.
        # @return [String, nil]
        def resolve_skills_path(segments)
          marker = segments.rindex('skills')
          return nil if marker.nil?

          tail = segments[(marker + 1)..]
          resolve_old_format_skills(tail) || resolve_new_format_skills(tail)
        end

        # Category layout: needs at least category, skill and eval segments.
        #
        # @param remaining [Array<String>] Segments following `skills`.
        # @return [String, nil]
        def resolve_old_format_skills(remaining)
          return nil if remaining.size < 3

          category, skill_name = remaining.first(2)
          "skills/#{category}/#{skill_name}"
        end

        # Flat layout: the first segment after `skills` is the skill name.
        #
        # @param remaining [Array<String>] Segments following `skills`.
        # @return [String, nil]
        def resolve_new_format_skills(remaining)
          return nil if remaining.empty?

          "skills/#{remaining.first}"
        end

        # Resolves against the last `workflows` segment, if any.
        #
        # @param segments [Array<String>] Non-empty path segments.
        # @return [String, nil] nil when `workflows` is absent or is the final segment.
        def resolve_workflows_path(segments)
          marker = segments.rindex('workflows')
          return nil if marker.nil?

          workflow_name = segments[marker + 1]
          workflow_name ? "workflows/#{workflow_name}" : nil
        end
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Namespace for the execution environment subsystem: the classes that
  # provide isolated execution environments for agent evaluation, including
  # sandbox management and context hydration, are defined under it.
  module Execution; end
end