ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
require_relative '../services/skill_resolver'
|
|
6
|
+
require_relative '../error_logger'
|
|
7
|
+
require_relative '../models/config'
|
|
8
|
+
require_relative '../models/criteria_validator'
|
|
9
|
+
|
|
10
|
+
module SkillBench
|
|
11
|
+
module Evaluation
|
|
12
|
+
# Generates an eval (task.md + criteria.json) from a skill's documentation.
#
# The skill's SKILL.md is appended to GENERATION_PROMPT and sent to the
# configured provider. When loading the config raises Errno::ENOENT, or the
# provider is named "mock", a canned eval is used instead. The files are
# written under evals/<eval_name>/ and criteria.json is then checked with
# SkillBench::Models::CriteriaValidator.
class Generator
  # Prompt template used to generate evals from skill documentation via LLM.
  GENERATION_PROMPT = <<~PROMPT
    You are an evaluation designer for a skill-benchmarking tool.

    Given a skill's documentation, create an eval scenario that tests whether an AI agent
    can apply the skill correctly. Output ONLY a JSON object with this exact structure:

    {
    "task": "A detailed task description for the agent to perform. Be specific about what the agent should build or do.",
    "context": "A brief description of what this eval measures.",
    "dimensions": [
    { "name": "correctness", "max_score": 30 },
    { "name": "skill_adherence", "max_score": 25 },
    { "name": "code_quality", "max_score": 20 },
    { "name": "test_coverage", "max_score": 15 },
    { "name": "documentation", "max_score": 10 }
    ],
    "pass_threshold": 70,
    "minimum_delta": 10
    }

    Rules:
    - dimension max_scores MUST sum to exactly 100
    - pass_threshold should be between 60 and 80
    - minimum_delta should be between 5 and 15
    - task should be specific enough that an agent can attempt it in under 5 minutes
    - the eval should test whether the agent follows the patterns from the skill

    Skill documentation:
  PROMPT

  # @param skill_name [String] Name of the skill to base the eval on.
  # @param eval_name [String] Name for the new eval directory.
  def initialize(skill_name:, eval_name:)
    @skill_name = skill_name
    @eval_name = eval_name
  end

  # Generates the eval files.
  #
  # @return [Hash] Service response: `{ success: true, response: { eval_path: } }`
  #   on success, otherwise `{ success: false, response: { error: { message: } } }`
  #   (a failed criteria validation returns the validator's own response).
  def call
    sanitized = sanitize_eval_name(eval_name)
    return invalid_name_result unless sanitized

    skill = resolve_skill
    return skill_not_found_result unless skill

    generated = generate_eval(read_skill_content(skill.path))
    return generated unless generated[:success]

    eval_dir = File.join('evals', sanitized)
    # Only a directory created by this run may be removed on failure; a
    # pre-existing eval directory can hold files this generator never wrote.
    created_here = !File.exist?(eval_dir)
    write_eval_files(sanitized, generated[:response][:data])

    validation = SkillBench::Models::CriteriaValidator.call(path: File.join(eval_dir, 'criteria.json'))
    unless validation[:success]
      FileUtils.rm_rf(eval_dir) if created_here
      return validation
    end

    { success: true, response: { eval_path: "evals/#{sanitized}" } }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'Evaluation::Generator Error')
    failure(e.message)
  end

  private

  attr_reader :skill_name, :eval_name

  # Builds the standard failure response used throughout this class.
  #
  # @param message [String] Human-readable error message.
  # @return [Hash] Failed service response.
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end

  # Validates the eval name so it can be used as a single directory name.
  #
  # @param name [String, nil] Raw eval name.
  # @return [String, nil] The stripped name, or nil when it is blank, is ".",
  #   contains "..", a path separator, a colon or a NUL byte.
  def sanitize_eval_name(name)
    stripped = name&.strip
    return nil if stripped.nil? || stripped.empty? || stripped == '.'
    # A leading "/" is already covered by the separator class below; NUL is
    # rejected here so File.join never raises on it later.
    return nil if stripped.include?('..') || stripped.match?(%r{[\\/:\0]})

    stripped
  end

  # @return [Hash] Failed service response for an unusable eval name.
  def invalid_name_result
    failure("Invalid eval name: #{eval_name}")
  end

  # Looks the skill up through the resolver service.
  #
  # @return [Object, nil] Whatever the resolver returns, or nil when it raises ArgumentError.
  def resolve_skill
    Services::SkillResolver.call(skill_name)
  rescue ArgumentError
    nil
  end

  # @return [Hash] Failed service response for an unknown skill.
  def skill_not_found_result
    failure("Skill not found: #{skill_name}")
  end

  # Reads the skill's SKILL.md.
  #
  # @param skill_path [String] Directory of the skill.
  # @return [String] File content, or an empty string when SKILL.md is absent.
  def read_skill_content(skill_path)
    skill_md = File.join(skill_path, 'SKILL.md')
    File.exist?(skill_md) ? File.read(skill_md) : ''
  end

  # Asks the configured provider for an eval definition.
  #
  # @param skill_content [String] Skill documentation appended to the prompt.
  # @return [Hash] Service response carrying the parsed eval under `response[:data]`.
  def generate_eval(skill_content)
    provider = load_provider
    return mock_generate if provider.nil? || provider.name == 'mock'

    # Built only once a real provider is known to be in use.
    prompt = GENERATION_PROMPT + "\n\n#{skill_content}"
    client_class = SkillBench::Clients::ProviderRegistry.for(provider.runtime.to_sym)
    response = client_class.call(
      system_prompt: '',
      messages: [{ role: 'user', content: prompt }],
      model: provider.llm,
      **provider.merged_config
    )
    return failure('LLM generation failed') unless response[:success]

    parse_generated_json(response[:result])
  end

  # @return [Object, nil] Provider derived from the loaded config, or nil when
  #   loading raises Errno::ENOENT.
  def load_provider
    SkillBench::Models::Config.load.to_provider
  rescue Errno::ENOENT
    nil
  end

  # Canned eval used when no real provider is configured.
  #
  # @return [Hash] Successful service response with the default eval definition.
  def mock_generate
    parse_generated_json(<<~JSON)
      {
        "task": "Apply the skill patterns to solve a representative task.",
        "context": "Evaluate skill application",
        "dimensions": [
          { "name": "correctness", "max_score": 30 },
          { "name": "skill_adherence", "max_score": 25 },
          { "name": "code_quality", "max_score": 20 },
          { "name": "test_coverage", "max_score": 15 },
          { "name": "documentation", "max_score": 10 }
        ],
        "pass_threshold": 70,
        "minimum_delta": 10
      }
    JSON
  end

  # Parses the generated text into an eval definition.
  #
  # A nil or non-String input, malformed JSON and JSON that is not an object
  # all produce a failure response instead of an exception. A Markdown code
  # fence around the payload is removed before parsing.
  #
  # @param json_text [String, nil] Raw text returned by the provider.
  # @return [Hash] Service response; on success `response[:data]` is a Hash.
  def parse_generated_json(json_text)
    data = JSON.parse(strip_code_fence(json_text.to_s))
    return failure('Failed to parse generated eval: expected a JSON object') unless data.is_a?(Hash)

    { success: true, response: { data: data } }
  rescue JSON::ParserError => e
    failure("Failed to parse generated eval: #{e.message}")
  end

  # Removes a surrounding ``` / ```json fence, if present.
  #
  # @param text [String] Raw provider output.
  # @return [String] The fenced payload, or the stripped text when unfenced.
  def strip_code_fence(text)
    stripped = text.strip
    fenced = stripped.match(/\A```[a-zA-Z]*\s*\n(.*?)\n?```\z/m)
    fenced ? fenced[1] : stripped
  end

  # Writes task.md and criteria.json into evals/<sanitized_name>/.
  #
  # @param sanitized_name [String] Validated eval directory name.
  # @param data [Hash] Parsed eval definition.
  # @return [void]
  def write_eval_files(sanitized_name, data)
    eval_dir = File.join('evals', sanitized_name)
    FileUtils.mkdir_p(eval_dir)

    File.write(File.join(eval_dir, 'task.md'), data['task'] || data[:task] || '')
    File.write(File.join(eval_dir, 'criteria.json'), JSON.pretty_generate(build_criteria_hash(data)))
  end

  # Selects the criteria fields from the eval definition, accepting string or
  # symbol keys and filling in defaults for missing ones.
  #
  # @param data [Hash] Parsed eval definition.
  # @return [Hash] Content of criteria.json.
  def build_criteria_hash(data)
    {
      context: data.fetch('context', data[:context] || ''),
      dimensions: data.fetch('dimensions', data[:dimensions] || []),
      pass_threshold: extract_numeric(data, 'pass_threshold', 70),
      minimum_delta: extract_numeric(data, 'minimum_delta', 10)
    }
  end

  # Fetches a value by string key, then by symbol key, then falls back.
  # The value is returned as stored; no numeric coercion is applied.
  #
  # @param data [Hash] Parsed eval definition.
  # @param key [String] Key name.
  # @param default [Object] Value used when neither key form is present.
  # @return [Object] The stored value or the default.
  def extract_numeric(data, key, default)
    return data[key] if data.key?(key)

    sym = key.to_sym
    data.key?(sym) ? data[sym] : default
  end
end
|
|
190
|
+
end
|
|
191
|
+
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Evaluation
|
|
5
|
+
# Drives the evaluation pipeline.
#
# Judges the baseline output (without skill context) and the context output
# (with skill context), then turns the two sets of dimension scores into a
# delta report.
class Runner
  # Builds a runner from the given arguments and executes it.
  #
  # @param task [String] The task description.
  # @param criteria [SkillBench::Criteria] The eval criteria.
  # @param skill_context [String] The skill context XML.
  # @param baseline_output [String] The baseline agent output.
  # @param context_output [String] The context agent output.
  # @param judge_params [Hash] Provider config passed to the Judge as client_params (api_key, model, provider).
  # @return [Hash] Service response with report or error.
  def self.call(task:, criteria:, skill_context:, baseline_output:, context_output:, judge_params: {})
    runner = new(
      task: task,
      criteria: criteria,
      skill_context: skill_context,
      baseline_output: baseline_output,
      context_output: context_output,
      judge_params: judge_params
    )
    runner.call
  end

  # @param task [String] The task description.
  # @param criteria [SkillBench::Criteria] The eval criteria.
  # @param skill_context [String] The skill context XML.
  # @param baseline_output [String] The baseline agent output.
  # @param context_output [String] The context agent output.
  # @param judge_params [Hash] Provider config passed to the Judge as client_params;
  #   anything that is not a Hash is replaced by an empty Hash.
  def initialize(task:, criteria:, skill_context:, baseline_output:, context_output:, judge_params: {})
    @task = task
    @criteria = criteria
    @skill_context = skill_context
    @baseline_output = baseline_output
    @context_output = context_output
    @judge_params = judge_params.is_a?(Hash) ? judge_params : {}
  end

  # Judges both outputs (baseline first) and computes the deltas.
  # The first failed step's response is returned unchanged.
  #
  # @return [Hash] Service response with report or error.
  def call
    without_skill = judge_run(baseline_output, nil)
    return without_skill unless without_skill[:success]

    with_skill = judge_run(context_output, skill_context)
    return with_skill unless with_skill[:success]

    compute_deltas(without_skill, with_skill)
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'Evaluation::Runner Error')
    { success: false, response: { error: { message: e.message } } }
  end

  private

  attr_reader :task, :criteria, :skill_context, :baseline_output, :context_output, :judge_params

  # Builds the judge prompt for one agent output and submits it to the judge.
  #
  # @param output [String] Agent output to be judged.
  # @param context [String, nil] Skill context shown to the judge, if any.
  # @return [Hash] The prompt builder's failure response, or the judge's response.
  def judge_run(output, context)
    built = Judge::Prompt.call(task: task, criteria: criteria, skill_context: context, agent_output: output)
    return built unless built[:success]

    Judge::Judge.call(prompt: built[:response][:prompt], client_params: judge_params)
  end

  # Compares the judged dimensions of both runs.
  #
  # @param baseline_judge [Hash] Successful judge response for the baseline output.
  # @param context_judge [Hash] Successful judge response for the context output.
  # @return [Hash] Service response with the delta report, or DeltaReport's failure response.
  def compute_deltas(baseline_judge, context_judge)
    baseline_dims = baseline_judge[:response][:judge_response].dimensions
    context_dims = context_judge[:response][:judge_response].dimensions
    outcome = DeltaReport.call(baseline: baseline_dims, context: context_dims, criteria: criteria)

    if outcome[:success]
      { success: true, response: { report: outcome[:response][:delta_report] } }
    else
      outcome
    end
  end
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require 'cgi'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Execution
|
|
8
|
+
# Responsible for loading source context files from a given path
# and wrapping them in XML tags for injection into the LLM system prompt.
#
# Only regular, non-symlinked files directly inside the source directory
# whose names end in one of TEXT_EXTENSIONS and whose size does not exceed
# MAX_FILE_SIZE are included.
class ContextHydrator
  # Error message returned when context hydration fails.
  HYDRATION_FAILED = 'Failed to hydrate context from source path'
  # File extensions considered for context hydration.
  TEXT_EXTENSIONS = %w[.md .rb .json .yml .yaml .txt].freeze
  # Maximum file size (in bytes) for files included in context hydration.
  MAX_FILE_SIZE = 50_000

  # Loads and formats source context files.
  #
  # @param params [Hash] The configuration for context hydration.
  # @option params [String] :source_path The path to the source directory containing readable files.
  # @option params [String] :skill_path Deprecated alias for `:source_path`.
  # @option params [Pathname, String] :base_path (optional) The base path to resolve the source directory against.
  # @return [Hash] A result hash with :success, and :response containing the XML formatted context.
  # @raise [TypeError] when the provided base path cannot be converted into a pathname.
  def self.call(params)
    new(**params).call
  end

  # @param source_path [String] The path to the source directory containing readable files.
  # @param skill_path [String] Deprecated alias for source_path.
  # @param base_path [Pathname, String] The base path to resolve the source directory against;
  #   defaults to the current working directory.
  # @return [void]
  # @raise [TypeError] when the provided base path cannot be converted into a pathname.
  def initialize(source_path: nil, skill_path: nil, base_path: nil)
    @source_path = source_path || skill_path
    # Pathname.new accepts both String and Pathname, so a String base_path
    # gets a real path join instead of String#join.
    @base_path = Pathname.new(base_path || Dir.pwd)
  end

  # Performs the hydration process.
  #
  # @return [Hash] The standardized result hash indicating success or failure.
  def call
    return missing_path_result unless @source_path

    base_dir = @base_path.expand_path
    full_path = base_dir.join(@source_path).expand_path

    return missing_path_result unless within_base?(full_path, base_dir)
    return missing_path_result unless full_path.directory?

    xml_context = build_xml(collect_context_files(full_path))
    { success: true, response: { context: xml_context } }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'Hydration Error')
    { success: false, response: { error: { message: e.message } } }
  end

  private

  # @return [Hash] Failure response used for a missing, escaping or non-directory source path.
  def missing_path_result
    { success: false, response: { error: { message: "Source path #{@source_path} does not exist or is not a directory" } } }
  end

  # Checks containment component by component, so that a sibling such as
  # "/base-secrets" is not mistaken for a child of "/base" (which a plain
  # string-prefix comparison would allow).
  #
  # @param path [Pathname] Expanded candidate path.
  # @param base_dir [Pathname] Expanded base directory.
  # @return [Boolean] true when path is base_dir itself or lies beneath it.
  def within_base?(path, base_dir)
    path.ascend.any? { |ancestor| ancestor == base_dir }
  end

  # Lists the files that qualify for hydration, sorted by path.
  #
  # @param full_path [Pathname] Expanded source directory.
  # @return [Array<String>] Absolute paths of the selected files.
  def collect_context_files(full_path)
    directory = full_path.to_s
    # `base:` keeps the directory out of the glob pattern, so characters
    # such as "[" or "{" in the path are not interpreted as glob syntax.
    Dir.glob("*{#{TEXT_EXTENSIONS.join(',')}}", base: directory)
       .map { |name| File.join(directory, name) }
       .select { |file| File.file?(file) && !File.symlink?(file) && File.size(file) <= MAX_FILE_SIZE }
       .sort
  end

  # Builds the XML structure wrapping the contents of the context files.
  #
  # @param context_files [Array<String>] List of absolute paths to context files.
  # @return [String] The combined XML representation of the file contents,
  #   or an empty string when there are no files.
  def build_xml(context_files)
    return '' if context_files.empty?

    # File paths are absolute, so the base must be expanded as well;
    # relative_path_from raises when one side is relative.
    base_dir = @base_path.expand_path
    xml = ['<agent_context>']

    context_files.each do |file_path|
      relative_path = Pathname.new(file_path).relative_path_from(base_dir).to_s
      content = File.read(file_path)

      xml << " <file path=\"#{CGI.escapeHTML(relative_path)}\">"
      xml << CGI.escapeHTML(content).gsub(/^/, ' ')
      xml << ' </file>'
    end

    xml << '</agent_context>'
    xml.join("\n")
  end
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'tmpdir'
|
|
5
|
+
require 'open3'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Execution
|
|
9
|
+
# Manages isolated sandbox environments for running agent evaluations.
# Handles copying files, initializing git, and capturing diffs.
# When Docker is usable, a container with the sandbox directory mounted at
# /sandbox is started for the duration of the run.
class Sandbox
  # Name of the Docker image built for the sandbox container.
  IMAGE_NAME = 'evaluator-sandbox'

  attr_reader :path, :container_id

  # Runs a block of code within a temporary, isolated sandbox directory.
  # The sandbox is initialized as a git repository and optionally wrapped in a Docker container.
  #
  # @param source_dir [String, Pathname] The directory to copy into the sandbox.
  # @yieldparam sandbox [SkillBench::Execution::Sandbox] The sandbox instance.
  # @return [Object] The result of the yielded block.
  # @raise [SystemCallError] when file operations or directory creation fails.
  # @raise [RuntimeError] when git or Docker commands fail, or a symlink is rejected.
  def self.run(source_dir, &)
    new(source_dir).run(&)
  end

  # @param source_dir [String, Pathname] The directory to copy into the sandbox.
  def initialize(source_dir)
    @source_dir = source_dir
    @path = nil
    @container_id = nil
  end

  # Executes the sandbox environment setup and yields the sandbox instance.
  # The temporary directory is removed when the block returns.
  #
  # @yieldparam sandbox [SkillBench::Execution::Sandbox] The sandbox instance.
  # @return [Object] The result of the yielded block.
  # @raise [SystemCallError] when file operations or directory creation fails.
  # @raise [RuntimeError] when git or Docker commands fail, or a symlink is rejected.
  def run
    Dir.mktmpdir('evaluator_sandbox_') do |sandbox_dir|
      @path = sandbox_dir
      copy_source_files(sandbox_dir)

      setup_git

      start_container if docker_available?
      begin
        yield self
      ensure
        stop_container
      end
    end
  end

  # Captures the git diff of changes made within the sandbox.
  #
  # @param sandbox_dir [String] The path to the sandbox directory.
  # @return [String] The git diff, or a message indicating no changes.
  # @raise [SystemCallError] when the sandbox directory cannot be resolved.
  # @raise [RuntimeError] when the directory is outside the temp directory,
  #   or staging / diffing fails.
  def self.capture_diff(sandbox_dir)
    sandbox_path = File.realpath(sandbox_dir)
    tmp_prefix = File.realpath(Dir.tmpdir) + File::SEPARATOR
    raise "Sandbox directory #{sandbox_dir} is outside temp directory" unless sandbox_path.start_with?(tmp_prefix)

    return 'No code changes made.' unless File.directory?(File.join(sandbox_path, '.git'))

    raise "Failed to stage changes in #{sandbox_path}" unless system('git', 'add', '.', chdir: sandbox_path)

    diff, status = Open3.capture2('git', 'diff', '--cached', chdir: sandbox_path)
    raise "Failed to capture diff in #{sandbox_path}" unless status.success?

    diff.strip.empty? ? 'No code changes made.' : diff
  end

  private

  # Initializes a git repository in the sandbox and records a baseline commit.
  #
  # @raise [RuntimeError] when any git command exits unsuccessfully.
  def setup_git
    cmds = [
      ['git', 'init', '--quiet'],
      ['git', 'config', 'user.email', 'evaluator@tessl.io'],
      ['git', 'config', 'user.name', 'Evaluator Sandbox'],
      ['git', 'add', '.'],
      # --allow-empty: an empty source tree (or an already-clean copied repo)
      # has nothing to commit, which would otherwise fail the whole setup.
      ['git', 'commit', '--quiet', '--allow-empty', '-m', 'Initial commit']
    ]

    cmds.each do |argv|
      raise "Git command failed: #{argv.join(' ')}" unless system(*argv, chdir: @path)
    end
  end

  # Copies source files into the sandbox, including dotfiles.
  # Validates symlinks to prevent path traversal.
  #
  # @param sandbox_dir [String] The destination sandbox directory.
  # @raise [RuntimeError] when a symlink points outside the source directory
  #   or forms a cycle.
  def copy_source_files(sandbox_dir)
    source_real = File.realpath(@source_dir)
    copy_tree(@source_dir, sandbox_dir, source_real)
  end

  # Recursively copies the entries of src_dir into dst_dir. Symlinks are
  # replaced by a copy of their target.
  #
  # @param src_dir [String] Directory being copied.
  # @param dst_dir [String] Existing destination directory.
  # @param source_real [String] Real path of the top-level source directory.
  # @param visiting [Array<String>] Real paths of the directories currently
  #   being copied on this recursion chain (used to detect symlink cycles).
  # @raise [RuntimeError] when a symlink points outside the source directory
  #   or leads back into a directory that is still being copied.
  def copy_tree(src_dir, dst_dir, source_real, visiting = [])
    dir_real = File.realpath(src_dir)
    raise "Symlink cycle detected at #{src_dir}" if visiting.include?(dir_real)

    trail = visiting + [dir_real]

    Dir.children(src_dir).each do |entry|
      src = File.join(src_dir, entry)
      dst = File.join(dst_dir, entry)

      if File.symlink?(src)
        real = File.realpath(src)
        raise "Symlink #{entry} points outside source directory" unless real.start_with?("#{source_real}/")

        copy_item(real, dst, source_real, trail)
      elsif File.directory?(src)
        copy_item(src, dst, source_real, trail)
      else
        FileUtils.cp(src, dst)
      end
    end
  end

  # Copies one directory (recursively) or one file to dst.
  #
  # @param src [String] Source directory or file.
  # @param dst [String] Destination path.
  # @param source_real [String] Real path of the top-level source directory.
  # @param visiting [Array<String>] See {#copy_tree}.
  def copy_item(src, dst, source_real, visiting = [])
    if File.directory?(src)
      # The destination directory is created only for directories; creating
      # it for a file would turn the copy into "dst/<basename>".
      FileUtils.mkdir_p(dst)
      copy_tree(src, dst, source_real, visiting)
    else
      FileUtils.cp(src, dst)
    end
  end

  # @return [String] Directory expected to hold the sandbox Docker build context.
  def docker_dir
    File.expand_path('docker', __dir__)
  end

  # Checks that the sandbox Docker directory exists and `docker info` succeeds.
  #
  # @return [Boolean] true if Docker is available, false otherwise
  #   (including when the docker executable is not found).
  def docker_available?
    return false unless File.directory?(docker_dir)

    _stdout, _stderr, status = Open3.capture3('docker', 'info')
    status.success?
  rescue Errno::ENOENT
    false
  end

  # Starts a Docker container for isolated command execution.
  # `docker build` is invoked on every call; an unchanged image is left to
  # Docker's layer cache.
  #
  # @raise [RuntimeError] when the Docker image cannot be built or the container fails to start.
  def start_container
    image_name = IMAGE_NAME

    # Build image (Docker layer cache handles no-op builds)
    raise "Failed to build Docker image #{image_name}" unless system('docker', 'build', '-t', image_name, docker_dir, '--quiet')

    # Start a detached container mounting the sandbox dir to /sandbox
    stdout, stderr, status = Open3.capture3(
      'docker', 'run', '-d', '--rm', '-v', "#{@path}:/sandbox", image_name
    )

    raise "Failed to start Docker container: #{stderr}" unless status.success?

    @container_id = stdout.strip
  end

  # Stops the container started by {#start_container}, if any.
  def stop_container
    return unless @container_id

    # Stop and remove the container (it's --rm so stopping also removes it)
    # We don't fail-fast on stop to avoid swallowing the original error if this is in an ensure block
    system('docker', 'stop', @container_id, out: File::NULL, err: File::NULL)
  end
end
|
|
173
|
+
end
|
|
174
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Execution
|
|
5
|
+
# Maps an eval directory to the skill or workflow directory it evaluates.
class SourcePathResolver
  class << self
    # Returns the explicit override when one is given; otherwise derives the
    # source path from the eval directory layout.
    #
    # @param eval_folder_path [String] Relative path to the eval directory.
    # @param skill_path [String, nil] Optional explicit override for the source directory.
    # @return [String, nil] The resolved source path relative to the evaluator repo root, or nil if unmappable.
    # @example Infer a skill source path (NEW format):
    #   SkillBench::Execution::SourcePathResolver.call(
    #     eval_folder_path: 'evals/skills/rails-code-review/review-order'
    #   )
    #   # => "skills/rails-code-review"
    # @example Infer a skill source path (OLD format, returns category):
    #   SkillBench::Execution::SourcePathResolver.call(
    #     eval_folder_path: 'evals/skills/code-quality/rails-code-review/review-order'
    #   )
    #   # => "skills/code-quality/rails-code-review"
    def call(eval_folder_path:, skill_path: nil)
      return skill_path if skill_path && !skill_path.empty?

      parts = eval_folder_path.to_s.split('/').reject(&:empty?)
      resolve_skills_path(parts) || resolve_workflows_path(parts)
    end

    private

    # Resolves from the segments following the last "skills" segment,
    # trying the three-level (old) layout before the flat (new) one.
    #
    # @param segments [Array<String>] Non-empty path segments.
    # @return [String, nil]
    def resolve_skills_path(segments)
      index = segments.rindex('skills')
      return nil unless index

      tail = segments[(index + 1)..]
      resolve_old_format_skills(tail) || resolve_new_format_skills(tail)
    end

    # Old layout: <category>/<skill>/<eval>, i.e. at least three segments.
    #
    # @param remaining [Array<String>] Segments after "skills".
    # @return [String, nil]
    def resolve_old_format_skills(remaining)
      return nil if remaining.size < 3

      category, skill_name = remaining
      "skills/#{category}/#{skill_name}"
    end

    # New layout: <skill>[/<eval>], i.e. at least one segment.
    #
    # @param remaining [Array<String>] Segments after "skills".
    # @return [String, nil]
    def resolve_new_format_skills(remaining)
      return nil if remaining.empty?

      "skills/#{remaining.first}"
    end

    # Resolves from the segment following the last "workflows" segment.
    #
    # @param segments [Array<String>] Non-empty path segments.
    # @return [String, nil]
    def resolve_workflows_path(segments)
      index = segments.rindex('workflows')
      return nil unless index

      workflow_name = segments[index + 1]
      workflow_name && "workflows/#{workflow_name}"
    end
  end
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  # Execution-environment subsystem namespace.
  #
  # Groups the classes that give agent evaluations an isolated place to run:
  # sandbox management and context hydration.
  module Execution; end
end
|