ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
data/bin/skill-bench
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __dir__) # prepend ../lib (relative to this script's directory) to the load path
|
|
5
|
+
require 'skill_bench'
|
|
6
|
+
|
|
7
|
+
begin # run the CLI and translate its outcome into a process exit status
|
|
8
|
+
exit SkillBench::CLI.call(ARGV) # the CLI's return value is used as the exit status
|
|
9
|
+
rescue Interrupt # raised when the user interrupts the process (e.g. Ctrl-C)
|
|
10
|
+
warn "\nInterrupted."
|
|
11
|
+
exit 130 # 128 + 2 (SIGINT): conventional shell status for an interrupted process
|
|
12
|
+
rescue StandardError => e # any other error: print only its message to stderr, no backtrace
|
|
13
|
+
warn "Error: #{e.message}"
|
|
14
|
+
exit 1 # generic failure status
|
|
15
|
+
end
|
|
data/docs/architecture.md
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# SkillBench Architecture
|
|
2
|
+
|
|
3
|
+
Ruby Skill Bench provides a reproducible and isolated environment for testing AI agents. It consists of several decoupled components that orchestrate the evaluation flow.
|
|
4
|
+
|
|
5
|
+
## High-Level Flow
|
|
6
|
+
|
|
7
|
+
1. **`RunnerService`**: The entry point. Resolves eval, skill, and provider, then runs baseline and context agents.
|
|
8
|
+
2. **`Sandbox`**: Creates a temporary directory, copies task files, and initializes a Git repository for clean, reproducible runs.
|
|
9
|
+
3. **`ContextHydrator`**: Loads skill documentation (`.md`, `.rb`, `.json`, `.yml`, `.yaml`, and `.txt` files, up to 50KB each) and wraps it in XML for the agent's system prompt.
|
|
10
|
+
4. **`ReactAgent`**: Autonomous agent following a **Thought → Tool → Observation** loop.
|
|
11
|
+
5. **`EvaluationRunner`**: Orchestrates blind judging — builds `JudgePrompt` for baseline and context outputs, calls `Judge` twice, then computes deltas via `DeltaReport`.
|
|
12
|
+
6. **`DeltaReport`**: Computes per-dimension deltas and determines verdict based on `pass_threshold` and `minimum_delta`.
|
|
13
|
+
7. **`Client`**: Provider-agnostic abstraction for LLM backends.
|
|
14
|
+
|
|
15
|
+
## Key Components
|
|
16
|
+
|
|
17
|
+
### `SkillBench::Services::RunnerService`
|
|
18
|
+
|
|
19
|
+
- Resolves eval, skill, and provider configuration.
|
|
20
|
+
- Runs baseline agent (no skill context) and context agent (with skill context).
|
|
21
|
+
- Delegates judging and delta computation to `EvaluationRunner`.
|
|
22
|
+
- Falls back to mock provider when config is unavailable.
|
|
23
|
+
|
|
24
|
+
### `SkillBench::EvaluationRunner`
|
|
25
|
+
|
|
26
|
+
- Builds `JudgePrompt` for baseline and context outputs.
|
|
27
|
+
- Calls `Judge` twice (blind scoring).
|
|
28
|
+
- Uses `DeltaReport` to compute per-dimension deltas and final verdict.
|
|
29
|
+
|
|
30
|
+
### `SkillBench::DeltaReport`
|
|
31
|
+
|
|
32
|
+
- Computes baseline vs context deltas per dimension.
|
|
33
|
+
- A passing verdict requires both conditions: `context_total >= pass_threshold` AND `total_delta >= minimum_delta`.
|
|
34
|
+
|
|
35
|
+
### `SkillBench::CLI` Commands
|
|
36
|
+
|
|
37
|
+
- `InitCommand` — Creates `skill-bench.json` configuration
|
|
38
|
+
- `RunCommand` — Executes evaluations
|
|
39
|
+
- `SkillCommand` — Scaffolds new skills with templates
|
|
40
|
+
- `EvalCommand` — Creates evaluation scenarios
|
|
41
|
+
|
|
42
|
+
### `SkillBench::Services::TemplateRegistry`
|
|
43
|
+
|
|
44
|
+
- Provides pre-built templates for generating eval scaffolding
|
|
45
|
+
- Supports three template types: `task_md`, `criteria_json`, `skill_md`
|
|
46
|
+
- Offers 10 Rails pattern categories: `crud`, `api`, `background_job`, `controller`, `model`, `migration`, `concern`, `policy`, `form_object`, `view_component`
|
|
47
|
+
- Enables variable interpolation using `{{variable_name}}` syntax
|
|
48
|
+
- Used for programmatic eval creation and tool building
|
|
49
|
+
|
|
50
|
+
### `SkillBench::Sandbox`
|
|
51
|
+
|
|
52
|
+
- Uses `Dir.mktmpdir` for isolation.
|
|
53
|
+
- Captures state changes using `git diff`.
|
|
54
|
+
- Validates sandbox path to prevent directory traversal.
|
|
55
|
+
- Cleans up automatically after execution.
|
|
56
|
+
|
|
57
|
+
### `SkillBench::ReactAgent`
|
|
58
|
+
|
|
59
|
+
- Implements a stateful loop.
|
|
60
|
+
- Supports tool usage (e.g., `read_file`, `write_file`, `run_shell_command`).
|
|
61
|
+
- Manages conversation history.
|
|
62
|
+
|
|
63
|
+
### `SkillBench::Clients::BaseClient`
|
|
64
|
+
|
|
65
|
+
- Implements the **Template Method** pattern.
|
|
66
|
+
- Handles Faraday connection setup and timeouts.
|
|
67
|
+
- Centralizes error logging and response normalization.
|
|
68
|
+
- Delegates to `ResponseParser`, `ResponseErrorHandler`, and `RequestBuilder`.
|
|
69
|
+
|
|
70
|
+
### `SkillBench::OutputFormatter`
|
|
71
|
+
|
|
72
|
+
- Formats results as human-readable text, JSON, or JUnit XML
|
|
73
|
+
- Human format displays a dimension table with baseline, context, and delta columns
|
|
74
|
+
- Escapes XML output to prevent injection
|
|
75
|
+
- Provides exit codes for CI/CD integration
|
|
76
|
+
|
|
77
|
+
### `SkillBench::ErrorLogger`
|
|
78
|
+
|
|
79
|
+
- Shared error logging module for all service objects
|
|
80
|
+
- Logs error message and full backtrace
|
|
81
|
+
- Uses `Rails.logger` when available, falls back to `warn`
|
|
82
|
+
|
|
83
|
+
## Data Flow: What Passes Between Components
|
|
84
|
+
|
|
85
|
+
Understanding what data moves between components helps debug issues and write better evals.
|
|
86
|
+
|
|
87
|
+
### Flow 1: RunnerService → EvaluationRunner
|
|
88
|
+
|
|
89
|
+
```ruby
|
|
90
|
+
# RunnerService builds this and passes it to EvaluationRunner.call
|
|
91
|
+
evaluation = {
|
|
92
|
+
task: "Create a UserRegistrationService...", # from task.md
|
|
93
|
+
criteria: <Criteria object>, # from criteria.json
|
|
94
|
+
skill_context: "<agent_context>...SKILL.md...</agent_context>", # from ContextHydrator
|
|
95
|
+
baseline_output: '{"result":"...","status":":success"}', # from baseline agent run
|
|
96
|
+
context_output: '{"result":"...","status":":success"}' # from context agent run
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Flow 2: EvaluationRunner → Judge (two calls)
|
|
101
|
+
|
|
102
|
+
```ruby
|
|
103
|
+
# First call — baseline (no skill context)
|
|
104
|
+
JudgePrompt.call(
|
|
105
|
+
task: task,
|
|
106
|
+
criteria: criteria,
|
|
107
|
+
skill_context: "", # empty string for baseline
|
|
108
|
+
agent_output: baseline_output
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
# Second call — context (with skill context)
|
|
112
|
+
JudgePrompt.call(
|
|
113
|
+
task: task,
|
|
114
|
+
criteria: criteria,
|
|
115
|
+
skill_context: skill_context, # XML-wrapped SKILL.md
|
|
116
|
+
agent_output: context_output
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Flow 3: Judge → JudgeResponse
|
|
121
|
+
|
|
122
|
+
The judge returns a JSON string like:
|
|
123
|
+
|
|
124
|
+
```json
|
|
125
|
+
{
|
|
126
|
+
"dimensions": {
|
|
127
|
+
"correctness": { "score": 28, "max_score": 30, "reasoning": "All requirements met." },
|
|
128
|
+
"skill_adherence": { "score": 22, "max_score": 25, "reasoning": "Used .call pattern correctly." }
|
|
129
|
+
},
|
|
130
|
+
"overall_reasoning": "Solid implementation."
|
|
131
|
+
}
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
`JudgeResponse` parses this, validates that scores are numeric and within bounds, and returns a structured object.
|
|
135
|
+
|
|
136
|
+
### Flow 4: DeltaReport → Output
|
|
137
|
+
|
|
138
|
+
```ruby
|
|
139
|
+
# DeltaReport receives two JudgeResponse objects
|
|
140
|
+
baseline = {
|
|
141
|
+
'correctness' => { score: 12, max_score: 30 },
|
|
142
|
+
'skill_adherence' => { score: 5, max_score: 25 }
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
context = {
|
|
146
|
+
'correctness' => { score: 28, max_score: 30 },
|
|
147
|
+
'skill_adherence' => { score: 22, max_score: 25 }
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
# Produces:
|
|
151
|
+
deltas = {
|
|
152
|
+
'correctness' => 16, # 28 - 12
|
|
153
|
+
'skill_adherence' => 17 # 22 - 5
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
baseline_total = 17 # 12 + 5
|
|
157
|
+
context_total = 50 # 28 + 22
|
|
158
|
+
verdict = context_total >= pass_threshold && (context_total - baseline_total) >= minimum_delta
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Directory Structure
|
|
162
|
+
|
|
163
|
+
The evaluator relies on a strict directory convention:
|
|
164
|
+
|
|
165
|
+
```text
|
|
166
|
+
project-root/
|
|
167
|
+
├── skill-bench.json # Provider configuration
|
|
168
|
+
├── skills/
|
|
169
|
+
│ └── my-service/
|
|
170
|
+
│ └── SKILL.md # Skill instructions
|
|
171
|
+
├── evals/
|
|
172
|
+
│ └── my-first-eval/
|
|
173
|
+
│ ├── task.md # Agent prompt
|
|
174
|
+
│ └── criteria.json # Scoring rules
|
|
175
|
+
└── .skill-bench-history.json # Benchmark history (auto-generated)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Skill Discovery
|
|
179
|
+
|
|
180
|
+
Skills are discovered recursively. These are all valid:
|
|
181
|
+
|
|
182
|
+
```text
|
|
183
|
+
skills/my-service/SKILL.md
|
|
184
|
+
skills/api/rest-collection/SKILL.md
|
|
185
|
+
skills/workflows/tdd-loop/SKILL.md
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
The `SkillResolver` walks `skills/` recursively and matches by directory name.
|
|
189
|
+
|
|
190
|
+
### Eval Discovery
|
|
191
|
+
|
|
192
|
+
Evals are resolved in this order:
|
|
193
|
+
|
|
194
|
+
1. If the path contains `/`, use it as-is (e.g., `evals/my-eval`)
|
|
195
|
+
2. Otherwise, prepend `evals/` (e.g., `my-eval` → `evals/my-eval`)
|
|
196
|
+
|
|
197
|
+
The eval directory contains the following files (only `task.md` is required):
|
|
198
|
+
|
|
199
|
+
- `task.md` — the agent prompt
|
|
200
|
+
- `criteria.json` — the scoring rules (optional; defaults to empty criteria if missing)
|