ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
# Testing Guide: Evaluations & Workflows
|
|
2
|
+
|
|
3
|
+
This guide explains how to run evaluations and how to create new evaluation tasks for skills and workflows.
|
|
4
|
+
|
|
5
|
+
## Running Evaluations
|
|
6
|
+
|
|
7
|
+
The primary tool for running evaluations is the `skill-bench` CLI.
|
|
8
|
+
|
|
9
|
+
### Basic Usage
|
|
10
|
+
|
|
11
|
+
To run a specific evaluation task:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
skill-bench run my-eval --skill=my-skill
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Provider is read from `skill-bench.json` — no `--provider` flag needed.
|
|
18
|
+
|
|
19
|
+
### Output Formats
|
|
20
|
+
|
|
21
|
+
**Human-readable (default):**
|
|
22
|
+
|
|
23
|
+
```text
|
|
24
|
+
═══════════════════════════════════════════════════════
|
|
25
|
+
Eval: my-eval
|
|
26
|
+
Skill: my-skill
|
|
27
|
+
Provider: openai
|
|
28
|
+
═══════════════════════════════════════════════════════
|
|
29
|
+
|
|
30
|
+
DIMENSION BASELINE CONTEXT DELTA
|
|
31
|
+
──────────────────────── ───────── ───────── ───────
|
|
32
|
+
Correctness (30) 12 28 +16
|
|
33
|
+
Skill Adherence (25) 5 22 +17
|
|
34
|
+
Code Quality (20) 10 16 +6
|
|
35
|
+
Test Coverage (15) 3 13 +10
|
|
36
|
+
Documentation (10) 2 8 +6
|
|
37
|
+
──────────────────────── ───────── ───────── ───────
|
|
38
|
+
TOTAL 32/100 87/100 +55
|
|
39
|
+
|
|
40
|
+
VERDICT: PASS (threshold: 70, minimum delta: 10)
|
|
41
|
+
═══════════════════════════════════════════════════════
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
**JSON:**
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
skill-bench run my-eval --skill=my-skill --format json
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
**JUnit XML:**
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
skill-bench run my-eval --skill=my-skill --format=junit
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Running an Eval by Path
|
|
57
|
+
|
|
58
|
+
To run an eval with a path containing a slash:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
skill-bench run evals/my-eval --skill=my-skill
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
The evaluator resolves the path automatically.
|
|
65
|
+
|
|
66
|
+
### Overriding Skill Context
|
|
67
|
+
|
|
68
|
+
By default, the evaluator infers the skill path from the evaluation path. If you need to test an evaluation against a different skill:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
skill-bench run my-eval --skill=skills/custom-skill
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Creating New Evaluations
|
|
75
|
+
|
|
76
|
+
An evaluation task consists of a directory containing at least two files: `task.md` and `criteria.json`.
|
|
77
|
+
|
|
78
|
+
### 1. The Task (`task.md`)
|
|
79
|
+
|
|
80
|
+
This file contains the instructions for the AI agent. It should describe a specific problem to solve or a feature to implement.
|
|
81
|
+
|
|
82
|
+
**Best Practices:**
|
|
83
|
+
|
|
84
|
+
- Provide clear context and requirements.
|
|
85
|
+
- Include a description of the current codebase state.
|
|
86
|
+
- Specify the desired outcome.
|
|
87
|
+
- List acceptance criteria as numbered items (the judge checks these).
|
|
88
|
+
|
|
89
|
+
**Example — Good task.md:**
|
|
90
|
+
|
|
91
|
+
```markdown
|
|
92
|
+
Create a `PasswordValidator` class that:
|
|
93
|
+
|
|
94
|
+
1. Accepts a `password` string
|
|
95
|
+
2. Validates minimum length of 8 characters
|
|
96
|
+
3. Validates presence of at least one uppercase letter
|
|
97
|
+
4. Validates presence of at least one digit
|
|
98
|
+
5. Returns `{ valid: true }` or `{ valid: false, errors: [...] }`
|
|
99
|
+
6. Includes RSpec tests with 100% branch coverage
|
|
100
|
+
7. Uses `# frozen_string_literal: true`
|
|
101
|
+
8. Has YARD docs for the class and all public methods
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Why this works:** Each numbered item is a discrete acceptance criterion the judge can verify independently. Vague tasks like "create a password validator" produce inconsistent scores because the judge has to guess what "good" means.
|
|
105
|
+
|
|
106
|
+
### 2. The Criteria (`criteria.json`)
|
|
107
|
+
|
|
108
|
+
This file defines the evaluation dimensions, weights, and thresholds:
|
|
109
|
+
|
|
110
|
+
```json
|
|
111
|
+
{
|
|
112
|
+
"context": "Evaluate whether the skill helps build a proper API REST collection",
|
|
113
|
+
"dimensions": [
|
|
114
|
+
{ "name": "correctness", "max_score": 30 },
|
|
115
|
+
{ "name": "skill_adherence", "max_score": 25 },
|
|
116
|
+
{ "name": "code_quality", "max_score": 20 },
|
|
117
|
+
{ "name": "test_coverage", "max_score": 15 },
|
|
118
|
+
{ "name": "documentation", "max_score": 10 }
|
|
119
|
+
],
|
|
120
|
+
"pass_threshold": 70,
|
|
121
|
+
"minimum_delta": 10
|
|
122
|
+
}
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**Fields:**
|
|
126
|
+
|
|
127
|
+
| Field | Type | Required | Description |
|
|
128
|
+
|-------|------|----------|-------------|
|
|
129
|
+
| `context` | string | Yes | Shown to the judge. Describes what the eval measures. |
|
|
130
|
+
| `dimensions` | array | Yes | Array of `{ name, max_score }` objects. Must include all 5 core dimensions. `max_score` values must sum to exactly 100. |
|
|
131
|
+
| `pass_threshold` | integer | No | Minimum context score to pass. Default: 70. |
|
|
132
|
+
| `minimum_delta` | integer | No | Minimum improvement over baseline to pass. Default: 10. |
|
|
133
|
+
| `description` (per dimension) | string | No | Overrides the built-in default description for that dimension. |
|
|
134
|
+
|
|
135
|
+
**Custom dimension descriptions** are especially useful when a skill has specific hard rules. For example, if your skill requires the `.call` pattern, you can tell the judge exactly what to look for:
|
|
136
|
+
|
|
137
|
+
```json
|
|
138
|
+
{
|
|
139
|
+
"name": "skill_adherence",
|
|
140
|
+
"max_score": 25,
|
|
141
|
+
"description": "Did the agent create a class with a `.call` class method that returns `{ success: bool, response: { ... } }`?"
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
This produces more consistent scores than the generic default description.
|
|
146
|
+
|
|
147
|
+
### 3. What the Judge Sees
|
|
148
|
+
|
|
149
|
+
Understanding the judge prompt helps you write better tasks and criteria. The judge receives a structured prompt with five sections (the Skill Context section is omitted in the baseline call):
|
|
150
|
+
|
|
151
|
+
```text
|
|
152
|
+
## Task
|
|
153
|
+
[Contents of task.md]
|
|
154
|
+
|
|
155
|
+
## Criteria
|
|
156
|
+
Context: [Contents of criteria.json context]
|
|
157
|
+
Dimensions:
|
|
158
|
+
- correctness: max_score=30, description=...
|
|
159
|
+
- skill_adherence: max_score=25, description=...
|
|
160
|
+
...
|
|
161
|
+
|
|
162
|
+
## Skill Context
|
|
163
|
+
[Contents of SKILL.md wrapped in XML]
|
|
164
|
+
|
|
165
|
+
## Agent Output
|
|
166
|
+
[Git diff + file listing + reasoning excerpt]
|
|
167
|
+
|
|
168
|
+
## Instructions
|
|
169
|
+
Score each dimension independently. Return JSON with:
|
|
170
|
+
- "dimensions": object mapping each dimension name to { "score": number, "max_score": number, "reasoning": string }
|
|
171
|
+
- "overall_reasoning": string summarizing the evaluation
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Important:** The judge is called **twice** per eval — once for baseline output (no skill context section) and once for context output (with skill context). The judge never sees both outputs in the same call. This prevents the judge from being biased by direct comparison.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Evaluating Workflows vs. Skills
|
|
179
|
+
|
|
180
|
+
### Atomic Skills
|
|
181
|
+
|
|
182
|
+
Skills are isolated blocks of logic (e.g., a specific API pattern). Evaluations for skills should focus strictly on the adherence to the patterns defined in the skill's `SKILL.md`.
|
|
183
|
+
|
|
184
|
+
**Recommended weights for atomic skills:**
|
|
185
|
+
|
|
186
|
+
```json
|
|
187
|
+
{
|
|
188
|
+
"dimensions": [
|
|
189
|
+
{ "name": "correctness", "max_score": 30 },
|
|
190
|
+
{ "name": "skill_adherence", "max_score": 30 },
|
|
191
|
+
{ "name": "code_quality", "max_score": 20 },
|
|
192
|
+
{ "name": "test_coverage", "max_score": 10 },
|
|
193
|
+
{ "name": "documentation", "max_score": 10 }
|
|
194
|
+
],
|
|
195
|
+
"pass_threshold": 70,
|
|
196
|
+
"minimum_delta": 10
|
|
197
|
+
}
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Skill Adherence is raised to share the top weight with Correctness (30 each) because the core question is "did the skill help?"
|
|
201
|
+
|
|
202
|
+
### Workflows
|
|
203
|
+
|
|
204
|
+
Workflows are sequences of skills or complex orchestrations (e.g., the full TDD loop). Evaluations for workflows should focus on the process, the ordering of tasks, and the successful completion of a multi-step objective.
|
|
205
|
+
|
|
206
|
+
**Recommended weights for workflows:**
|
|
207
|
+
|
|
208
|
+
```json
|
|
209
|
+
{
|
|
210
|
+
"dimensions": [
|
|
211
|
+
{ "name": "correctness", "max_score": 35 },
|
|
212
|
+
{ "name": "skill_adherence", "max_score": 20 },
|
|
213
|
+
{ "name": "code_quality", "max_score": 20 },
|
|
214
|
+
{ "name": "test_coverage", "max_score": 15 },
|
|
215
|
+
{ "name": "documentation", "max_score": 10 }
|
|
216
|
+
],
|
|
217
|
+
"pass_threshold": 65,
|
|
218
|
+
"minimum_delta": 15
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Correctness is weighted higher because workflows are judged on end-to-end success. The `minimum_delta` is also higher (15 vs 10) because workflows are expected to show stronger skill impact.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Interpreting the Output
|
|
227
|
+
|
|
228
|
+
### Human-Readable Format
|
|
229
|
+
|
|
230
|
+
```text
|
|
231
|
+
═══════════════════════════════════════════════════════
|
|
232
|
+
Eval: my-eval
|
|
233
|
+
Skill: my-skill
|
|
234
|
+
Provider: openai
|
|
235
|
+
═══════════════════════════════════════════════════════
|
|
236
|
+
|
|
237
|
+
DIMENSION BASELINE CONTEXT DELTA
|
|
238
|
+
──────────────────────── ───────── ───────── ───────
|
|
239
|
+
Correctness (30) 12 28 +16
|
|
240
|
+
Skill Adherence (25) 5 22 +17
|
|
241
|
+
Code Quality (20) 10 16 +6
|
|
242
|
+
Test Coverage (15) 3 13 +10
|
|
243
|
+
Documentation (10) 2 8 +6
|
|
244
|
+
──────────────────────── ───────── ───────── ───────
|
|
245
|
+
TOTAL 32/100 87/100 +55
|
|
246
|
+
|
|
247
|
+
TREND: baseline ↑ (+2), context ↑ (+7)
|
|
248
|
+
VERDICT: PASS (threshold: 70, minimum delta: 10)
|
|
249
|
+
═══════════════════════════════════════════════════════
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
**Reading the table:**
|
|
253
|
+
|
|
254
|
+
- **BASELINE:** What the agent produced *without* the skill. Think of this as "raw" ability.
|
|
255
|
+
- **CONTEXT:** What the agent produced *with* the skill. Think of this as "aided" ability.
|
|
256
|
+
- **DELTA:** The improvement. `+16` means the skill added 16 points to that dimension.
|
|
257
|
+
- **TOTAL:** Sum of all dimension scores. The `/100` reminds you of the maximum.
|
|
258
|
+
|
|
259
|
+
**Verdict logic:**
|
|
260
|
+
|
|
261
|
+
```ruby
|
|
262
|
+
pass = context_total >= pass_threshold && total_delta >= minimum_delta
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
Both must be true. This prevents two failure modes:
|
|
266
|
+
|
|
267
|
+
1. **High absolute, no improvement:** baseline=80, context=80, delta=0 → FAIL (skill didn't help)
|
|
268
|
+
2. **Low absolute, small improvement:** baseline=10, context=20, delta=10 → FAIL (still terrible)
|
|
269
|
+
|
|
270
|
+
**TREND line:**
|
|
271
|
+
|
|
272
|
+
```text
|
|
273
|
+
TREND: baseline ↑ (+2), context ↑ (+7)
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
This compares the current run against the **previous run of the same eval + skill** (stored in `.skill-bench-history.json`).
|
|
277
|
+
|
|
278
|
+
- `↑` = improved since last run
|
|
279
|
+
- `↓` = regressed since last run
|
|
280
|
+
- `→` = unchanged
|
|
281
|
+
|
|
282
|
+
The numbers in parentheses are the point differences. This helps you track whether your skill is getting better over time.
|
|
283
|
+
|
|
284
|
+
### JSON Format
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
skill-bench run my-eval --skill=my-skill --format json
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Returns a structured hash with:
|
|
291
|
+
|
|
292
|
+
- `eval_name`, `skill_name`, `provider_name`
|
|
293
|
+
- `report` containing: `verdict`, `baseline_total`, `context_total`, `deltas`, `baseline_scores`, `context_scores`, `criteria`
|
|
294
|
+
- `trend` (if history exists): `baseline_trend`, `context_trend`, `baseline_delta`, `context_delta`, `previous_run`
|
|
295
|
+
|
|
296
|
+
Useful for CI/CD pipelines and automated reporting.
|
|
297
|
+
|
|
298
|
+
### JUnit XML Format
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
skill-bench run my-eval --skill=my-skill --format junit
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
Returns standard JUnit XML. Useful for GitHub Actions, Jenkins, and other CI systems that parse JUnit reports.
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
## Running the Test Suite
|
|
309
|
+
|
|
310
|
+
The project uses Minitest with 440+ tests covering:
|
|
311
|
+
|
|
312
|
+
- Core evaluation engine (`test/evaluator/`)
|
|
313
|
+
- CLI commands and models (`test/agent_eval/`)
|
|
314
|
+
- Provider clients (`test/clients/`)
|
|
315
|
+
- Skill services (`test/skills/`)
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
# Run all tests
|
|
319
|
+
bundle exec rake test
|
|
320
|
+
|
|
321
|
+
# Run with coverage report
|
|
322
|
+
bundle exec rake test COVERAGE=true
|
|
323
|
+
|
|
324
|
+
# Run specific test file
|
|
325
|
+
bundle exec ruby -Itest test/integration_test.rb
|
|
326
|
+
|
|
327
|
+
# Run lint checks
|
|
328
|
+
bundle exec rake rubocop
|
|
329
|
+
bundle exec rake reek
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
### Test Isolation
|
|
333
|
+
|
|
334
|
+
Tests use temporary directories and restore the original working directory:
|
|
335
|
+
|
|
336
|
+
```ruby
|
|
337
|
+
def setup
|
|
338
|
+
@original_dir = Dir.pwd
|
|
339
|
+
@tmp_dir = Dir.mktmpdir('test')
|
|
340
|
+
Dir.chdir(@tmp_dir)
|
|
341
|
+
end
|
|
342
|
+
|
|
343
|
+
def teardown
|
|
344
|
+
Dir.chdir(@original_dir)
|
|
345
|
+
FileUtils.rm_rf(@tmp_dir)
|
|
346
|
+
end
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Environment Variable Handling
|
|
350
|
+
|
|
351
|
+
Tests that modify ENV must restore original values:
|
|
352
|
+
|
|
353
|
+
```ruby
|
|
354
|
+
def test_something
|
|
355
|
+
original_key = ENV.fetch('SKILL_BENCH_OPENAI_API_KEY', nil)
|
|
356
|
+
ENV.delete('SKILL_BENCH_OPENAI_API_KEY')
|
|
357
|
+
# ... test code ...
|
|
358
|
+
ensure
|
|
359
|
+
ENV['SKILL_BENCH_OPENAI_API_KEY'] = original_key if original_key
|
|
360
|
+
end
|
|
361
|
+
```
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'step'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  module Agent
    class ReactAgent
      # Drives the ReAct loop: runs one Step after another until a step reports
      # that it is finished or the iteration budget is used up.
      class LoopRunner
        class << self
          # Runs the loop.
          #
          # @param initial_prompt [String] The user task; becomes the first message of the conversation.
          # @param max_iterations [Integer] Upper bound on the number of steps executed.
          # @param config [Hash] Passed through unchanged to every Step.call.
          # @return [Hash] The final result hash, with the collected iteration
          #   metadata stored under response[:iterations]. Any StandardError raised
          #   while looping is logged and converted into a failure result.
          def call(initial_prompt, max_iterations, config)
            conversation = [{ role: 'user', content: initial_prompt }]
            history = []
            step_number = 0

            while step_number < max_iterations
              step_number += 1
              outcome = Step.call(conversation, config)

              entry = outcome[:iteration]
              history << attach_step_number(entry, step_number) if entry

              if outcome[:continue]
                # The step hands back the extended conversation for the next round.
                conversation = outcome[:messages]
                next
              end

              finished = outcome[:result] || failure_result('Step returned no result')
              return merge_iterations(finished, history)
            end

            merge_iterations(failure_result(Agent::ReactAgent::MAX_ITERATIONS_REACHED), history)
          rescue StandardError => e
            SkillBench::ErrorLogger.log_error(e, 'ReactAgent Error')
            merge_iterations(failure_result(e.message), history)
          end

          # Returns a copy of the iteration hash tagged with its step number.
          #
          # @param iteration [Hash] Iteration metadata produced by a Step.
          # @param step_count [Integer] The 1-based number of the step that produced it.
          # @return [Hash] A new hash equal to +iteration+ plus :step_number.
          def attach_step_number(iteration, step_count)
            iteration.merge(step_number: step_count)
          end

          # Stores the collected iterations inside the result's :response hash.
          #
          # @param result [Hash] The final result hash from the loop.
          # @param iterations_log [Array<Hash>] Iteration metadata gathered so far.
          # @return [Hash] A new result whose :response carries :iterations; a
          #   missing :response is treated as an empty hash.
          def merge_iterations(result, iterations_log)
            payload = result[:response] || {}
            result.merge(response: payload.merge(iterations: iterations_log))
          end

          private

          # Builds the standard failure hash carrying the given error message.
          def failure_result(message)
            { success: false, response: { error: { message: message } } }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../../client'
|
|
4
|
+
require_relative 'tool_executor'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  module Agent
    class ReactAgent
      # Service object responsible for executing a single step of the ReAct loop:
      # one Client call, followed by execution of any tool calls the response requests.
      class Step
        # Executes one iteration of reasoning and potential tool usage.
        #
        # @param messages [Array<Hash>] The conversation history. The array itself is
        #   not modified; a shallow copy is extended and returned.
        # @param config [Hash] Configuration for this step. Reads :system_prompt,
        #   :client_params (optional, defaults to no extra params), :working_dir and
        #   :container_id.
        # @return [Hash] Step outcome. Always contains :continue (boolean) and
        #   :iteration (metadata hash). When :continue is false it contains :result;
        #   when :continue is true it contains the extended :messages.
        def self.call(messages, config)
          messages = messages.dup
          client_result = Client.call(
            system_prompt: config[:system_prompt],
            messages: messages,
            tools: Tools.definitions,
            # Splatting nil raises TypeError on Ruby versions before 3.4, so a
            # missing :client_params falls back to an empty hash.
            **(config[:client_params] || {})
          )

          unless client_result[:success]
            error_msg = client_result.dig(:response, :error, :message) || 'Unknown error'
            return halt(client_result, observation_summary: error_msg)
          end

          response_msg = client_result.dig(:response, :message)
          unless response_msg
            empty = 'Empty response from LLM'
            return halt({ success: false, response: { error: { message: empty } } }, observation_summary: empty)
          end

          messages << response_msg

          content = response_msg['content']
          thought = content.to_s
          tool_calls = Array(response_msg['tool_calls'])

          # No tool calls means the model produced its final answer.
          return halt({ success: true, response: { content: content } }, thought: thought) if tool_calls.empty?

          unless thought.strip.empty?
            warn "\n=== Agent Thought ==="
            warn content
          end

          # Pass the normalized array so the executor and the summary below see the same list.
          tool_results = ToolExecutor.call(tool_calls, config[:working_dir], config[:container_id])
          messages.concat(tool_results)

          tools_used = tool_calls.map { |tc| tc.dig('function', 'name') }.compact
          # Tool messages may use symbol or string keys; accept either.
          observation_summary = Array(tool_results).map { |tr| tr[:content] || tr['content'] }.compact.join(', ')

          {
            continue: true,
            messages: messages,
            iteration: build_iteration(thought: thought, tools_used: tools_used, observation_summary: observation_summary)
          }
        end

        # Builds an iteration metadata hash.
        #
        # @param thought [String] The agent's reasoning for this step.
        # @param tools_used [Array<String>] Names of tools invoked.
        # @param observation_summary [String] Summary of tool results.
        # @return [Hash] Iteration metadata with exactly those three keys.
        def self.build_iteration(thought:, tools_used:, observation_summary:)
          {
            thought: thought,
            tools_used: tools_used,
            observation_summary: observation_summary
          }
        end

        # Builds the outcome for a step that ends the loop (no tools were run).
        #
        # @param result [Hash] The final result hash to hand back to the loop.
        # @param thought [String] The agent's reasoning, if any.
        # @param observation_summary [String] Error text or empty string.
        # @return [Hash] Outcome with :continue false, :result and :iteration.
        def self.halt(result, thought: '', observation_summary: '')
          {
            continue: false,
            result: result,
            iteration: build_iteration(thought: thought, tools_used: [], observation_summary: observation_summary)
          }
        end
        private_class_method :halt
      end
    end
  end
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../../tools'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  module Agent
    class ReactAgent
      # Runs the tool calls requested by the LLM and turns each outcome into a
      # 'tool' message that can be appended to the conversation history.
      class ToolExecutor
        class << self
          # Executes the provided tool calls in order.
          #
          # @param tool_calls [Array<Hash>] The tool calls requested by the LLM.
          # @param working_dir [String] The directory where tools should operate.
          # @param container_id [String, nil] The Docker container ID for isolated execution.
          # @return [Array<Hash>] One 'tool' message per tool call; failures are
          #   reported as messages whose content starts with "Error: ".
          def call(tool_calls, working_dir, container_id = nil)
            tool_calls.map do |tool_call|
              name = tool_call.dig('function', 'name')

              if name
                warn "=== Calling Tool: #{name} ===" unless defined?(Minitest)

                outcome = execute_tool(tool_call, working_dir, container_id)
                if outcome.is_a?(Hash) && outcome[:role] == 'tool'
                  outcome
                else
                  # execute_tool returned an error hash; surface its message to the LLM.
                  reason = outcome.dig(:response, :error, :message) || 'Unknown tool error'
                  tool_error_message(tool_call, reason)
                end
              else
                tool_error_message(tool_call, 'Missing function name')
              end
            end
          end

          # Executes a single tool call via Tools.execute.
          #
          # @param tool_call [Hash] The tool call hash.
          # @param working_dir [String] The directory where tools should operate.
          # @param container_id [String, nil] The Docker container ID.
          # @return [Hash] A 'tool' message on success, or the error hash from
          #   tool_error_result when a StandardError is raised (the error is logged).
          def execute_tool(tool_call, working_dir, container_id)
            function_name = tool_call.dig('function', 'name')
            output = Tools.execute(
              function_name,
              tool_call.dig('function', 'arguments'),
              working_dir,
              container_id
            )

            { role: 'tool', tool_call_id: tool_call['id'], content: output }
          rescue StandardError => e
            SkillBench::ErrorLogger.log_error(e, "Tool execution failed: #{function_name}")
            tool_error_result(tool_call, e.message)
          end

          # Builds a 'tool' message that reports an error back to the conversation.
          #
          # @param tool_call [Hash] The tool call hash.
          # @param message [String] The error message.
          # @return [Hash] Tool message whose content is "Error: <message>".
          def tool_error_message(tool_call, message)
            { role: 'tool', tool_call_id: tool_call['id'], content: "Error: #{message}" }
          end

          # Builds the failure hash describing a tool call that raised.
          #
          # @param tool_call [Hash] The tool call hash.
          # @param message [String] The error message.
          # @return [Hash] Failure hash with the message and the offending tool call.
          def tool_error_result(tool_call, message)
            failure = { message: "Tool call failed: #{message}", tool_call: tool_call }
            { success: false, response: { error: failure } }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'react_agent/step'
|
|
4
|
+
require_relative 'react_agent/loop_runner'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  module Agent
    # Agent implementing the ReAct (Reasoning and Acting) pattern. The actual
    # think / call tools / observe cycle is delegated to LoopRunner; this class
    # only collects the parameters and applies defaults.
    class ReactAgent
      # Error message returned when the ReAct loop reaches max iterations.
      MAX_ITERATIONS_REACHED = 'Reached max iterations without finishing.'

      class << self
        # Convenience entry point: builds an agent and runs it.
        #
        # @param params [Hash] The configuration for the agent.
        # @option params [String] :system_prompt The instructions establishing the agent's persona and rules.
        # @option params [String] :initial_prompt The user task the agent must complete.
        # @option params [Integer] :max_iterations (25) The maximum allowed steps before aborting.
        # @option params [String] :working_dir (Dir.pwd) The directory where tools should operate.
        # @option params [String, nil] :container_id Forwarded to each step as :container_id.
        # @option params [Hash] :client_params ({}) Configuration passed to the Client (e.g., model).
        # @return [Hash] Whatever LoopRunner.call returns for this task.
        def call(params)
          new(params).call
        end
      end

      # Stores the parameters, filling in defaults for the optional ones.
      #
      # @param params [Hash] The configuration for the agent (see .call).
      def initialize(params)
        @system_prompt = params[:system_prompt]
        @initial_prompt = params[:initial_prompt]
        @container_id = params[:container_id]
        @max_iterations = params[:max_iterations] || 25
        @working_dir = params[:working_dir] || Dir.pwd
        @client_params = params[:client_params] || {}
      end

      # Executes the ReAct loop.
      #
      # @return [Hash] The result hash produced by LoopRunner.
      def call
        LoopRunner.call(@initial_prompt, @max_iterations, build_step_config)
      end

      private

      # Per-step configuration handed to LoopRunner.
      def build_step_config
        {
          system_prompt: @system_prompt,
          client_params: @client_params,
          working_dir: @working_dir,
          container_id: @container_id
        }
      end
    end
  end
end
|