ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,361 @@
1
+ # Testing Guide: Evaluations & Workflows
2
+
3
+ This guide explains how to run evaluations and how to create new evaluation tasks for skills and workflows.
4
+
5
+ ## Running Evaluations
6
+
7
+ The primary tool for running evaluations is the `skill-bench` CLI.
8
+
9
+ ### Basic Usage
10
+
11
+ To run a specific evaluation task:
12
+
13
+ ```bash
14
+ skill-bench run my-eval --skill=my-skill
15
+ ```
16
+
17
+ The provider is read from `skill-bench.json`, so no `--provider` flag is needed.
18
+
19
+ ### Output Formats
20
+
21
+ **Human-readable (default):**
22
+
23
+ ```text
24
+ ═══════════════════════════════════════════════════════
25
+ Eval: my-eval
26
+ Skill: my-skill
27
+ Provider: openai
28
+ ═══════════════════════════════════════════════════════
29
+
30
+ DIMENSION BASELINE CONTEXT DELTA
31
+ ──────────────────────── ───────── ───────── ───────
32
+ Correctness (30) 12 28 +16
33
+ Skill Adherence (25) 5 22 +17
34
+ Code Quality (20) 10 16 +6
35
+ Test Coverage (15) 3 13 +10
36
+ Documentation (10) 2 8 +6
37
+ ──────────────────────── ───────── ───────── ───────
38
+ TOTAL 32/100 87/100 +55
39
+
40
+ VERDICT: PASS (threshold: 70, minimum delta: 10)
41
+ ═══════════════════════════════════════════════════════
42
+ ```
43
+
44
+ **JSON:**
45
+
46
+ ```bash
47
+ skill-bench run my-eval --skill=my-skill --format json
48
+ ```
49
+
50
+ **JUnit XML:**
51
+
52
+ ```bash
53
+ skill-bench run my-eval --skill=my-skill --format junit
54
+ ```
55
+
56
+ ### Running an Eval by Path
57
+
58
+ To run an eval with a path containing a slash:
59
+
60
+ ```bash
61
+ skill-bench run evals/my-eval --skill=my-skill
62
+ ```
63
+
64
+ The evaluator resolves the path automatically.
65
+
66
+ ### Overriding Skill Context
67
+
68
+ By default, the evaluator infers the skill path from the evaluation path. If you need to test an evaluation against a different skill:
69
+
70
+ ```bash
71
+ skill-bench run my-eval --skill=skills/custom-skill
72
+ ```
73
+
74
+ ## Creating New Evaluations
75
+
76
+ An evaluation task consists of a directory containing at least two files: `task.md` and `criteria.json`.
77
+
78
+ ### 1. The Task (`task.md`)
79
+
80
+ This file contains the instructions for the AI agent. It should describe a specific problem to solve or a feature to implement.
81
+
82
+ **Best Practices:**
83
+
84
+ - Provide clear context and requirements.
85
+ - Include a description of the current codebase state.
86
+ - Specify the desired outcome.
87
+ - List acceptance criteria as numbered items (the judge checks these).
88
+
89
+ **Example — Good task.md:**
90
+
91
+ ```markdown
92
+ Create a `PasswordValidator` class that:
93
+
94
+ 1. Accepts a `password` string
95
+ 2. Validates minimum length of 8 characters
96
+ 3. Validates presence of at least one uppercase letter
97
+ 4. Validates presence of at least one digit
98
+ 5. Returns `{ valid: true }` or `{ valid: false, errors: [...] }`
99
+ 6. Includes RSpec tests with 100% branch coverage
100
+ 7. Uses `# frozen_string_literal: true`
101
+ 8. Has YARD docs for the class and all public methods
102
+ ```
103
+
104
+ **Why this works:** Each numbered item is a discrete acceptance criterion the judge can verify independently. Vague tasks like "create a password validator" produce inconsistent scores because the judge has to guess what "good" means.
105
+
106
+ ### 2. The Criteria (`criteria.json`)
107
+
108
+ This file defines the evaluation dimensions, weights, and thresholds:
109
+
110
+ ```json
111
+ {
112
+ "context": "Evaluate whether the skill helps build a proper API REST collection",
113
+ "dimensions": [
114
+ { "name": "correctness", "max_score": 30 },
115
+ { "name": "skill_adherence", "max_score": 25 },
116
+ { "name": "code_quality", "max_score": 20 },
117
+ { "name": "test_coverage", "max_score": 15 },
118
+ { "name": "documentation", "max_score": 10 }
119
+ ],
120
+ "pass_threshold": 70,
121
+ "minimum_delta": 10
122
+ }
123
+ ```
124
+
125
+ **Fields:**
126
+
127
+ | Field | Type | Required | Description |
128
+ |-------|------|----------|-------------|
129
+ | `context` | string | Yes | Shown to the judge. Describes what the eval measures. |
130
+ | `dimensions` | array | Yes | Array of `{ name, max_score }` objects. Must include all 5 core dimensions. `max_score` values must sum to exactly 100. |
131
+ | `pass_threshold` | integer | No | Minimum context score to pass. Default: 70. |
132
+ | `minimum_delta` | integer | No | Minimum improvement over baseline to pass. Default: 10. |
133
+ | `description` (per dimension) | string | No | Overrides the built-in default description for that dimension. |
134
+
135
+ **Custom dimension descriptions** are especially useful when a skill has specific hard rules. For example, if your skill requires the `.call` pattern, you can tell the judge exactly what to look for:
136
+
137
+ ```json
138
+ {
139
+ "name": "skill_adherence",
140
+ "max_score": 25,
141
+ "description": "Did the agent create a class with a `.call` class method that returns `{ success: bool, response: { ... } }`?"
142
+ }
143
+ ```
144
+
145
+ This produces more consistent scores than the generic default description.
146
+
147
+ ### 3. What the Judge Sees
148
+
149
+ Understanding the judge prompt helps you write better tasks and criteria. The judge receives a structured prompt with up to five sections (the Skill Context section is omitted for the baseline call):
150
+
151
+ ```text
152
+ ## Task
153
+ [Contents of task.md]
154
+
155
+ ## Criteria
156
+ Context: [Contents of criteria.json context]
157
+ Dimensions:
158
+ - correctness: max_score=30, description=...
159
+ - skill_adherence: max_score=25, description=...
160
+ ...
161
+
162
+ ## Skill Context
163
+ [Contents of SKILL.md wrapped in XML]
164
+
165
+ ## Agent Output
166
+ [Git diff + file listing + reasoning excerpt]
167
+
168
+ ## Instructions
169
+ Score each dimension independently. Return JSON with:
170
+ - "dimensions": object mapping each dimension name to { "score": number, "max_score": number, "reasoning": string }
171
+ - "overall_reasoning": string summarizing the evaluation
172
+ ```
173
+
174
+ **Important:** The judge is called **twice** per eval — once for baseline output (no skill context section) and once for context output (with skill context). The judge never sees both outputs in the same call. This prevents the judge from being biased by direct comparison.
175
+
176
+ ---
177
+
178
+ ## Evaluating Workflows vs. Skills
179
+
180
+ ### Atomic Skills
181
+
182
+ Skills are isolated blocks of logic (e.g., a specific API pattern). Evaluations for skills should focus strictly on the adherence to the patterns defined in the skill's `SKILL.md`.
183
+
184
+ **Recommended weights for atomic skills:**
185
+
186
+ ```json
187
+ {
188
+ "dimensions": [
189
+ { "name": "correctness", "max_score": 30 },
190
+ { "name": "skill_adherence", "max_score": 30 },
191
+ { "name": "code_quality", "max_score": 20 },
192
+ { "name": "test_coverage", "max_score": 10 },
193
+ { "name": "documentation", "max_score": 10 }
194
+ ],
195
+ "pass_threshold": 70,
196
+ "minimum_delta": 10
197
+ }
198
+ ```
199
+
200
+ Skill Adherence is raised to 30 — tied with Correctness for the highest weight — because the core question is "did the skill help?"
201
+
202
+ ### Workflows
203
+
204
+ Workflows are sequences of skills or complex orchestrations (e.g., the full TDD loop). Evaluations for workflows should focus on the process, the ordering of tasks, and the successful completion of a multi-step objective.
205
+
206
+ **Recommended weights for workflows:**
207
+
208
+ ```json
209
+ {
210
+ "dimensions": [
211
+ { "name": "correctness", "max_score": 35 },
212
+ { "name": "skill_adherence", "max_score": 20 },
213
+ { "name": "code_quality", "max_score": 20 },
214
+ { "name": "test_coverage", "max_score": 15 },
215
+ { "name": "documentation", "max_score": 10 }
216
+ ],
217
+ "pass_threshold": 65,
218
+ "minimum_delta": 15
219
+ }
220
+ ```
221
+
222
+ Correctness is weighted higher because workflows are judged on end-to-end success. The `minimum_delta` is also higher (15 vs 10) because workflows are expected to show stronger skill impact.
223
+
224
+ ---
225
+
226
+ ## Interpreting the Output
227
+
228
+ ### Human-Readable Format
229
+
230
+ ```text
231
+ ═══════════════════════════════════════════════════════
232
+ Eval: my-eval
233
+ Skill: my-skill
234
+ Provider: openai
235
+ ═══════════════════════════════════════════════════════
236
+
237
+ DIMENSION BASELINE CONTEXT DELTA
238
+ ──────────────────────── ───────── ───────── ───────
239
+ Correctness (30) 12 28 +16
240
+ Skill Adherence (25) 5 22 +17
241
+ Code Quality (20) 10 16 +6
242
+ Test Coverage (15) 3 13 +10
243
+ Documentation (10) 2 8 +6
244
+ ──────────────────────── ───────── ───────── ───────
245
+ TOTAL 32/100 87/100 +55
246
+
247
+ TREND: baseline ↑ (+2), context ↑ (+7)
248
+ VERDICT: PASS (threshold: 70, minimum delta: 10)
249
+ ═══════════════════════════════════════════════════════
250
+ ```
251
+
252
+ **Reading the table:**
253
+
254
+ - **BASELINE:** What the agent produced *without* the skill. Think of this as "raw" ability.
255
+ - **CONTEXT:** What the agent produced *with* the skill. Think of this as "aided" ability.
256
+ - **DELTA:** The improvement. `+16` means the skill added 16 points to that dimension.
257
+ - **TOTAL:** Sum of all dimension scores. The `/100` reminds you of the maximum.
258
+
259
+ **Verdict logic:**
260
+
261
+ ```ruby
262
+ pass = context_total >= pass_threshold && total_delta >= minimum_delta
263
+ ```
264
+
265
+ Both must be true. This prevents two failure modes:
266
+
267
+ 1. **High absolute, no improvement:** baseline=80, context=80, delta=0 → FAIL (skill didn't help)
268
+ 2. **Low absolute, small improvement:** baseline=10, context=20, delta=10 → FAIL (the delta meets the minimum, but the context score is still far below the pass threshold)
269
+
270
+ **TREND line:**
271
+
272
+ ```text
273
+ TREND: baseline ↑ (+2), context ↑ (+7)
274
+ ```
275
+
276
+ This compares the current run against the **previous run of the same eval + skill** (stored in `.skill-bench-history.json`).
277
+
278
+ - `↑` = improved since last run
279
+ - `↓` = regressed since last run
280
+ - `→` = unchanged
281
+
282
+ The numbers in parentheses are the point differences. This helps you track whether your skill is getting better over time.
283
+
284
+ ### JSON Format
285
+
286
+ ```bash
287
+ skill-bench run my-eval --skill=my-skill --format json
288
+ ```
289
+
290
+ Returns a structured hash with:
291
+
292
+ - `eval_name`, `skill_name`, `provider_name`
293
+ - `report` containing: `verdict`, `baseline_total`, `context_total`, `deltas`, `baseline_scores`, `context_scores`, `criteria`
294
+ - `trend` (if history exists): `baseline_trend`, `context_trend`, `baseline_delta`, `context_delta`, `previous_run`
295
+
296
+ Useful for CI/CD pipelines and automated reporting.
297
+
298
+ ### JUnit XML Format
299
+
300
+ ```bash
301
+ skill-bench run my-eval --skill=my-skill --format junit
302
+ ```
303
+
304
+ Returns standard JUnit XML. Useful for GitHub Actions, Jenkins, and other CI systems that parse JUnit reports.
305
+
306
+ ---
307
+
308
+ ## Running the Test Suite
309
+
310
+ The project uses Minitest with 440+ tests covering:
311
+
312
+ - Core evaluation engine (`test/evaluator/`)
313
+ - CLI commands and models (`test/agent_eval/`)
314
+ - Provider clients (`test/clients/`)
315
+ - Skill services (`test/skills/`)
316
+
317
+ ```bash
318
+ # Run all tests
319
+ bundle exec rake test
320
+
321
+ # Run with coverage report
322
+ bundle exec rake test COVERAGE=true
323
+
324
+ # Run specific test file
325
+ bundle exec ruby -Itest test/integration_test.rb
326
+
327
+ # Run lint checks
328
+ bundle exec rake rubocop
329
+ bundle exec rake reek
330
+ ```
331
+
332
+ ### Test Isolation
333
+
334
+ Tests use temporary directories and restore the original working directory:
335
+
336
+ ```ruby
337
+ def setup
338
+ @original_dir = Dir.pwd
339
+ @tmp_dir = Dir.mktmpdir('test')
340
+ Dir.chdir(@tmp_dir)
341
+ end
342
+
343
+ def teardown
344
+ Dir.chdir(@original_dir)
345
+ FileUtils.rm_rf(@tmp_dir)
346
+ end
347
+ ```
348
+
349
+ ### Environment Variable Handling
350
+
351
+ Tests that modify ENV must restore original values:
352
+
353
+ ```ruby
354
+ def test_something
355
+ original_key = ENV.fetch('SKILL_BENCH_OPENAI_API_KEY', nil)
356
+ ENV.delete('SKILL_BENCH_OPENAI_API_KEY')
357
+ # ... test code ...
358
+ ensure
359
+ ENV['SKILL_BENCH_OPENAI_API_KEY'] = original_key if original_key
360
+ end
361
+ ```
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'step'
4
+
5
module SkillBench
  module Agent
    class ReactAgent
      # Drives the ReAct loop: invokes Step repeatedly until a step reports that
      # it is finished, an error is raised, or the iteration budget runs out.
      class LoopRunner
        # Error message used when a Step stops the loop without a :result.
        MISSING_RESULT_MESSAGE = 'Step returned no result'

        # Executes the loop.
        #
        # @param initial_prompt [String] The user task the agent must complete.
        # @param max_iterations [Integer] The maximum allowed steps before aborting.
        # @param config [Hash] The configuration handed unchanged to every Step.call.
        # @return [Hash] A result hash indicating success or failure; its :response
        #   always carries an :iterations array with the collected step metadata.
        def self.call(initial_prompt, max_iterations, config)
          messages = [{ role: 'user', content: initial_prompt }]
          iterations_log = []
          step_count = 0

          while step_count < max_iterations
            step_count += 1

            step_result = Step.call(messages, config)
            iteration = step_result[:iteration]
            iterations_log << attach_step_number(iteration, step_count) if iteration

            unless step_result[:continue]
              # A step may stop the loop without supplying a result; report that as a failure.
              final_result = step_result[:result] || failure_result(MISSING_RESULT_MESSAGE)
              return merge_iterations(final_result, iterations_log)
            end

            # The step returns the extended conversation to feed into the next step.
            messages = step_result[:messages]
          end

          merge_iterations(failure_result(Agent::ReactAgent::MAX_ITERATIONS_REACHED), iterations_log)
        rescue StandardError => e
          # Any error is logged and converted into a failure result that still
          # reports the iterations collected so far.
          SkillBench::ErrorLogger.log_error(e, 'ReactAgent Error')
          merge_iterations(failure_result(e.message), iterations_log)
        end

        # Attaches the step number to an iteration hash.
        #
        # @param iteration [Hash] The iteration metadata from a Step.
        # @param step_count [Integer] The current step number.
        # @return [Hash] A new hash: the iteration with :step_number added.
        def self.attach_step_number(iteration, step_count)
          iteration.merge(step_number: step_count)
        end

        # Merges the collected iterations into the result response.
        #
        # @param result [Hash] The final result hash from the loop.
        # @param iterations_log [Array<Hash>] Collected iteration metadata.
        # @return [Hash] A new hash: the result with :iterations injected into
        #   :response (an empty response is used when the result has none).
        def self.merge_iterations(result, iterations_log)
          response = result[:response] || {}
          result.merge(response: response.merge(iterations: iterations_log))
        end

        # Builds the failure result shape used by every error path of the loop.
        #
        # @param message [String] The error message to report.
        # @return [Hash] A result hash with success: false and the error message.
        def self.failure_result(message)
          { success: false, response: { error: { message: message } } }
        end
        private_class_method :failure_result
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../client'
4
+ require_relative 'tool_executor'
5
+
6
module SkillBench
  module Agent
    class ReactAgent
      # Service object responsible for executing a single step of the ReAct loop:
      # one LLM call, followed by execution of any tool calls the LLM requested.
      class Step
        # Observation recorded when a failed client result carries no error message.
        UNKNOWN_ERROR_MESSAGE = 'Unknown error'
        # Error reported when a successful client result contains no message.
        EMPTY_RESPONSE_MESSAGE = 'Empty response from LLM'

        # Executes one iteration of reasoning and potential tool usage.
        #
        # The incoming history is duplicated, so the caller's array is never mutated.
        #
        # @param messages [Array<Hash>] The conversation history.
        # @param config [Hash] Configuration for this step. Reads :system_prompt,
        #   :client_params (optional, defaults to no extra params), :working_dir
        #   and :container_id.
        # @return [Hash] Step outcome containing :continue (boolean) and :iteration
        #   (metadata hash), plus :result when the step finished the loop or
        #   :messages (the extended history) when the loop should continue.
        def self.call(messages, config)
          messages = messages.dup
          client_result = Client.call(
            system_prompt: config[:system_prompt],
            messages: messages,
            tools: Tools.definitions,
            # `**nil` raises TypeError before Ruby 3.4, so fall back to an empty hash.
            **(config[:client_params] || {})
          )

          unless client_result[:success]
            error_msg = client_result.dig(:response, :error, :message) || UNKNOWN_ERROR_MESSAGE
            return finished(client_result, observation_summary: error_msg)
          end

          response_msg = client_result.dig(:response, :message)
          unless response_msg
            empty_result = { success: false, response: { error: { message: EMPTY_RESPONSE_MESSAGE } } }
            return finished(empty_result, observation_summary: EMPTY_RESPONSE_MESSAGE)
          end

          messages << response_msg

          content = response_msg['content']
          thought = content.to_s
          tool_calls = Array(response_msg['tool_calls'])

          # No tool calls means the LLM produced its final answer.
          return finished({ success: true, response: { content: content } }, thought: thought) if tool_calls.empty?

          unless thought.strip.empty?
            warn "\n=== Agent Thought ==="
            warn content
          end

          tool_results = ToolExecutor.call(tool_calls, config[:working_dir], config[:container_id])
          messages.concat(tool_results)

          tools_used = tool_calls.map { |tc| tc.dig('function', 'name') }.compact
          # Tool results may use symbol or string keys for their content.
          observation_summary = Array(tool_results).map { |tr| tr[:content] || tr['content'] }.compact.join(', ')

          {
            continue: true,
            messages: messages,
            iteration: build_iteration(thought: thought, tools_used: tools_used, observation_summary: observation_summary)
          }
        end

        # Builds an iteration metadata hash.
        #
        # @param thought [String] The agent's reasoning for this step.
        # @param tools_used [Array<String>] Names of tools invoked.
        # @param observation_summary [String] Summary of tool results.
        # @return [Hash] Iteration metadata.
        def self.build_iteration(thought:, tools_used:, observation_summary:)
          {
            thought: thought,
            tools_used: tools_used,
            observation_summary: observation_summary
          }
        end

        # Builds the outcome for a step that ends the loop without using tools.
        #
        # @param result [Hash] The final result hash to hand back to the loop.
        # @param thought [String] The agent's reasoning for this step.
        # @param observation_summary [String] Summary recorded for this step.
        # @return [Hash] Step outcome with continue: false.
        def self.finished(result, thought: '', observation_summary: '')
          {
            continue: false,
            result: result,
            iteration: build_iteration(thought: thought, tools_used: [], observation_summary: observation_summary)
          }
        end
        private_class_method :finished
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../tools'
4
+
5
module SkillBench
  module Agent
    class ReactAgent
      # Service object responsible for executing a list of tool calls and returning the results
      # formatted as messages to be appended to the conversation history.
      class ToolExecutor
        # Content used when a failed tool result carries no error message.
        UNKNOWN_TOOL_ERROR = 'Unknown tool error'

        # Executes the provided tool calls.
        #
        # Every tool call yields exactly one 'tool' message; failures are reported
        # as "Error: ..." content instead of being raised.
        #
        # @param tool_calls [Array<Hash>, nil] The tool calls requested by the LLM
        #   (nil is treated as no tool calls).
        # @param working_dir [String] The directory where tools should operate.
        # @param container_id [String, nil] The Docker container ID for isolated execution.
        # @return [Array<Hash>] An array of message hashes containing tool results.
        def self.call(tool_calls, working_dir, container_id = nil)
          Array(tool_calls).map do |tool_call|
            function_name = tool_call.dig('function', 'name')
            next tool_error_message(tool_call, 'Missing function name') unless function_name

            # Progress output is skipped when Minitest is loaded.
            warn "=== Calling Tool: #{function_name} ===" unless defined?(Minitest)

            result = execute_tool(tool_call, working_dir, container_id)
            if result.is_a?(Hash) && result[:role] == 'tool'
              result
            else
              # execute_tool returned an error result; convert it into a tool message.
              error_msg = result.dig(:response, :error, :message) || UNKNOWN_TOOL_ERROR
              tool_error_message(tool_call, error_msg)
            end
          end
        end

        # Executes a single tool call and returns the result message.
        #
        # @param tool_call [Hash] The tool call hash.
        # @param working_dir [String] The directory where tools should operate.
        # @param container_id [String, nil] The Docker container ID.
        # @return [Hash] A 'tool' message on success, or the error result built by
        #   tool_error_result when the tool raises a StandardError.
        def self.execute_tool(tool_call, working_dir, container_id)
          function_name = tool_call.dig('function', 'name')
          arguments = tool_call.dig('function', 'arguments')

          result = Tools.execute(function_name, arguments, working_dir, container_id)

          {
            role: 'tool',
            tool_call_id: tool_call['id'],
            content: result
          }
        rescue StandardError => e
          SkillBench::ErrorLogger.log_error(e, "Tool execution failed: #{function_name}")
          tool_error_result(tool_call, e.message)
        end

        # Builds a tool error message for the conversation history.
        #
        # @param tool_call [Hash] The tool call hash.
        # @param message [String] The error message.
        # @return [Hash] Tool message with error content.
        def self.tool_error_message(tool_call, message)
          {
            role: 'tool',
            tool_call_id: tool_call['id'],
            content: "Error: #{message}"
          }
        end

        # Builds an error result for a failed tool call.
        #
        # Unlike tool_error_message, this is not a conversation message; it is a
        # failure result hash that also carries the offending tool call.
        #
        # @param tool_call [Hash] The tool call hash.
        # @param message [String] The error message.
        # @return [Hash] Error result hash.
        def self.tool_error_result(tool_call, message)
          {
            success: false,
            response: {
              error: {
                message: "Tool call failed: #{message}",
                tool_call: tool_call
              }
            }
          }
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'react_agent/step'
4
+ require_relative 'react_agent/loop_runner'
5
+
6
module SkillBench
  module Agent
    # An agent that follows the ReAct (Reasoning and Acting) loop pattern.
    # It executes a given task by repeatedly thinking, invoking tools, and observing the results
    # until it finishes the task or reaches the maximum number of iterations.
    class ReactAgent
      # Error message returned when the ReAct loop reaches max iterations.
      MAX_ITERATIONS_REACHED = 'Reached max iterations without finishing.'

      # Number of steps allowed when the caller does not pass :max_iterations.
      DEFAULT_MAX_ITERATIONS = 25

      # Starts the ReAct loop for a specific task.
      #
      # @param params [Hash] The configuration for the agent.
      # @option params [String] :system_prompt The instructions establishing the agent's persona and rules.
      # @option params [String] :initial_prompt The user task the agent must complete.
      # @option params [Integer] :max_iterations (25) The maximum allowed steps before aborting.
      # @option params [String] :working_dir (Dir.pwd) The directory where tools should operate.
      # @option params [String, nil] :container_id (nil) Container ID forwarded to each step's tool execution.
      # @option params [Hash] :client_params ({}) Configuration passed to the Client (e.g., model).
      # @return [Hash] A result hash with :success, and :response payload containing the final answer.
      def self.call(params)
        new(params).call
      end

      # Stores the agent configuration, applying defaults for omitted options.
      #
      # @param params [Hash] The configuration for the agent (see {.call} for the options).
      def initialize(params)
        @system_prompt = params[:system_prompt]
        @initial_prompt = params[:initial_prompt]
        @max_iterations = params[:max_iterations] || DEFAULT_MAX_ITERATIONS
        @working_dir = params[:working_dir] || Dir.pwd
        @container_id = params[:container_id]
        @client_params = params[:client_params] || {}
      end

      # Executes the ReAct loop.
      #
      # @return [Hash] The standardized result hash indicating success or failure.
      def call
        LoopRunner.call(@initial_prompt, @max_iterations, build_step_config)
      end

      private

      # Collects the settings every step of the loop needs.
      #
      # @return [Hash] Step configuration with :system_prompt, :client_params,
      #   :working_dir and :container_id.
      def build_step_config
        {
          system_prompt: @system_prompt,
          client_params: @client_params,
          working_dir: @working_dir,
          container_id: @container_id
        }
      end
    end
  end
end