ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
data/bin/skill-bench ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
5
+ require 'skill_bench'
6
+
7
+ begin
8
+ exit SkillBench::CLI.call(ARGV)
9
+ rescue Interrupt
10
+ warn "\nInterrupted."
11
+ exit 130
12
+ rescue StandardError => e
13
+ warn "Error: #{e.message}"
14
+ exit 1
15
+ end
data/docs/architecture.md ADDED
@@ -0,0 +1,200 @@
1
+ # SkillBench Architecture
2
+
3
+ Ruby Skill Bench provides a reproducible and isolated environment for testing AI agents. It consists of several decoupled components that orchestrate the evaluation flow.
4
+
5
+ ## High-Level Flow
6
+
7
+ 1. **`RunnerService`**: The entry point. Resolves eval, skill, and provider, then runs baseline and context agents.
8
+ 2. **`Sandbox`**: Creates a temporary directory, copies task files, and initializes a Git repository for clean, reproducible runs.
9
+ 3. **`ContextHydrator`**: Loads skill documentation (.md, .rb, .json, .yml, .yaml, .txt up to 50KB each) and wraps it in XML for the agent's system prompt.
10
+ 4. **`ReactAgent`**: Autonomous agent following a **Thought → Tool → Observation** loop.
11
+ 5. **`EvaluationRunner`**: Orchestrates blind judging — builds `JudgePrompt` for baseline and context outputs, calls `Judge` twice, then computes deltas via `DeltaReport`.
12
+ 6. **`DeltaReport`**: Computes per-dimension deltas and determines verdict based on `pass_threshold` and `minimum_delta`.
13
+ 7. **`Client`**: Provider-agnostic abstraction for LLM backends.
14
+
15
+ ## Key Components
16
+
17
+ ### `SkillBench::Services::RunnerService`
18
+
19
+ - Resolves eval, skill, and provider configuration.
20
+ - Runs baseline agent (no skill context) and context agent (with skill context).
21
+ - Delegates judging and delta computation to `EvaluationRunner`.
22
+ - Falls back to mock provider when config is unavailable.
23
+
24
+ ### `SkillBench::EvaluationRunner`
25
+
26
+ - Builds `JudgePrompt` for baseline and context outputs.
27
+ - Calls `Judge` twice (blind scoring).
28
+ - Uses `DeltaReport` to compute per-dimension deltas and final verdict.
29
+
30
+ ### `SkillBench::DeltaReport`
31
+
32
+ - Computes baseline vs context deltas per dimension.
33
+ - Verdict requires: `context_total >= pass_threshold` AND `total_delta >= minimum_delta`.
34
+
35
+ ### `SkillBench::CLI` Commands
36
+
37
+ - `InitCommand` — Creates `skill-bench.json` configuration
38
+ - `RunCommand` — Executes evaluations
39
+ - `SkillCommand` — Scaffolds new skills with templates
40
+ - `EvalCommand` — Creates evaluation scenarios
41
+
42
+ ### `SkillBench::Services::TemplateRegistry`
43
+
44
+ - Provides pre-built templates for generating eval scaffolding
45
+ - Supports three template types: `task_md`, `criteria_json`, `skill_md`
46
+ - Offers 10 Rails pattern categories: `crud`, `api`, `background_job`, `controller`, `model`, `migration`, `concern`, `policy`, `form_object`, `view_component`
47
+ - Enables variable interpolation using `{{variable_name}}` syntax
48
+ - Used for programmatic eval creation and tool building
49
+
50
+ ### `SkillBench::Sandbox`
51
+
52
+ - Uses `Dir.mktmpdir` for isolation.
53
+ - Captures state changes using `git diff`.
54
+ - Validates sandbox path to prevent directory traversal.
55
+ - Cleans up automatically after execution.
56
+
57
+ ### `SkillBench::ReactAgent`
58
+
59
+ - Implements a stateful loop.
60
+ - Supports tool usage (e.g., `read_file`, `write_file`, `run_shell_command`).
61
+ - Manages conversation history.
62
+
63
+ ### `SkillBench::Clients::BaseClient`
64
+
65
+ - Implements the **Template Method** pattern.
66
+ - Handles Faraday connection setup and timeouts.
67
+ - Centralizes error logging and response normalization.
68
+ - Delegates to `ResponseParser`, `ResponseErrorHandler`, and `RequestBuilder`.
69
+
70
+ ### `SkillBench::OutputFormatter`
71
+
72
+ - Formats results as human-readable text, JSON, or JUnit XML
73
+ - Human format displays a dimension table with baseline, context, and delta columns
74
+ - Escapes XML output to prevent injection
75
+ - Provides exit codes for CI/CD integration
76
+
77
+ ### `SkillBench::ErrorLogger`
78
+
79
+ - Shared error logging module for all service objects
80
+ - Logs error message and full backtrace
81
+ - Uses `Rails.logger` when available, falls back to `warn`
82
+
83
+ ## Data Flow: What Passes Between Components
84
+
85
+ Understanding what data moves between components helps debug issues and write better evals.
86
+
87
+ ### Flow 1: RunnerService → EvaluationRunner
88
+
89
+ ```ruby
90
+ # RunnerService builds this and passes it to EvaluationRunner.call
91
+ evaluation = {
92
+ task: "Create a UserRegistrationService...", # from task.md
93
+ criteria: <Criteria object>, # from criteria.json
94
+ skill_context: "<agent_context>...SKILL.md...</agent_context>", # from ContextHydrator
95
+ baseline_output: '{"result":"...","status":":success"}', # from baseline agent run
96
+ context_output: '{"result":"...","status":":success"}' # from context agent run
97
+ }
98
+ ```
99
+
100
+ ### Flow 2: EvaluationRunner → Judge (two calls)
101
+
102
+ ```ruby
103
+ # First call — baseline (no skill context)
104
+ JudgePrompt.call(
105
+ task: task,
106
+ criteria: criteria,
107
+ skill_context: "", # empty string for baseline
108
+ agent_output: baseline_output
109
+ )
110
+
111
+ # Second call — context (with skill context)
112
+ JudgePrompt.call(
113
+ task: task,
114
+ criteria: criteria,
115
+ skill_context: skill_context, # XML-wrapped SKILL.md
116
+ agent_output: context_output
117
+ )
118
+ ```
119
+
120
+ ### Flow 3: Judge → JudgeResponse
121
+
122
+ The judge returns a JSON string like:
123
+
124
+ ```json
125
+ {
126
+ "dimensions": {
127
+ "correctness": { "score": 28, "max_score": 30, "reasoning": "All requirements met." },
128
+ "skill_adherence": { "score": 22, "max_score": 25, "reasoning": "Used .call pattern correctly." }
129
+ },
130
+ "overall_reasoning": "Solid implementation."
131
+ }
132
+ ```
133
+
134
+ `JudgeResponse` parses this, validates that scores are numeric and within bounds, and returns a structured object.
135
+
136
+ ### Flow 4: DeltaReport → Output
137
+
138
+ ```ruby
139
+ # DeltaReport receives two JudgeResponse objects
140
+ baseline = {
141
+ 'correctness' => { score: 12, max_score: 30 },
142
+ 'skill_adherence' => { score: 5, max_score: 25 }
143
+ }
144
+
145
+ context = {
146
+ 'correctness' => { score: 28, max_score: 30 },
147
+ 'skill_adherence' => { score: 22, max_score: 25 }
148
+ }
149
+
150
+ # Produces:
151
+ deltas = {
152
+ 'correctness' => 16, # 28 - 12
153
+ 'skill_adherence' => 17 # 22 - 5
154
+ }
155
+
156
+ baseline_total = 17 # 12 + 5
157
+ context_total = 50 # 28 + 22
158
+ verdict = context_total >= pass_threshold && (context_total - baseline_total) >= minimum_delta
159
+ ```
160
+
161
+ ## Directory Structure
162
+
163
+ The evaluator relies on a strict directory convention:
164
+
165
+ ```bash
166
+ project-root/
167
+ ├── skill-bench.json # Provider configuration
168
+ ├── skills/
169
+ │ └── my-service/
170
+ │ └── SKILL.md # Skill instructions
171
+ ├── evals/
172
+ │ └── my-first-eval/
173
+ │ ├── task.md # Agent prompt
174
+ │ └── criteria.json # Scoring rules
175
+ └── .skill-bench-history.json # Benchmark history (auto-generated)
176
+ ```
177
+
178
+ ### Skill Discovery
179
+
180
+ Skills are discovered recursively. These are all valid:
181
+
182
+ ```bash
183
+ skills/my-service/SKILL.md
184
+ skills/api/rest-collection/SKILL.md
185
+ skills/workflows/tdd-loop/SKILL.md
186
+ ```
187
+
188
+ The `SkillResolver` walks `skills/` recursively and matches by directory name.
189
+
190
+ ### Eval Discovery
191
+
192
+ Evals are resolved in this order:
193
+
194
+ 1. If the path contains `/`, use it as-is (e.g., `evals/my-eval`)
195
+ 2. Otherwise, prepend `evals/` (e.g., `my-eval` → `evals/my-eval`)
196
+
197
+ The eval directory is expected to contain the following files (only `task.md` is strictly required):
198
+
199
+ - `task.md` — the agent prompt
200
+ - `criteria.json` — the scoring rules (optional; defaults to empty criteria if missing)