rubric_llm 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +23 -0
- data/LICENSE.txt +21 -0
- data/README.md +352 -0
- data/lib/rubric_llm/comparison.rb +149 -0
- data/lib/rubric_llm/config.rb +93 -0
- data/lib/rubric_llm/errors.rb +9 -0
- data/lib/rubric_llm/evaluator.rb +51 -0
- data/lib/rubric_llm/judge.rb +63 -0
- data/lib/rubric_llm/metrics/base.rb +29 -0
- data/lib/rubric_llm/metrics/context_precision.rb +49 -0
- data/lib/rubric_llm/metrics/context_recall.rb +50 -0
- data/lib/rubric_llm/metrics/correctness.rb +46 -0
- data/lib/rubric_llm/metrics/factual_accuracy.rb +48 -0
- data/lib/rubric_llm/metrics/faithfulness.rb +51 -0
- data/lib/rubric_llm/metrics/relevance.rb +42 -0
- data/lib/rubric_llm/minitest.rb +59 -0
- data/lib/rubric_llm/report.rb +102 -0
- data/lib/rubric_llm/result.rb +43 -0
- data/lib/rubric_llm/retrieval_result.rb +68 -0
- data/lib/rubric_llm/rspec.rb +134 -0
- data/lib/rubric_llm/version.rb +5 -0
- data/lib/rubric_llm.rb +126 -0
- metadata +85 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: a549a52da8585cfbdf8287315548389de6e93b1b0adca9b8fb32fedde3d966e5
|
|
4
|
+
data.tar.gz: 3de825ff22b9c4b3dc091cc17ef089e504f53238857d30393b1951d55884c38d
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 004aff76b7b92d2266d75ec54b897d240cef7a86f424cbca8d1904d7e16eeb1821900c722e6df4f20be511936f90993e73b4e3aebd436e9d82f24ee558f12996
|
|
7
|
+
data.tar.gz: eacc3c28cb3d3504323dbbb491c3e9b8af6625520308cb319cab2d5164868ac56a52c3daed5a603e496c092e53a2c1a7b1f6f32754b9dc1c230ec0aebca30be9
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-03-24
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- LLM-as-Judge evaluation via RubyLLM (provider-agnostic)
|
|
15
|
+
- Built-in metrics: Faithfulness, Relevance, Correctness, ContextPrecision, ContextRecall, FactualAccuracy
|
|
16
|
+
- Pluggable metric interface (`RubricLLM::Metrics::Base`)
|
|
17
|
+
- Single-sample evaluation (`RubricLLM.evaluate`)
|
|
18
|
+
- Batch evaluation with reports (`RubricLLM.evaluate_batch`)
|
|
19
|
+
- A/B model comparison with paired t-tests (`RubricLLM.compare`)
|
|
20
|
+
- Retrieval metrics without LLM calls (`RubricLLM.evaluate_retrieval`)
|
|
21
|
+
- Minitest assertions (`RubricLLM::Assertions`)
|
|
22
|
+
- RSpec matchers (`RubricLLM::RSpecMatchers`)
|
|
23
|
+
- CSV and JSON export for reports
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 David Paluy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
# RubricLLM
|
|
2
|
+
|
|
3
|
+
Lightweight LLM evaluation framework for Ruby, inspired by [Ragas](https://github.com/vibrantlabsai/ragas), powered by [RubyLLM](https://github.com/crmne/ruby_llm).
|
|
4
|
+
|
|
5
|
+
[](https://badge.fury.io/rb/rubric_llm)
|
|
6
|
+
[](https://github.com/dpaluy/rubric_llm/actions/workflows/ci.yml)
|
|
7
|
+
|
|
8
|
+
Provider-agnostic evaluation with pluggable metrics, statistical A/B comparison, and test framework integration — no Rails, no ActiveRecord, no UI. Works anywhere Ruby runs.
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
Add to your Gemfile:
|
|
13
|
+
|
|
14
|
+
```ruby
|
|
15
|
+
gem "rubric_llm"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Or install directly:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
gem install rubric_llm
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
```ruby
|
|
27
|
+
require "rubric_llm"
|
|
28
|
+
|
|
29
|
+
RubricLLM.configure do |c|
|
|
30
|
+
c.judge_model = "gpt-4o"
|
|
31
|
+
c.judge_provider = :openai
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
result = RubricLLM.evaluate(
|
|
35
|
+
question: "What is the capital of France?",
|
|
36
|
+
answer: "The capital of France is Paris, located on the Seine river.",
|
|
37
|
+
context: ["Paris is the capital and largest city of France."],
|
|
38
|
+
ground_truth: "Paris"
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
result.faithfulness # => 0.95
|
|
42
|
+
result.relevance # => 0.92
|
|
43
|
+
result.correctness # => 0.98
|
|
44
|
+
result.overall # => 0.94
|
|
45
|
+
result.pass? # => true
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Configuration
|
|
49
|
+
|
|
50
|
+
### Global
|
|
51
|
+
|
|
52
|
+
```ruby
|
|
53
|
+
RubricLLM.configure do |c|
|
|
54
|
+
c.judge_model = "gpt-4o" # any model RubyLLM supports
|
|
55
|
+
c.judge_provider = :openai # :openai, :anthropic, :gemini, etc.
|
|
56
|
+
c.temperature = 0.0 # deterministic scoring (default)
|
|
57
|
+
c.max_tokens = 4096 # max tokens for judge response
|
|
58
|
+
end
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Environment Variables
|
|
62
|
+
|
|
63
|
+
All config fields can be set via environment variables:
|
|
64
|
+
|
|
65
|
+
| Variable | Default | Description |
|
|
66
|
+
|----------|---------|-------------|
|
|
67
|
+
| `RUBRIC_JUDGE_MODEL` | `gpt-4o` | Judge LLM model name |
|
|
68
|
+
| `RUBRIC_JUDGE_PROVIDER` | `openai` | RubyLLM provider |
|
|
69
|
+
| `RUBRIC_TEMPERATURE` | `0.0` | Judge temperature |
|
|
70
|
+
| `RUBRIC_MAX_TOKENS` | `4096` | Max response tokens |
|
|
71
|
+
| `RUBRIC_MAX_RETRIES` | `2` | Max retries on transient failures |
|
|
72
|
+
| `RUBRIC_RETRY_BASE_DELAY` | `1.0` | Base delay (seconds) for exponential backoff |
|
|
73
|
+
| `RUBRIC_CONCURRENCY` | `1` | Thread pool size for batch evaluation |
|
|
74
|
+
|
|
75
|
+
```ruby
|
|
76
|
+
# Reads all RUBRIC_* env vars automatically
|
|
77
|
+
config = RubricLLM::Config.from_env
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Per-Evaluation Override
|
|
81
|
+
|
|
82
|
+
```ruby
|
|
83
|
+
custom = RubricLLM::Config.new(judge_model: "claude-haiku-4-5", judge_provider: :anthropic)
|
|
84
|
+
|
|
85
|
+
result = RubricLLM.evaluate(question: "...", answer: "...", config: custom)
|
|
86
|
+
report = RubricLLM.evaluate_batch(dataset, config: custom)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Rails Setup
|
|
90
|
+
|
|
91
|
+
```ruby
|
|
92
|
+
# config/initializers/rubric_llm.rb
|
|
93
|
+
RubricLLM.configure do |c|
|
|
94
|
+
c.judge_model = "gpt-4o"
|
|
95
|
+
c.judge_provider = :openai
|
|
96
|
+
end
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Metrics
|
|
100
|
+
|
|
101
|
+
### LLM-as-Judge Metrics
|
|
102
|
+
|
|
103
|
+
These metrics use a judge LLM to evaluate quality. Each sends a structured prompt and parses a JSON response with a 0.0–1.0 score.
|
|
104
|
+
|
|
105
|
+
| Metric | Question it answers | Requires |
|
|
106
|
+
|--------|-------------------|----------|
|
|
107
|
+
| **Faithfulness** | Is every claim in the answer supported by the context? | `context` |
|
|
108
|
+
| **Relevance** | Does the answer address what was asked? | `question` |
|
|
109
|
+
| **Correctness** | Does the answer match the known correct answer? | `ground_truth` |
|
|
110
|
+
| **Context Precision** | Are the retrieved context chunks actually relevant? | `question`, `context` |
|
|
111
|
+
| **Context Recall** | Do the contexts cover the information in the ground truth? | `context`, `ground_truth` |
|
|
112
|
+
| **Factual Accuracy** | Are there factual discrepancies between candidate and reference? | `ground_truth` |
|
|
113
|
+
|
|
114
|
+
```ruby
|
|
115
|
+
# Only context — gets faithfulness, relevance, context_precision
|
|
116
|
+
result = RubricLLM.evaluate(
|
|
117
|
+
question: "How does photosynthesis work?",
|
|
118
|
+
answer: "Plants convert sunlight into energy.",
|
|
119
|
+
context: ["Photosynthesis is the process by which plants convert light energy into chemical energy."]
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
# With ground truth — gets all metrics
|
|
123
|
+
result = RubricLLM.evaluate(
|
|
124
|
+
question: "How does photosynthesis work?",
|
|
125
|
+
answer: "Plants convert sunlight into energy.",
|
|
126
|
+
context: ["Photosynthesis is the process by which plants convert light energy into chemical energy."],
|
|
127
|
+
ground_truth: "Plants use photosynthesis to convert sunlight, water, and CO2 into glucose and oxygen."
|
|
128
|
+
)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Custom Metrics
|
|
132
|
+
|
|
133
|
+
```ruby
|
|
134
|
+
class ToneMetric < RubricLLM::Metrics::Base
|
|
135
|
+
SYSTEM_PROMPT = "Rate professional tone from 0.0 to 1.0. Respond with JSON: {\"score\": 0.0, \"tone\": \"description\"}"
|
|
136
|
+
|
|
137
|
+
def call(answer:, **)
|
|
138
|
+
result = judge_eval(system_prompt: SYSTEM_PROMPT, user_prompt: "Answer: #{answer}")
|
|
139
|
+
return { score: nil, details: result } unless result.is_a?(Hash) && result["score"]
|
|
140
|
+
|
|
141
|
+
{ score: Float(result["score"]), details: { tone: result["tone"] } }
|
|
142
|
+
end
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
result = RubricLLM.evaluate(
|
|
146
|
+
question: "q", answer: "a",
|
|
147
|
+
metrics: [RubricLLM::Metrics::Faithfulness, ToneMetric]
|
|
148
|
+
)
|
|
149
|
+
result.scores[:tone_metric] # => 0.85
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Retrieval Metrics
|
|
153
|
+
|
|
154
|
+
Pure math — no LLM calls, no API key needed.
|
|
155
|
+
|
|
156
|
+
```ruby
|
|
157
|
+
result = RubricLLM.evaluate_retrieval(
|
|
158
|
+
retrieved: ["doc_a", "doc_b", "doc_c", "doc_d"],
|
|
159
|
+
relevant: ["doc_a", "doc_c"]
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
result.precision_at_k(3) # => 0.67
|
|
163
|
+
result.recall_at_k(3) # => 1.0
|
|
164
|
+
result.mrr # => 1.0
|
|
165
|
+
result.ndcg # => 0.86
|
|
166
|
+
result.hit_rate # => 1.0
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Batch Evaluation
|
|
170
|
+
|
|
171
|
+
Evaluate a dataset and get aggregate statistics:
|
|
172
|
+
|
|
173
|
+
```ruby
|
|
174
|
+
dataset = [
|
|
175
|
+
{ question: "What is Ruby?", answer: "A programming language.",
|
|
176
|
+
context: ["Ruby is a dynamic language."], ground_truth: "Ruby is a programming language." },
|
|
177
|
+
{ question: "What is Rails?", answer: "A web framework.",
|
|
178
|
+
context: ["Rails is a web framework for Ruby."], ground_truth: "Rails is a Ruby web framework." },
|
|
179
|
+
# ...
|
|
180
|
+
]
|
|
181
|
+
|
|
182
|
+
report = RubricLLM.evaluate_batch(dataset)
|
|
183
|
+
|
|
184
|
+
# Speed up with concurrent evaluation (thread pool)
|
|
185
|
+
report = RubricLLM.evaluate_batch(dataset, concurrency: 4)
|
|
186
|
+
|
|
187
|
+
puts report.summary
|
|
188
|
+
# RubricLLM Evaluation Report
|
|
189
|
+
# ========================================
|
|
190
|
+
# Samples: 20
|
|
191
|
+
# Duration: 45.2s
|
|
192
|
+
# faithfulness mean=0.920 std=0.050 min=0.850 max=0.980 n=20
|
|
193
|
+
|
|
194
|
+
report.worst(3) # 3 lowest-scoring results
|
|
195
|
+
report.failures(threshold: 0.8) # results below 0.8
|
|
196
|
+
report.export_csv("results.csv") # export to CSV
|
|
197
|
+
report.export_json("results.json") # export to JSON
|
|
198
|
+
report.to_json # returns JSON string
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## A/B Model Comparison
|
|
202
|
+
|
|
203
|
+
Compare two models with statistical significance testing:
|
|
204
|
+
|
|
205
|
+
```ruby
|
|
206
|
+
config_a = RubricLLM::Config.new(judge_model: "gpt-4o")
|
|
207
|
+
config_b = RubricLLM::Config.new(judge_model: "claude-sonnet-4-6")
|
|
208
|
+
|
|
209
|
+
report_a = RubricLLM.evaluate_batch(dataset, config: config_a)
|
|
210
|
+
report_b = RubricLLM.evaluate_batch(dataset, config: config_b)
|
|
211
|
+
|
|
212
|
+
comparison = RubricLLM.compare(report_a, report_b)
|
|
213
|
+
|
|
214
|
+
puts comparison.summary
|
|
215
|
+
# A/B Comparison
|
|
216
|
+
# ======================================================================
|
|
217
|
+
# Metric A B Delta p-value Sig
|
|
218
|
+
# ----------------------------------------------------------------------
|
|
219
|
+
# faithfulness 0.880 0.920 +0.040 0.0230 *
|
|
220
|
+
# relevance 0.850 0.860 +0.010 0.4210
|
|
221
|
+
# correctness 0.910 0.940 +0.030 0.0089 **
|
|
222
|
+
|
|
223
|
+
comparison.significant_improvements # => [:faithfulness, :correctness]
|
|
224
|
+
comparison.significant_regressions # => []
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Significance markers: `*` (p < 0.05), `**` (p < 0.01), `***` (p < 0.001)
|
|
228
|
+
|
|
229
|
+
## Test Integration
|
|
230
|
+
|
|
231
|
+
### Minitest
|
|
232
|
+
|
|
233
|
+
```ruby
|
|
234
|
+
require "rubric_llm/minitest"
|
|
235
|
+
|
|
236
|
+
class AdvisorTest < Minitest::Test
|
|
237
|
+
include RubricLLM::Assertions
|
|
238
|
+
|
|
239
|
+
def test_answer_is_faithful
|
|
240
|
+
answer = my_llm.ask("What is Ruby?", context: docs)
|
|
241
|
+
assert_faithful answer, docs, threshold: 0.8
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
def test_answer_is_correct
|
|
245
|
+
answer = my_llm.ask("What is 2+2?")
|
|
246
|
+
assert_correct answer, "4", threshold: 0.9
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
def test_no_hallucination
|
|
250
|
+
answer = my_llm.ask("Summarize this", context: docs)
|
|
251
|
+
refute_hallucination answer, docs
|
|
252
|
+
end
|
|
253
|
+
|
|
254
|
+
def test_answer_is_relevant
|
|
255
|
+
answer = my_llm.ask("How do I deploy Rails?")
|
|
256
|
+
assert_relevant "How do I deploy Rails?", answer, threshold: 0.7
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### RSpec
|
|
262
|
+
|
|
263
|
+
```ruby
|
|
264
|
+
require "rubric_llm/rspec"
|
|
265
|
+
|
|
266
|
+
RSpec.describe "My LLM" do
|
|
267
|
+
include RubricLLM::RSpecMatchers
|
|
268
|
+
|
|
269
|
+
let(:answer) { my_llm.ask(question, context: docs) }
|
|
270
|
+
|
|
271
|
+
it { expect(answer).to be_faithful_to(docs).with_threshold(0.8) }
|
|
272
|
+
it { expect(answer).to be_relevant_to(question) }
|
|
273
|
+
it { expect(answer).to be_correct_for(expected_answer) }
|
|
274
|
+
it { expect(answer).not_to hallucinate_from(docs) }
|
|
275
|
+
end
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Error Handling
|
|
279
|
+
|
|
280
|
+
```ruby
|
|
281
|
+
begin
|
|
282
|
+
result = RubricLLM.evaluate(question: "q", answer: "a", context: ["c"])
|
|
283
|
+
rescue RubricLLM::JudgeError => e
|
|
284
|
+
# LLM call failed (network, auth, rate limit)
|
|
285
|
+
puts "Judge error: #{e.message}"
|
|
286
|
+
rescue RubricLLM::ConfigurationError => e
|
|
287
|
+
# Invalid configuration
|
|
288
|
+
puts "Config error: #{e.message}"
|
|
289
|
+
rescue RubricLLM::Error => e
|
|
290
|
+
# Catch-all for any RubricLLM error
|
|
291
|
+
puts "Error: #{e.message}"
|
|
292
|
+
end
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
Individual metric failures are handled gracefully — a failed metric returns `nil` for the score and includes the error in details:
|
|
296
|
+
|
|
297
|
+
```ruby
|
|
298
|
+
result = RubricLLM.evaluate(question: "q", answer: "a")
|
|
299
|
+
result.scores[:faithfulness] # => nil (if judge failed)
|
|
300
|
+
result.details[:faithfulness][:error] # => "Judge call failed: ..."
|
|
301
|
+
result.overall # => mean of non-nil scores only
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
## Development
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
bundle install
|
|
308
|
+
bundle exec rake test
|
|
309
|
+
bundle exec rubocop
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## Limitations
|
|
313
|
+
|
|
314
|
+
RubricLLM uses LLM-as-Judge — an LLM scores another LLM's output. This is the industry-standard approach (used by Ragas, DeepEval, ARES), but it means the judge shares the same class of failure modes as the system being evaluated. If the judge hallucinates that an answer is faithful, you get a false positive.
|
|
315
|
+
|
|
316
|
+
Mitigations built into the framework:
|
|
317
|
+
|
|
318
|
+
- **Cross-model judging.** Configure a different model as judge than the one being evaluated. Don't let GPT-4o grade GPT-4o.
|
|
319
|
+
- **Retrieval metrics are pure math.** `precision_at_k`, `recall_at_k`, `mrr`, `ndcg` — no LLM involved, no judge bias.
|
|
320
|
+
- **Custom non-LLM metrics.** Subclass `Metrics::Base` with regex checks, embedding similarity, or any deterministic logic.
|
|
321
|
+
- **Statistical comparison.** A/B testing with paired t-tests surfaces systematic judge bias across runs.
|
|
322
|
+
|
|
323
|
+
For high-stakes evaluation, pair LLM-as-Judge metrics with retrieval metrics and periodic human review.
|
|
324
|
+
|
|
325
|
+
## Why RubricLLM?
|
|
326
|
+
|
|
327
|
+
Ruby has two LLM evaluation options today. Neither fits most use cases:
|
|
328
|
+
|
|
329
|
+
| | [eval-ruby](https://github.com/johannesdwicahyo/eval-ruby) | [leva](https://github.com/kieranklaassen/leva) | RubricLLM |
|
|
330
|
+
|---|---|---|---|
|
|
331
|
+
| **What it is** | Generic RAG metrics | Rails engine with UI | Lightweight eval framework |
|
|
332
|
+
| **LLM access** | Raw HTTP (OpenAI/Anthropic only) | You implement it | RubyLLM (any provider) |
|
|
333
|
+
| **Rails required?** | No | Yes (engine + 6 migrations) | No |
|
|
334
|
+
| **ActiveRecord?** | No | Yes | No |
|
|
335
|
+
| **A/B comparison** | Basic | No | Paired t-test with p-values |
|
|
336
|
+
| **Test assertions** | Minitest + RSpec | No | Minitest + RSpec |
|
|
337
|
+
| **Pluggable metrics** | No (fixed set) | Yes | Yes |
|
|
338
|
+
| **Retrieval metrics** | Yes | No | Yes |
|
|
339
|
+
|
|
340
|
+
## Requirements
|
|
341
|
+
|
|
342
|
+
- Ruby >= 3.4
|
|
343
|
+
- [ruby_llm](https://github.com/crmne/ruby_llm) ~> 1.0
|
|
344
|
+
- An API key for your chosen LLM provider (set via RubyLLM configuration)
|
|
345
|
+
|
|
346
|
+
## Contributing
|
|
347
|
+
|
|
348
|
+
Bug reports and pull requests are welcome on [GitHub](https://github.com/dpaluy/rubric_llm).
|
|
349
|
+
|
|
350
|
+
## License
|
|
351
|
+
|
|
352
|
+
[MIT](LICENSE.txt)
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubricLLM
|
|
4
|
+
class Comparison
|
|
5
|
+
attr_reader :report_a, :report_b
|
|
6
|
+
|
|
7
|
+
def initialize(report_a, report_b)
|
|
8
|
+
@report_a = report_a
|
|
9
|
+
@report_b = report_b
|
|
10
|
+
|
|
11
|
+
return if report_a.results.size == report_b.results.size
|
|
12
|
+
|
|
13
|
+
warn "[RubricLLM] Comparison reports have different sizes " \
|
|
14
|
+
"(#{report_a.results.size} vs #{report_b.results.size}). " \
|
|
15
|
+
"Unmatched pairs will be dropped."
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def results
|
|
19
|
+
@results ||= compute_results
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def summary
|
|
23
|
+
lines = ["A/B Comparison"]
|
|
24
|
+
lines << ("=" * 70)
|
|
25
|
+
lines << "Metric A B Delta p-value Sig"
|
|
26
|
+
lines << ("-" * 70)
|
|
27
|
+
|
|
28
|
+
results.each do |metric, r|
|
|
29
|
+
lines << format("%-20s %8.3f %8.3f %+8.3f %10.4f %4s",
|
|
30
|
+
metric, r[:mean_a], r[:mean_b], r[:delta], r[:p_value], r[:significance])
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
lines.join("\n")
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def significant_improvements(alpha: 0.05)
|
|
37
|
+
results.select { |_, r| r[:p_value] < alpha && r[:delta].positive? }.keys
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def significant_regressions(alpha: 0.05)
|
|
41
|
+
results.select { |_, r| r[:p_value] < alpha && r[:delta].negative? }.keys
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
private
|
|
45
|
+
|
|
46
|
+
def compute_results
|
|
47
|
+
metrics = (report_a.metric_stats.keys | report_b.metric_stats.keys)
|
|
48
|
+
|
|
49
|
+
metrics.each_with_object({}) do |metric, hash|
|
|
50
|
+
paired_scores = report_a.scores_for(metric)
|
|
51
|
+
.zip(report_b.scores_for(metric))
|
|
52
|
+
.select { |score_a, score_b| !score_a.nil? && !score_b.nil? }
|
|
53
|
+
|
|
54
|
+
next if paired_scores.empty?
|
|
55
|
+
|
|
56
|
+
scores_a, scores_b = paired_scores.transpose
|
|
57
|
+
|
|
58
|
+
mean_a = scores_a.sum / scores_a.size.to_f
|
|
59
|
+
mean_b = scores_b.sum / scores_b.size.to_f
|
|
60
|
+
delta = mean_b - mean_a
|
|
61
|
+
p_value = paired_t_test(scores_a, scores_b)
|
|
62
|
+
|
|
63
|
+
hash[metric] = {
|
|
64
|
+
mean_a:,
|
|
65
|
+
mean_b:,
|
|
66
|
+
delta:,
|
|
67
|
+
p_value:,
|
|
68
|
+
significance: significance_marker(p_value)
|
|
69
|
+
}
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def paired_t_test(a, b)
|
|
74
|
+
n = [a.size, b.size].min
|
|
75
|
+
return 1.0 if n < 2
|
|
76
|
+
|
|
77
|
+
diffs = a.first(n).zip(b.first(n)).map { |x, y| y - x }
|
|
78
|
+
mean_d = diffs.sum / n.to_f
|
|
79
|
+
var_d = diffs.sum { |d| (d - mean_d)**2 } / (n - 1).to_f
|
|
80
|
+
se = Math.sqrt(var_d / n)
|
|
81
|
+
|
|
82
|
+
return 1.0 if se.zero?
|
|
83
|
+
|
|
84
|
+
t = mean_d / se
|
|
85
|
+
df = n - 1
|
|
86
|
+
|
|
87
|
+
# Two-tailed p-value approximation using Student's t-distribution
|
|
88
|
+
two_tailed_p(t.abs, df)
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Two-tailed p-value for Student's t-distribution.
|
|
92
|
+
# p = I_x(df/2, 1/2) where x = df/(df + t²)
|
|
93
|
+
def two_tailed_p(t, df)
|
|
94
|
+
x = df / (df + (t**2))
|
|
95
|
+
regularized_beta(x, df / 2.0, 0.5)
|
|
96
|
+
rescue StandardError
|
|
97
|
+
1.0
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# Regularized incomplete beta function via continued fraction (Lentz's method).
|
|
101
|
+
def regularized_beta(x, a, b)
|
|
102
|
+
return 0.0 if x <= 0.0
|
|
103
|
+
return 1.0 if x >= 1.0
|
|
104
|
+
|
|
105
|
+
ln_beta = Math.lgamma(a)[0] + Math.lgamma(b)[0] - Math.lgamma(a + b)[0]
|
|
106
|
+
front = Math.exp((a * Math.log(x)) + (b * Math.log(1.0 - x)) - ln_beta) / a
|
|
107
|
+
|
|
108
|
+
# Lentz's continued fraction
|
|
109
|
+
c = 1.0
|
|
110
|
+
d = 1.0 - ((a + b) * x / (a + 1.0))
|
|
111
|
+
d = 1.0 if d.abs < 1e-30
|
|
112
|
+
d = 1.0 / d
|
|
113
|
+
f = d
|
|
114
|
+
|
|
115
|
+
(1..200).each do |m|
|
|
116
|
+
# Even step
|
|
117
|
+
numerator = m * (b - m) * x / ((a + (2 * m) - 1) * (a + (2 * m)))
|
|
118
|
+
d = 1.0 + (numerator * d)
|
|
119
|
+
d = 1e-30 if d.abs < 1e-30
|
|
120
|
+
c = 1.0 + (numerator / c)
|
|
121
|
+
c = 1e-30 if c.abs < 1e-30
|
|
122
|
+
d = 1.0 / d
|
|
123
|
+
f *= c * d
|
|
124
|
+
|
|
125
|
+
# Odd step
|
|
126
|
+
numerator = -(a + m) * (a + b + m) * x / ((a + (2 * m)) * (a + (2 * m) + 1))
|
|
127
|
+
d = 1.0 + (numerator * d)
|
|
128
|
+
d = 1e-30 if d.abs < 1e-30
|
|
129
|
+
c = 1.0 + (numerator / c)
|
|
130
|
+
c = 1e-30 if c.abs < 1e-30
|
|
131
|
+
d = 1.0 / d
|
|
132
|
+
delta = c * d
|
|
133
|
+
f *= delta
|
|
134
|
+
|
|
135
|
+
break if (delta - 1.0).abs < 1e-10
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
front * f
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def significance_marker(p)
|
|
142
|
+
if p < 0.001 then "***"
|
|
143
|
+
elsif p < 0.01 then "**"
|
|
144
|
+
elsif p < 0.05 then "*"
|
|
145
|
+
else ""
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
end
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true

module RubricLLM
  # Holds every tunable for judge LLM calls. Each field resolves in order:
  # explicitly passed keyword, then the matching RUBRIC_* environment
  # variable, then a hard-coded default.
  class Config
    attr_accessor :judge_model, :judge_provider, :temperature, :max_tokens, :custom_prompt,
                  :max_retries, :retry_base_delay, :concurrency

    # Ordered validation steps executed by #validate!.
    VALIDATION_STEPS = %i[
      validate_judge_model
      validate_judge_provider
      validate_temperature
      validate_max_tokens
      validate_max_retries
      validate_retry_base_delay
      validate_concurrency
    ].freeze
    private_constant :VALIDATION_STEPS

    # Pass `validate: true` to run #validate! immediately after assignment.
    def initialize(judge_model: nil, judge_provider: nil, temperature: nil, max_tokens: nil, # rubocop:disable Metrics/ParameterLists
                   custom_prompt: nil, max_retries: nil, retry_base_delay: nil, concurrency: nil, validate: false)
      @judge_model = judge_model || ENV.fetch("RUBRIC_JUDGE_MODEL", "gpt-4o")
      # Provider may arrive as a String (e.g. from ENV); always store a Symbol.
      @judge_provider = (judge_provider || ENV.fetch("RUBRIC_JUDGE_PROVIDER", "openai")).to_sym
      @temperature = temperature || Float(ENV.fetch("RUBRIC_TEMPERATURE", "0.0"))
      @max_tokens = max_tokens || Integer(ENV.fetch("RUBRIC_MAX_TOKENS", "4096"))
      @custom_prompt = custom_prompt
      @max_retries = max_retries || Integer(ENV.fetch("RUBRIC_MAX_RETRIES", "2"))
      @retry_base_delay = retry_base_delay || Float(ENV.fetch("RUBRIC_RETRY_BASE_DELAY", "1.0"))
      @concurrency = concurrency || Integer(ENV.fetch("RUBRIC_CONCURRENCY", "1"))
      validate! if validate
    end

    # Builds a config entirely from RUBRIC_* environment variables / defaults.
    def self.from_env = new

    # Runs every validation step in order; raises ConfigurationError on the
    # first invalid field. Returns self so calls can be chained.
    def validate!
      VALIDATION_STEPS.each { |step| send(step) }
      self
    end

    # @return [Hash] all fields as a symbol-keyed hash
    def to_h
      {
        judge_model: judge_model,
        judge_provider: judge_provider,
        temperature: temperature,
        max_tokens: max_tokens,
        custom_prompt: custom_prompt,
        max_retries: max_retries,
        retry_base_delay: retry_base_delay,
        concurrency: concurrency
      }
    end

    private

    def validate_judge_model
      present = !judge_model.nil? && !judge_model.to_s.strip.empty?
      raise ConfigurationError, "judge_model must be a non-empty string" unless present
    end

    def validate_judge_provider
      raise ConfigurationError, "judge_provider must be a symbol" unless judge_provider.is_a?(Symbol)
    end

    def validate_temperature
      in_range = temperature.is_a?(Numeric) && temperature.between?(0.0, 2.0)
      raise ConfigurationError, "temperature must be between 0.0 and 2.0" unless in_range
    end

    def validate_max_tokens
      valid = max_tokens.is_a?(Integer) && max_tokens.positive?
      raise ConfigurationError, "max_tokens must be a positive integer" unless valid
    end

    def validate_max_retries
      valid = max_retries.is_a?(Integer) && max_retries >= 0
      raise ConfigurationError, "max_retries must be a non-negative integer" unless valid
    end

    def validate_retry_base_delay
      valid = retry_base_delay.is_a?(Numeric) && retry_base_delay >= 0
      raise ConfigurationError, "retry_base_delay must be a non-negative number" unless valid
    end

    def validate_concurrency
      valid = concurrency.is_a?(Integer) && concurrency.positive?
      raise ConfigurationError, "concurrency must be a positive integer" unless valid
    end
  end
end
|