rubric_llm 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/lib/rubric_llm/comparison.rb +32 -18
- data/lib/rubric_llm/judge.rb +2 -1
- data/lib/rubric_llm/version.rb +1 -1
- data/lib/rubric_llm.rb +9 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 52a6704521634882ccc1d1779c20e2b65587f398b2cfd00b3c7892d9a97706e1
|
|
4
|
+
data.tar.gz: 3410606f13f439af21457ac9714b0e67a33752a3231854ccd0f73e759d4ac209
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 17a9e4d4308627889576b29fbbdb91a93a12e702847780e21fe070cb499ac06d243fa5e688d520e4fd43827eb05ba86b85eafb039598160e31e273838ab69f1c
|
|
7
|
+
data.tar.gz: 86b21112726b41c2eef29cc2e0fc0f6b6b4d742a6c2797d46f391d0292fb2ad8477d3792029d1b90f562425118f52df496e998f6d434128c4b4017756aaa217d
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.1.1] - 2026-03-24
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
- Use RubyLLM system instructions instead of the attachment API when calling the judge
|
|
15
|
+
- Roll back invalid global configuration changes when `RubricLLM.configure` validation fails
|
|
16
|
+
- Accept string-keyed batch dataset hashes in sequential and concurrent evaluation
|
|
17
|
+
- Stabilize Student's t-test p-value calculation for small deltas and ordinary sample sizes
|
|
18
|
+
|
|
10
19
|
## [0.1.0] - 2026-03-24
|
|
11
20
|
|
|
12
21
|
### Added
|
data/lib/rubric_llm/comparison.rb
CHANGED
|
@@ -102,40 +102,54 @@ module RubricLLM
|
|
|
102
102
|
return 0.0 if x <= 0.0
|
|
103
103
|
return 1.0 if x >= 1.0
|
|
104
104
|
|
|
105
|
-
ln_beta = Math.lgamma(a)[0]
|
|
106
|
-
front = Math.exp((a * Math.log(x)) + (b * Math.log(1.0 - x))
|
|
105
|
+
ln_beta = Math.lgamma(a + b)[0] - Math.lgamma(a)[0] - Math.lgamma(b)[0]
|
|
106
|
+
front = Math.exp(ln_beta + (a * Math.log(x)) + (b * Math.log(1.0 - x)))
|
|
107
|
+
|
|
108
|
+
result = if x < ((a + 1.0) / (a + b + 2.0))
|
|
109
|
+
front * beta_continued_fraction(a, b, x) / a
|
|
110
|
+
else
|
|
111
|
+
1.0 - ((front * beta_continued_fraction(b, a, 1.0 - x)) / b)
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
result.clamp(0.0, 1.0)
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def beta_continued_fraction(a, b, x)
|
|
118
|
+
tiny = 1e-30
|
|
119
|
+
qab = a + b
|
|
120
|
+
qap = a + 1.0
|
|
121
|
+
qam = a - 1.0
|
|
107
122
|
|
|
108
|
-
# Lentz's continued fraction
|
|
109
123
|
c = 1.0
|
|
110
|
-
d = 1.0 - ((
|
|
111
|
-
d =
|
|
124
|
+
d = 1.0 - ((qab * x) / qap)
|
|
125
|
+
d = tiny if d.abs < tiny
|
|
112
126
|
d = 1.0 / d
|
|
113
|
-
|
|
127
|
+
fraction = d
|
|
114
128
|
|
|
115
129
|
(1..200).each do |m|
|
|
116
|
-
|
|
117
|
-
|
|
130
|
+
m2 = 2 * m
|
|
131
|
+
|
|
132
|
+
numerator = (m * (b - m) * x) / ((qam + m2) * (a + m2))
|
|
118
133
|
d = 1.0 + (numerator * d)
|
|
119
|
-
d =
|
|
134
|
+
d = tiny if d.abs < tiny
|
|
120
135
|
c = 1.0 + (numerator / c)
|
|
121
|
-
c =
|
|
136
|
+
c = tiny if c.abs < tiny
|
|
122
137
|
d = 1.0 / d
|
|
123
|
-
|
|
138
|
+
fraction *= c * d
|
|
124
139
|
|
|
125
|
-
|
|
126
|
-
numerator = -(a + m) * (a + b + m) * x / ((a + (2 * m)) * (a + (2 * m) + 1))
|
|
140
|
+
numerator = -((a + m) * (qab + m) * x) / ((a + m2) * (qap + m2))
|
|
127
141
|
d = 1.0 + (numerator * d)
|
|
128
|
-
d =
|
|
142
|
+
d = tiny if d.abs < tiny
|
|
129
143
|
c = 1.0 + (numerator / c)
|
|
130
|
-
c =
|
|
144
|
+
c = tiny if c.abs < tiny
|
|
131
145
|
d = 1.0 / d
|
|
132
146
|
delta = c * d
|
|
133
|
-
|
|
147
|
+
fraction *= delta
|
|
134
148
|
|
|
135
|
-
break if (delta - 1.0).abs < 1e-
|
|
149
|
+
break if (delta - 1.0).abs < 1e-12
|
|
136
150
|
end
|
|
137
151
|
|
|
138
|
-
|
|
152
|
+
fraction
|
|
139
153
|
end
|
|
140
154
|
|
|
141
155
|
def significance_marker(p)
|
data/lib/rubric_llm/judge.rb
CHANGED
|
@@ -22,7 +22,8 @@ module RubricLLM
|
|
|
22
22
|
chat.with_params(max_tokens: config.max_tokens)
|
|
23
23
|
|
|
24
24
|
full_system_prompt = build_system_prompt(system_prompt)
|
|
25
|
-
|
|
25
|
+
chat.with_instructions(full_system_prompt)
|
|
26
|
+
response = chat.ask(user_prompt)
|
|
26
27
|
parse_json(response.content)
|
|
27
28
|
rescue StandardError => e
|
|
28
29
|
raise JudgeError, "Judge call failed: #{e.message}" if attempts > config.max_retries
|
data/lib/rubric_llm/version.rb
CHANGED
data/lib/rubric_llm.rb
CHANGED
|
@@ -26,8 +26,10 @@ module RubricLLM
|
|
|
26
26
|
end
|
|
27
27
|
|
|
28
28
|
def configure
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
new_config = Config.new(**config.to_h)
|
|
30
|
+
yield(new_config)
|
|
31
|
+
new_config.validate!
|
|
32
|
+
@config = new_config
|
|
31
33
|
end
|
|
32
34
|
|
|
33
35
|
def reset_configuration!
|
|
@@ -86,6 +88,7 @@ module RubricLLM
|
|
|
86
88
|
private
|
|
87
89
|
|
|
88
90
|
def evaluate_sample(evaluator, sample)
|
|
91
|
+
sample = normalize_sample(sample)
|
|
89
92
|
evaluator.call(
|
|
90
93
|
question: sample[:question],
|
|
91
94
|
answer: sample[:answer],
|
|
@@ -122,5 +125,9 @@ module RubricLLM
|
|
|
122
125
|
|
|
123
126
|
Config.new(**config.to_h.compact, custom_prompt:)
|
|
124
127
|
end
|
|
128
|
+
|
|
129
|
+
def normalize_sample(sample)
|
|
130
|
+
sample.transform_keys(&:to_sym)
|
|
131
|
+
end
|
|
125
132
|
end
|
|
126
133
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rubric_llm
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Paluy
|
|
@@ -79,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
79
79
|
- !ruby/object:Gem::Version
|
|
80
80
|
version: '0'
|
|
81
81
|
requirements: []
|
|
82
|
-
rubygems_version:
|
|
82
|
+
rubygems_version: 3.6.9
|
|
83
83
|
specification_version: 4
|
|
84
84
|
summary: Lightweight LLM evaluation framework for Ruby
|
|
85
85
|
test_files: []
|