rubric_llm 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a549a52da8585cfbdf8287315548389de6e93b1b0adca9b8fb32fedde3d966e5
4
- data.tar.gz: 3de825ff22b9c4b3dc091cc17ef089e504f53238857d30393b1951d55884c38d
3
+ metadata.gz: 52a6704521634882ccc1d1779c20e2b65587f398b2cfd00b3c7892d9a97706e1
4
+ data.tar.gz: 3410606f13f439af21457ac9714b0e67a33752a3231854ccd0f73e759d4ac209
5
5
  SHA512:
6
- metadata.gz: 004aff76b7b92d2266d75ec54b897d240cef7a86f424cbca8d1904d7e16eeb1821900c722e6df4f20be511936f90993e73b4e3aebd436e9d82f24ee558f12996
7
- data.tar.gz: eacc3c28cb3d3504323dbbb491c3e9b8af6625520308cb319cab2d5164868ac56a52c3daed5a603e496c092e53a2c1a7b1f6f32754b9dc1c230ec0aebca30be9
6
+ metadata.gz: 17a9e4d4308627889576b29fbbdb91a93a12e702847780e21fe070cb499ac06d243fa5e688d520e4fd43827eb05ba86b85eafb039598160e31e273838ab69f1c
7
+ data.tar.gz: 86b21112726b41c2eef29cc2e0fc0f6b6b4d742a6c2797d46f391d0292fb2ad8477d3792029d1b90f562425118f52df496e998f6d434128c4b4017756aaa217d
data/CHANGELOG.md CHANGED
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.1.1] - 2026-03-24
11
+
12
+ ### Fixed
13
+
14
+ - Use RubyLLM system instructions instead of the attachment API when calling the judge
15
+ - Roll back invalid global configuration changes when `RubricLLM.configure` validation fails
16
+ - Accept string-keyed batch dataset hashes in sequential and concurrent evaluation
17
+ - Stabilize Student's t-test p-value calculation for small deltas and ordinary sample sizes
18
+
10
19
  ## [0.1.0] - 2026-03-24
11
20
 
12
21
  ### Added
@@ -102,40 +102,54 @@ module RubricLLM
102
102
  return 0.0 if x <= 0.0
103
103
  return 1.0 if x >= 1.0
104
104
 
105
- ln_beta = Math.lgamma(a)[0] + Math.lgamma(b)[0] - Math.lgamma(a + b)[0]
106
- front = Math.exp((a * Math.log(x)) + (b * Math.log(1.0 - x)) - ln_beta) / a
105
+ ln_beta = Math.lgamma(a + b)[0] - Math.lgamma(a)[0] - Math.lgamma(b)[0]
106
+ front = Math.exp(ln_beta + (a * Math.log(x)) + (b * Math.log(1.0 - x)))
107
+
108
+ result = if x < ((a + 1.0) / (a + b + 2.0))
109
+ front * beta_continued_fraction(a, b, x) / a
110
+ else
111
+ 1.0 - ((front * beta_continued_fraction(b, a, 1.0 - x)) / b)
112
+ end
113
+
114
+ result.clamp(0.0, 1.0)
115
+ end
116
+
117
+ def beta_continued_fraction(a, b, x)
118
+ tiny = 1e-30
119
+ qab = a + b
120
+ qap = a + 1.0
121
+ qam = a - 1.0
107
122
 
108
- # Lentz's continued fraction
109
123
  c = 1.0
110
- d = 1.0 - ((a + b) * x / (a + 1.0))
111
- d = 1.0 if d.abs < 1e-30
124
+ d = 1.0 - ((qab * x) / qap)
125
+ d = tiny if d.abs < tiny
112
126
  d = 1.0 / d
113
- f = d
127
+ fraction = d
114
128
 
115
129
  (1..200).each do |m|
116
- # Even step
117
- numerator = m * (b - m) * x / ((a + (2 * m) - 1) * (a + (2 * m)))
130
+ m2 = 2 * m
131
+
132
+ numerator = (m * (b - m) * x) / ((qam + m2) * (a + m2))
118
133
  d = 1.0 + (numerator * d)
119
- d = 1e-30 if d.abs < 1e-30
134
+ d = tiny if d.abs < tiny
120
135
  c = 1.0 + (numerator / c)
121
- c = 1e-30 if c.abs < 1e-30
136
+ c = tiny if c.abs < tiny
122
137
  d = 1.0 / d
123
- f *= c * d
138
+ fraction *= c * d
124
139
 
125
- # Odd step
126
- numerator = -(a + m) * (a + b + m) * x / ((a + (2 * m)) * (a + (2 * m) + 1))
140
+ numerator = -((a + m) * (qab + m) * x) / ((a + m2) * (qap + m2))
127
141
  d = 1.0 + (numerator * d)
128
- d = 1e-30 if d.abs < 1e-30
142
+ d = tiny if d.abs < tiny
129
143
  c = 1.0 + (numerator / c)
130
- c = 1e-30 if c.abs < 1e-30
144
+ c = tiny if c.abs < tiny
131
145
  d = 1.0 / d
132
146
  delta = c * d
133
- f *= delta
147
+ fraction *= delta
134
148
 
135
- break if (delta - 1.0).abs < 1e-10
149
+ break if (delta - 1.0).abs < 1e-12
136
150
  end
137
151
 
138
- front * f
152
+ fraction
139
153
  end
140
154
 
141
155
  def significance_marker(p)
@@ -22,7 +22,8 @@ module RubricLLM
22
22
  chat.with_params(max_tokens: config.max_tokens)
23
23
 
24
24
  full_system_prompt = build_system_prompt(system_prompt)
25
- response = chat.ask(user_prompt, with: full_system_prompt)
25
+ chat.with_instructions(full_system_prompt)
26
+ response = chat.ask(user_prompt)
26
27
  parse_json(response.content)
27
28
  rescue StandardError => e
28
29
  raise JudgeError, "Judge call failed: #{e.message}" if attempts > config.max_retries
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RubricLLM
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
data/lib/rubric_llm.rb CHANGED
@@ -26,8 +26,10 @@ module RubricLLM
26
26
  end
27
27
 
28
28
  def configure
29
- yield(config)
30
- config.validate!
29
+ new_config = Config.new(**config.to_h)
30
+ yield(new_config)
31
+ new_config.validate!
32
+ @config = new_config
31
33
  end
32
34
 
33
35
  def reset_configuration!
@@ -86,6 +88,7 @@ module RubricLLM
86
88
  private
87
89
 
88
90
  def evaluate_sample(evaluator, sample)
91
+ sample = normalize_sample(sample)
89
92
  evaluator.call(
90
93
  question: sample[:question],
91
94
  answer: sample[:answer],
@@ -122,5 +125,9 @@ module RubricLLM
122
125
 
123
126
  Config.new(**config.to_h.compact, custom_prompt:)
124
127
  end
128
+
129
+ def normalize_sample(sample)
130
+ sample.transform_keys(&:to_sym)
131
+ end
125
132
  end
126
133
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubric_llm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Paluy
@@ -79,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
79
  - !ruby/object:Gem::Version
80
80
  version: '0'
81
81
  requirements: []
82
- rubygems_version: 4.0.6
82
+ rubygems_version: 3.6.9
83
83
  specification_version: 4
84
84
  summary: Lightweight LLM evaluation framework for Ruby
85
85
  test_files: []