ruby_llm-contract 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubycritic.yml +8 -0
- data/.simplecov +22 -0
- data/CHANGELOG.md +19 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +104 -2
- data/README.md +42 -2
- data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
- data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
- data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
- data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
- data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
- data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
- data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
- data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
- data/lib/ruby_llm/contract/contract/validator.rb +9 -0
- data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
- data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
- data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
- data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
- data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
- data/lib/ruby_llm/contract/eval/report.rb +19 -191
- data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
- data/lib/ruby_llm/contract/eval/runner.rb +30 -207
- data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
- data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
- data/lib/ruby_llm/contract/eval.rb +13 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
- data/lib/ruby_llm/contract/rspec.rb +5 -0
- data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
- data/lib/ruby_llm/contract/step/base.rb +93 -38
- data/lib/ruby_llm/contract/step/dsl.rb +10 -0
- data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
- data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
- data/lib/ruby_llm/contract/step/result.rb +3 -2
- data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
- data/lib/ruby_llm/contract/step/runner.rb +46 -85
- data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
- data/lib/ruby_llm/contract/step.rb +5 -0
- data/lib/ruby_llm/contract/version.rb +1 -1
- metadata +28 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2e6861a314beadad8064d5fe08b85dc0f94032987ba41c27fc9a640788f10c28
|
|
4
|
+
data.tar.gz: 7efb88142ef8ac8287ed58f6cc9bc93ca78130084e14f8310981e7f90faf9943
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1bd259ab22d13b2e7cc9848c401b91ebfaa135dc7502b72074fd28e71e6120f7ea183c09361948bf8d68df9cd5190e9450a3f03954bbd2927f8091c987368bb7
|
|
7
|
+
data.tar.gz: 904185a06d1def6513268033cbe87b30e3af9b92e0a26ec3cf89c93ad88450c4d49aa13d79c2428b1f7f6cc1cbffefeccf146b5a5c7ddf0057ab3db2f1b7dc8d
|
data/.rubycritic.yml
ADDED
data/.simplecov
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "simplecov"
|
|
4
|
+
|
|
5
|
+
SimpleCov.start do
|
|
6
|
+
enable_coverage :branch
|
|
7
|
+
primary_coverage :branch
|
|
8
|
+
|
|
9
|
+
add_filter "/spec/"
|
|
10
|
+
add_filter "/examples/"
|
|
11
|
+
add_filter "/internal/"
|
|
12
|
+
add_filter "/tmp/"
|
|
13
|
+
|
|
14
|
+
track_files "lib/**/*.rb"
|
|
15
|
+
|
|
16
|
+
if ENV["CI"] == "true" || ENV["SIMPLECOV_STRICT"] == "1"
|
|
17
|
+
minimum_coverage line: 89
|
|
18
|
+
minimum_coverage branch: 75
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
command_name "RSpec"
|
|
22
|
+
end
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,24 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.5.0 (2026-03-25)
|
|
4
|
+
|
|
5
|
+
Data-Driven Prompt Engineering — see ADR-0015.
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
- **`observe` DSL** — soft observations that log but never fail. `observe("scores differ") { |o| o[:a] != o[:b] }`. Results in `result.observations`. Logged via `Contract.logger` when they fail. Runs only when validation passes.
|
|
10
|
+
- **`compare_with`** — prompt A/B testing. `StepV2.compare_with(StepV1, eval: "regression", model: "nano")` returns `PromptDiff` with `improvements`, `regressions`, `score_delta`, `safe_to_switch?`. Reuses `BaselineDiff` internally.
|
|
11
|
+
- **RSpec `compared_with` chain** — `expect(StepV2).to pass_eval("x").compared_with(StepV1).without_regressions` blocks merge if new prompt regresses any case.
|
|
12
|
+
|
|
13
|
+
### Game changer continuity
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
v0.2: "Which model?" → compare_models (snapshot)
|
|
17
|
+
v0.3: "Did it change?" → baseline regression (binary)
|
|
18
|
+
v0.4: "Show me the trend" → eval history (time series)
|
|
19
|
+
v0.5: "Which prompt is better?" → compare_with (A/B testing)
|
|
20
|
+
```
|
|
21
|
+
|
|
3
22
|
## 0.4.5 (2026-03-24)
|
|
4
23
|
|
|
5
24
|
Audit hardening — 18 bugs fixed across 4 audit rounds.
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
ruby_llm-contract (0.4.5)
|
|
4
|
+
ruby_llm-contract (0.5.0)
|
|
5
5
|
dry-types (~> 1.7)
|
|
6
6
|
ruby_llm (~> 1.0)
|
|
7
7
|
ruby_llm-schema (~> 0.3)
|
|
@@ -12,20 +12,43 @@ GEM
|
|
|
12
12
|
addressable (2.8.9)
|
|
13
13
|
public_suffix (>= 2.0.2, < 8.0)
|
|
14
14
|
ast (2.4.3)
|
|
15
|
+
axiom-types (0.1.1)
|
|
16
|
+
descendants_tracker (~> 0.0.4)
|
|
17
|
+
ice_nine (~> 0.11.0)
|
|
18
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
|
15
19
|
base64 (0.3.0)
|
|
16
20
|
bigdecimal (4.0.1)
|
|
21
|
+
childprocess (5.1.0)
|
|
22
|
+
logger (~> 1.5)
|
|
23
|
+
coercible (1.0.0)
|
|
24
|
+
descendants_tracker (~> 0.0.1)
|
|
17
25
|
concurrent-ruby (1.3.6)
|
|
26
|
+
descendants_tracker (0.0.4)
|
|
27
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
|
18
28
|
diff-lcs (1.6.2)
|
|
29
|
+
docile (1.4.1)
|
|
30
|
+
dry-configurable (1.3.0)
|
|
31
|
+
dry-core (~> 1.1)
|
|
32
|
+
zeitwerk (~> 2.6)
|
|
19
33
|
dry-core (1.2.0)
|
|
20
34
|
concurrent-ruby (~> 1.0)
|
|
21
35
|
logger
|
|
22
36
|
zeitwerk (~> 2.6)
|
|
23
37
|
dry-inflector (1.3.1)
|
|
38
|
+
dry-initializer (3.2.0)
|
|
24
39
|
dry-logic (1.6.0)
|
|
25
40
|
bigdecimal
|
|
26
41
|
concurrent-ruby (~> 1.0)
|
|
27
42
|
dry-core (~> 1.1)
|
|
28
43
|
zeitwerk (~> 2.6)
|
|
44
|
+
dry-schema (1.16.0)
|
|
45
|
+
concurrent-ruby (~> 1.0)
|
|
46
|
+
dry-configurable (~> 1.0, >= 1.0.1)
|
|
47
|
+
dry-core (~> 1.1)
|
|
48
|
+
dry-initializer (~> 3.2)
|
|
49
|
+
dry-logic (~> 1.6)
|
|
50
|
+
dry-types (~> 1.9, >= 1.9.1)
|
|
51
|
+
zeitwerk (~> 2.6)
|
|
29
52
|
dry-types (1.9.1)
|
|
30
53
|
bigdecimal (>= 3.0)
|
|
31
54
|
concurrent-ruby (~> 1.0)
|
|
@@ -33,6 +56,7 @@ GEM
|
|
|
33
56
|
dry-inflector (~> 1.0)
|
|
34
57
|
dry-logic (~> 1.4)
|
|
35
58
|
zeitwerk (~> 2.6)
|
|
59
|
+
erubi (1.13.1)
|
|
36
60
|
event_stream_parser (1.0.0)
|
|
37
61
|
faraday (2.14.1)
|
|
38
62
|
faraday-net_http (>= 2.0, < 3.5)
|
|
@@ -44,11 +68,25 @@ GEM
|
|
|
44
68
|
net-http (~> 0.5)
|
|
45
69
|
faraday-retry (2.4.0)
|
|
46
70
|
faraday (~> 2.0)
|
|
71
|
+
flay (2.14.3)
|
|
72
|
+
erubi (~> 1.10)
|
|
73
|
+
path_expander (~> 2.0)
|
|
74
|
+
prism (~> 1.7)
|
|
75
|
+
sexp_processor (~> 4.0)
|
|
76
|
+
flog (4.9.4)
|
|
77
|
+
path_expander (~> 2.0)
|
|
78
|
+
prism (~> 1.7)
|
|
79
|
+
sexp_processor (~> 4.8)
|
|
80
|
+
ice_nine (0.11.2)
|
|
47
81
|
json (2.19.2)
|
|
48
82
|
json-schema (6.2.0)
|
|
49
83
|
addressable (~> 2.8)
|
|
50
84
|
bigdecimal (>= 3.1, < 5)
|
|
51
85
|
language_server-protocol (3.17.0.5)
|
|
86
|
+
launchy (3.1.1)
|
|
87
|
+
addressable (~> 2.8)
|
|
88
|
+
childprocess (~> 5.0)
|
|
89
|
+
logger (~> 1.6)
|
|
52
90
|
lint_roller (1.1.0)
|
|
53
91
|
logger (1.7.0)
|
|
54
92
|
marcel (1.1.0)
|
|
@@ -61,12 +99,20 @@ GEM
|
|
|
61
99
|
parser (3.3.10.2)
|
|
62
100
|
ast (~> 2.4.1)
|
|
63
101
|
racc
|
|
102
|
+
path_expander (2.0.1)
|
|
64
103
|
prism (1.9.0)
|
|
65
104
|
public_suffix (7.0.5)
|
|
66
105
|
racc (1.8.1)
|
|
67
106
|
rainbow (3.1.1)
|
|
68
107
|
rake (13.3.1)
|
|
108
|
+
reek (6.5.0)
|
|
109
|
+
dry-schema (~> 1.13)
|
|
110
|
+
logger (~> 1.6)
|
|
111
|
+
parser (~> 3.3.0)
|
|
112
|
+
rainbow (>= 2.0, < 4.0)
|
|
113
|
+
rexml (~> 3.1)
|
|
69
114
|
regexp_parser (2.11.3)
|
|
115
|
+
rexml (3.4.4)
|
|
70
116
|
rspec (3.13.2)
|
|
71
117
|
rspec-core (~> 3.13.0)
|
|
72
118
|
rspec-expectations (~> 3.13.0)
|
|
@@ -107,10 +153,39 @@ GEM
|
|
|
107
153
|
ruby_llm-schema (~> 0)
|
|
108
154
|
zeitwerk (~> 2)
|
|
109
155
|
ruby_llm-schema (0.3.0)
|
|
156
|
+
ruby_parser (3.22.0)
|
|
157
|
+
racc (~> 1.5)
|
|
158
|
+
sexp_processor (~> 4.16)
|
|
159
|
+
rubycritic (4.12.0)
|
|
160
|
+
flay (~> 2.13)
|
|
161
|
+
flog (~> 4.7)
|
|
162
|
+
launchy (>= 2.5.2)
|
|
163
|
+
parser (>= 3.3.0.5)
|
|
164
|
+
prism (>= 1.6.0)
|
|
165
|
+
rainbow (~> 3.1.1)
|
|
166
|
+
reek (~> 6.5.0, < 7.0)
|
|
167
|
+
rexml
|
|
168
|
+
ruby_parser (~> 3.21)
|
|
169
|
+
simplecov (>= 0.22.0)
|
|
170
|
+
tty-which (~> 0.5.0)
|
|
171
|
+
virtus (~> 2.0)
|
|
172
|
+
sexp_processor (4.17.5)
|
|
173
|
+
simplecov (0.22.0)
|
|
174
|
+
docile (~> 1.1)
|
|
175
|
+
simplecov-html (~> 0.11)
|
|
176
|
+
simplecov_json_formatter (~> 0.1)
|
|
177
|
+
simplecov-html (0.13.2)
|
|
178
|
+
simplecov_json_formatter (0.1.4)
|
|
179
|
+
thread_safe (0.3.6)
|
|
180
|
+
tty-which (0.5.0)
|
|
110
181
|
unicode-display_width (3.2.0)
|
|
111
182
|
unicode-emoji (~> 4.1)
|
|
112
183
|
unicode-emoji (4.2.0)
|
|
113
184
|
uri (1.1.1)
|
|
185
|
+
virtus (2.0.0)
|
|
186
|
+
axiom-types (~> 0.1)
|
|
187
|
+
coercible (~> 1.0)
|
|
188
|
+
descendants_tracker (~> 0.0, >= 0.0.3)
|
|
114
189
|
zeitwerk (2.7.5)
|
|
115
190
|
|
|
116
191
|
PLATFORMS
|
|
@@ -122,26 +197,41 @@ DEPENDENCIES
|
|
|
122
197
|
rspec (~> 3.13)
|
|
123
198
|
rubocop (~> 1.75)
|
|
124
199
|
ruby_llm-contract!
|
|
200
|
+
rubycritic (~> 4.9)
|
|
201
|
+
simplecov (~> 0.22)
|
|
125
202
|
|
|
126
203
|
CHECKSUMS
|
|
127
204
|
addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
|
|
128
205
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
206
|
+
axiom-types (0.1.1) sha256=c1ff113f3de516fa195b2db7e0a9a95fd1b08475a502ff660d04507a09980383
|
|
129
207
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
130
208
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
209
|
+
childprocess (5.1.0) sha256=9a8d484be2fd4096a0e90a0cd3e449a05bc3aa33f8ac9e4d6dcef6ac1455b6ec
|
|
210
|
+
coercible (1.0.0) sha256=5081ad24352cc8435ce5472bc2faa30260c7ea7f2102cc6a9f167c4d9bffaadc
|
|
131
211
|
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
212
|
+
descendants_tracker (0.0.4) sha256=e9c41dd4cfbb85829a9301ea7e7c48c2a03b26f09319db230e6479ccdc780897
|
|
132
213
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
214
|
+
docile (1.4.1) sha256=96159be799bfa73cdb721b840e9802126e4e03dfc26863db73647204c727f21e
|
|
215
|
+
dry-configurable (1.3.0) sha256=882d862858567fc1210d2549d4c090f34370fc1bb7c5c1933de3fe792e18afa8
|
|
133
216
|
dry-core (1.2.0) sha256=0cc5a7da88df397f153947eeeae42e876e999c1e30900f3c536fb173854e96a1
|
|
134
217
|
dry-inflector (1.3.1) sha256=7fb0c2bb04f67638f25c52e7ba39ab435d922a3a5c3cd196120f63accb682dcc
|
|
218
|
+
dry-initializer (3.2.0) sha256=37d59798f912dc0a1efe14a4db4a9306989007b302dcd5f25d0a2a20c166c4e3
|
|
135
219
|
dry-logic (1.6.0) sha256=da6fedbc0f90fc41f9b0cc7e6f05f5d529d1efaef6c8dcc8e0733f685745cea2
|
|
220
|
+
dry-schema (1.16.0) sha256=cd3aaeabc0f1af66ec82a29096d4c4fb92a0a58b9dae29a22b1bbceb78985727
|
|
136
221
|
dry-types (1.9.1) sha256=baebeecdb9f8395d6c9d227b62011279440943e3ef2468fe8ccc1ba11467f178
|
|
222
|
+
erubi (1.13.1) sha256=a082103b0885dbc5ecf1172fede897f9ebdb745a4b97a5e8dc63953db1ee4ad9
|
|
137
223
|
event_stream_parser (1.0.0) sha256=a2683bab70126286f8184dc88f7968ffc4028f813161fb073ec90d171f7de3c8
|
|
138
224
|
faraday (2.14.1) sha256=a43cceedc1e39d188f4d2cdd360a8aaa6a11da0c407052e426ba8d3fb42ef61c
|
|
139
225
|
faraday-multipart (1.2.0) sha256=7d89a949693714176f612323ca13746a2ded204031a6ba528adee788694ef757
|
|
140
226
|
faraday-net_http (3.4.2) sha256=f147758260d3526939bf57ecf911682f94926a3666502e24c69992765875906c
|
|
141
227
|
faraday-retry (2.4.0) sha256=7b79c48fb7e56526faf247b12d94a680071ff40c9fda7cf1ec1549439ad11ebe
|
|
228
|
+
flay (2.14.3) sha256=7f96a495f4bde880460e77e7345464e752bd44f09f5cd30c80af02afe0ed3f29
|
|
229
|
+
flog (4.9.4) sha256=12cc054fab7a2cbd2a906514397c4d7788954d530564782d6f14939dc2dfbcbb
|
|
230
|
+
ice_nine (0.11.2) sha256=5d506a7d2723d5592dc121b9928e4931742730131f22a1a37649df1c1e2e63db
|
|
142
231
|
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
143
232
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
144
233
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
234
|
+
launchy (3.1.1) sha256=72b847b5cc961589dde2c395af0108c86ff0119f42d4648d25b5440ebb10059e
|
|
145
235
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
146
236
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
147
237
|
marcel (1.1.0) sha256=fdcfcfa33cc52e93c4308d40e4090a5d4ea279e160a7f6af988260fa970e0bee
|
|
@@ -150,12 +240,15 @@ CHECKSUMS
|
|
|
150
240
|
net-http (0.9.1) sha256=25ba0b67c63e89df626ed8fac771d0ad24ad151a858af2cc8e6a716ca4336996
|
|
151
241
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
152
242
|
parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
|
|
243
|
+
path_expander (2.0.1) sha256=2de201164bff4719cc4d0b3767286e9977cc832a59c4d70abab571ec86cb41e4
|
|
153
244
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
154
245
|
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
|
155
246
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
156
247
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
157
248
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
249
|
+
reek (6.5.0) sha256=d26d3a492773b2bbc228888067a21afe33ac07954a17dbd64cdeae42c4c69be1
|
|
158
250
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
251
|
+
rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142
|
|
159
252
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
160
253
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
161
254
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
@@ -165,11 +258,20 @@ CHECKSUMS
|
|
|
165
258
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
166
259
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
167
260
|
ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
|
|
168
|
-
ruby_llm-contract (0.4.5)
|
|
261
|
+
ruby_llm-contract (0.5.0)
|
|
169
262
|
ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
|
|
263
|
+
ruby_parser (3.22.0) sha256=1eb4937cd9eb220aa2d194e352a24dba90aef00751e24c8dfffdb14000f15d23
|
|
264
|
+
rubycritic (4.12.0) sha256=024fed90fe656fa939f6ea80aab17569699ac3863d0b52fd72cb99892247abc8
|
|
265
|
+
sexp_processor (4.17.5) sha256=ae2b48ba98353d5d465ce8759836b7a05f2e12c5879fcd14d7815b026de32f0e
|
|
266
|
+
simplecov (0.22.0) sha256=fe2622c7834ff23b98066bb0a854284b2729a569ac659f82621fc22ef36213a5
|
|
267
|
+
simplecov-html (0.13.2) sha256=bd0b8e54e7c2d7685927e8d6286466359b6f16b18cb0df47b508e8d73c777246
|
|
268
|
+
simplecov_json_formatter (0.1.4) sha256=529418fbe8de1713ac2b2d612aa3daa56d316975d307244399fa4838c601b428
|
|
269
|
+
thread_safe (0.3.6) sha256=9ed7072821b51c57e8d6b7011a8e282e25aeea3a4065eab326e43f66f063b05a
|
|
270
|
+
tty-which (0.5.0) sha256=5824055f0d6744c97e7c4426544f01d519c40d1806ef2ef47d9854477993f466
|
|
170
271
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
171
272
|
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
|
172
273
|
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
|
|
274
|
+
virtus (2.0.0) sha256=8841dae4eb7fcc097320ba5ea516bf1839e5d056c61ee27138aa4bddd6e3d1c2
|
|
173
275
|
zeitwerk (2.7.5) sha256=d8da92128c09ea6ec62c949011b00ed4a20242b255293dd66bf41545398f73dd
|
|
174
276
|
|
|
175
277
|
BUNDLED WITH
|
data/README.md
CHANGED
|
@@ -157,6 +157,44 @@ report = ClassifyTicket.run_eval("regression",
|
|
|
157
157
|
concurrency: 4)
|
|
158
158
|
```
|
|
159
159
|
|
|
160
|
+
## Prompt A/B testing
|
|
161
|
+
|
|
162
|
+
Changed a prompt? Compare old vs new with regression safety:
|
|
163
|
+
|
|
164
|
+
```ruby
|
|
165
|
+
diff = ClassifyTicketV2.compare_with(ClassifyTicketV1,
|
|
166
|
+
eval: "regression", model: "gpt-4.1-mini")
|
|
167
|
+
|
|
168
|
+
diff.safe_to_switch? # => true (no regressions, no per-case score drops)
|
|
169
|
+
diff.improvements # => [{case: "outage", ...}]
|
|
170
|
+
diff.score_delta # => +0.33
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Requires `model:` or `context: { adapter: ... }`.
|
|
174
|
+
`compare_with` ignores `sample_response`; without a real model/adapter both sides are skipped and the A/B result is not meaningful.
|
|
175
|
+
|
|
176
|
+
CI gate:
|
|
177
|
+
```ruby
|
|
178
|
+
expect(ClassifyTicketV2).to pass_eval("regression")
|
|
179
|
+
.compared_with(ClassifyTicketV1)
|
|
180
|
+
.with_minimum_score(0.8)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Soft observations
|
|
184
|
+
|
|
185
|
+
Log suspicious-but-not-invalid output without failing the contract:
|
|
186
|
+
|
|
187
|
+
```ruby
|
|
188
|
+
class EvaluateComparative < RubyLLM::Contract::Step::Base
|
|
189
|
+
validate("scores in range") { |o| (1..10).include?(o[:score_a]) }
|
|
190
|
+
observe("scores should differ") { |o| o[:score_a] != o[:score_b] }
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
result = EvaluateComparative.run(input)
|
|
194
|
+
result.ok? # => true (observe never fails)
|
|
195
|
+
result.observations # => [{description: "scores should differ", passed: false}]
|
|
196
|
+
```
|
|
197
|
+
|
|
160
198
|
## Predict cost before running
|
|
161
199
|
|
|
162
200
|
```ruby
|
|
@@ -190,11 +228,13 @@ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc).
|
|
|
190
228
|
|
|
191
229
|
## Roadmap
|
|
192
230
|
|
|
193
|
-
**v0.
|
|
231
|
+
**v0.5 (current):** Data-driven prompt engineering — `compare_with(OtherStep)` for prompt A/B testing with regression safety. `observe` DSL for soft observations that log but never fail.
|
|
232
|
+
|
|
233
|
+
**v0.4:** Observability & scale — eval history with trending, batch eval with concurrency, pipeline per-step eval, Minitest support, structured logging. Audit hardening (18 bugfixes).
|
|
194
234
|
|
|
195
235
|
**v0.3:** Baseline regression detection, migration guide, production hardening.
|
|
196
236
|
|
|
197
|
-
**v0.
|
|
237
|
+
**v0.6:** Model recommendation based on eval history data. Cross-provider comparison docs.
|
|
198
238
|
|
|
199
239
|
## License
|
|
200
240
|
|
|
@@ -9,22 +9,23 @@ module RubyLLM
|
|
|
9
9
|
private
|
|
10
10
|
|
|
11
11
|
def safe_context(context)
|
|
12
|
-
(context || {}).transform_keys { |
|
|
12
|
+
(context || {}).transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
def isolate_context(context)
|
|
16
|
-
context.transform_values do |
|
|
17
|
-
|
|
18
|
-
v.clone_for_concurrency
|
|
19
|
-
elsif v.respond_to?(:dup)
|
|
20
|
-
v.dup
|
|
21
|
-
else
|
|
22
|
-
v
|
|
23
|
-
end
|
|
16
|
+
context.transform_values do |value|
|
|
17
|
+
duplicate_context_value(value)
|
|
24
18
|
rescue TypeError
|
|
25
|
-
|
|
19
|
+
value
|
|
26
20
|
end
|
|
27
21
|
end
|
|
22
|
+
|
|
23
|
+
def duplicate_context_value(value)
|
|
24
|
+
return value.clone_for_concurrency if value.respond_to?(:clone_for_concurrency)
|
|
25
|
+
return value.dup if value.respond_to?(:dup)
|
|
26
|
+
|
|
27
|
+
value
|
|
28
|
+
end
|
|
28
29
|
end
|
|
29
30
|
end
|
|
30
31
|
end
|
|
@@ -8,15 +8,21 @@ module RubyLLM
|
|
|
8
8
|
module DeepFreeze
|
|
9
9
|
private
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
when
|
|
16
|
-
when
|
|
17
|
-
|
|
11
|
+
IMMUTABLE_TYPES = [NilClass, Integer, Float, Symbol, TrueClass, FalseClass].freeze
|
|
12
|
+
|
|
13
|
+
def deep_dup_freeze(object)
|
|
14
|
+
case object
|
|
15
|
+
when *IMMUTABLE_TYPES then object
|
|
16
|
+
when Hash then object.transform_values { |value| deep_dup_freeze(value) }.freeze
|
|
17
|
+
when Array then object.map { |value| deep_dup_freeze(value) }.freeze
|
|
18
|
+
else
|
|
19
|
+
frozen_copy(object)
|
|
18
20
|
end
|
|
19
21
|
end
|
|
22
|
+
|
|
23
|
+
def frozen_copy(object)
|
|
24
|
+
object.frozen? ? object : object.dup.freeze
|
|
25
|
+
end
|
|
20
26
|
end
|
|
21
27
|
end
|
|
22
28
|
end
|
|
@@ -3,12 +3,22 @@
|
|
|
3
3
|
module RubyLLM
|
|
4
4
|
module Contract
|
|
5
5
|
module Concerns
|
|
6
|
+
# Recursively converts Hash keys to symbols while preserving array shape.
|
|
6
7
|
module DeepSymbolize
|
|
7
|
-
def deep_symbolize(
|
|
8
|
-
case
|
|
9
|
-
when Hash then
|
|
10
|
-
when Array then
|
|
11
|
-
else
|
|
8
|
+
def deep_symbolize(object)
|
|
9
|
+
case object
|
|
10
|
+
when Hash then symbolize_hash(object)
|
|
11
|
+
when Array then object.map { |value| deep_symbolize(value) }
|
|
12
|
+
else
|
|
13
|
+
object
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
private
|
|
18
|
+
|
|
19
|
+
def symbolize_hash(hash)
|
|
20
|
+
hash.each_with_object({}) do |(key, value), symbolized|
|
|
21
|
+
symbolized[key.to_sym] = deep_symbolize(value)
|
|
12
22
|
end
|
|
13
23
|
end
|
|
14
24
|
end
|
|
@@ -5,6 +5,11 @@ module RubyLLM
|
|
|
5
5
|
module Concerns
|
|
6
6
|
module EvalHost
|
|
7
7
|
include ContextHelpers
|
|
8
|
+
|
|
9
|
+
SAMPLE_RESPONSE_COMPARE_WARNING = "[ruby_llm-contract] compare_with ignores sample_response. " \
|
|
10
|
+
"Without model: or context: { adapter: ... }, both sides will be skipped " \
|
|
11
|
+
"and the A/B comparison is not meaningful.".freeze
|
|
12
|
+
|
|
8
13
|
def define_eval(name, &)
|
|
9
14
|
@eval_definitions ||= {}
|
|
10
15
|
@file_sourced_evals ||= Set.new
|
|
@@ -45,6 +50,26 @@ module RubyLLM
|
|
|
45
50
|
end
|
|
46
51
|
end
|
|
47
52
|
|
|
53
|
+
# Compare this step (candidate) with another step (baseline) using the
|
|
54
|
+
# baseline's eval definition as single source of truth.
|
|
55
|
+
#
|
|
56
|
+
# Requires a real adapter or model in context. sample_response is
|
|
57
|
+
# intentionally NOT used, because A/B testing with canned data
|
|
58
|
+
# gives identical results for both sides rather than a real comparison.
|
|
59
|
+
def compare_with(other_step, eval:, model: nil, context: {})
|
|
60
|
+
ctx = comparison_context(context, model)
|
|
61
|
+
baseline_defn = baseline_eval_definition(other_step, eval)
|
|
62
|
+
raise ArgumentError, "No eval '#{eval}' on baseline step #{other_step}" unless baseline_defn
|
|
63
|
+
|
|
64
|
+
dataset = baseline_defn.build_dataset
|
|
65
|
+
warn_sample_response_compare(ctx, baseline_defn)
|
|
66
|
+
|
|
67
|
+
my_report = Eval::Runner.run(step: self, dataset: dataset, context: isolate_context(ctx))
|
|
68
|
+
other_report = Eval::Runner.run(step: other_step, dataset: dataset, context: isolate_context(ctx))
|
|
69
|
+
|
|
70
|
+
Eval::PromptDiff.new(candidate: my_report, baseline: other_report)
|
|
71
|
+
end
|
|
72
|
+
|
|
48
73
|
def compare_models(eval_name, models:, context: {})
|
|
49
74
|
context = safe_context(context)
|
|
50
75
|
models = models.uniq
|
|
@@ -57,6 +82,21 @@ module RubyLLM
|
|
|
57
82
|
|
|
58
83
|
private
|
|
59
84
|
|
|
85
|
+
def comparison_context(context, model)
|
|
86
|
+
base_context = safe_context(context)
|
|
87
|
+
model ? base_context.merge(model: model) : base_context
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def baseline_eval_definition(other_step, eval_name)
|
|
91
|
+
other_step.send(:all_eval_definitions)[eval_name.to_s]
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def warn_sample_response_compare(context, baseline_defn)
|
|
95
|
+
return if context[:adapter] || context[:model] || !baseline_defn.build_adapter
|
|
96
|
+
|
|
97
|
+
warn SAMPLE_RESPONSE_COMPARE_WARNING
|
|
98
|
+
end
|
|
99
|
+
|
|
60
100
|
def all_eval_definitions
|
|
61
101
|
inherited = if superclass.respond_to?(:all_eval_definitions, true)
|
|
62
102
|
superclass.send(:all_eval_definitions)
|
|
@@ -71,20 +111,24 @@ module RubyLLM
|
|
|
71
111
|
defn = all_eval_definitions[name.to_s]
|
|
72
112
|
raise ArgumentError, "No eval '#{name}' defined. Available: #{all_eval_definitions.keys}" unless defn
|
|
73
113
|
|
|
74
|
-
|
|
75
|
-
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
|
|
76
|
-
concurrency: concurrency)
|
|
114
|
+
run_eval_definition(defn, context, concurrency: concurrency)
|
|
77
115
|
end
|
|
78
116
|
|
|
79
117
|
def run_all_own_evals(context, concurrency: nil)
|
|
80
118
|
all_eval_definitions.transform_values do |defn|
|
|
81
|
-
|
|
82
|
-
effective_context = eval_context(defn, isolated_context)
|
|
83
|
-
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
|
|
84
|
-
concurrency: concurrency)
|
|
119
|
+
run_eval_definition(defn, isolate_context(context), concurrency: concurrency)
|
|
85
120
|
end
|
|
86
121
|
end
|
|
87
122
|
|
|
123
|
+
def run_eval_definition(defn, context, concurrency: nil)
|
|
124
|
+
Eval::Runner.run(
|
|
125
|
+
step: self,
|
|
126
|
+
dataset: defn.build_dataset,
|
|
127
|
+
context: eval_context(defn, context),
|
|
128
|
+
concurrency: concurrency
|
|
129
|
+
)
|
|
130
|
+
end
|
|
131
|
+
|
|
88
132
|
def eval_context(defn, context)
|
|
89
133
|
context = safe_context(context)
|
|
90
134
|
return context if context[:adapter]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
class SchemaValidator
|
|
6
|
+
# Validates numeric and collection size bounds for a node.
|
|
7
|
+
class BoundRule
|
|
8
|
+
NUMERIC_LIMITS = [
|
|
9
|
+
{ key: :minimum, label: "minimum", relation: "below", invalid: ->(actual, limit) { actual < limit } },
|
|
10
|
+
{ key: :maximum, label: "maximum", relation: "above", invalid: ->(actual, limit) { actual > limit } }
|
|
11
|
+
].freeze
|
|
12
|
+
SIZE_LIMITS = [
|
|
13
|
+
{ bound: :min_key, relation: "below", invalid: ->(actual, limit) { actual < limit } },
|
|
14
|
+
{ bound: :max_key, relation: "above", invalid: ->(actual, limit) { actual > limit } }
|
|
15
|
+
].freeze
|
|
16
|
+
|
|
17
|
+
def initialize(errors)
|
|
18
|
+
@errors = errors
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def validate(node)
|
|
22
|
+
value = node.value
|
|
23
|
+
schema = node.schema
|
|
24
|
+
path = node.path
|
|
25
|
+
|
|
26
|
+
append_numeric_bound_errors(path, value, schema) if value.is_a?(Numeric)
|
|
27
|
+
append_size_bound_errors(path, value, schema)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def append_numeric_bound_errors(path, value, schema)
|
|
33
|
+
NUMERIC_LIMITS.each do |limit_config|
|
|
34
|
+
append_bound_error(
|
|
35
|
+
path: path,
|
|
36
|
+
actual: value,
|
|
37
|
+
limit: schema[limit_config[:key]],
|
|
38
|
+
label: limit_config[:label],
|
|
39
|
+
relation: limit_config[:relation],
|
|
40
|
+
invalid: limit_config[:invalid]
|
|
41
|
+
)
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def append_size_bound_errors(path, value, schema)
|
|
46
|
+
bounds = size_bounds_for(value)
|
|
47
|
+
return unless bounds
|
|
48
|
+
|
|
49
|
+
actual_size = value.length
|
|
50
|
+
metric = bounds[:metric]
|
|
51
|
+
|
|
52
|
+
SIZE_LIMITS.each do |limit_config|
|
|
53
|
+
label = bounds[limit_config[:bound]]
|
|
54
|
+
append_bound_error(
|
|
55
|
+
path: path,
|
|
56
|
+
actual: actual_size,
|
|
57
|
+
limit: schema[label],
|
|
58
|
+
label: label,
|
|
59
|
+
relation: limit_config[:relation],
|
|
60
|
+
metric: metric,
|
|
61
|
+
invalid: limit_config[:invalid]
|
|
62
|
+
)
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def append_bound_error(path:, actual:, limit:, label:, relation:, invalid:, metric: nil)
|
|
67
|
+
return unless limit
|
|
68
|
+
return unless invalid.call(actual, limit)
|
|
69
|
+
|
|
70
|
+
subject = metric ? "#{metric} #{actual}" : actual
|
|
71
|
+
@errors << "#{path}: #{subject} is #{relation} #{label} #{limit}"
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def size_bounds_for(value)
|
|
75
|
+
case value
|
|
76
|
+
when String
|
|
77
|
+
SchemaValidator::SIZE_BOUNDS[:string]
|
|
78
|
+
when Array
|
|
79
|
+
SchemaValidator::SIZE_BOUNDS[:array]
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
class SchemaValidator
|
|
6
|
+
# Validates enum membership for a node when enum values are declared.
|
|
7
|
+
class EnumRule
|
|
8
|
+
def initialize(errors)
|
|
9
|
+
@errors = errors
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def validate(node)
|
|
13
|
+
allowed_values = node.schema[:enum]
|
|
14
|
+
value = node.value
|
|
15
|
+
return unless allowed_values
|
|
16
|
+
return if allowed_values.include?(value)
|
|
17
|
+
|
|
18
|
+
@errors << "#{node.path}: #{value.inspect} is not in enum #{allowed_values.inspect}"
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|