ruby_llm-contract 0.4.5 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +4 -4
  2. data/.rubycritic.yml +8 -0
  3. data/.simplecov +22 -0
  4. data/CHANGELOG.md +19 -0
  5. data/Gemfile +2 -0
  6. data/Gemfile.lock +104 -2
  7. data/README.md +42 -2
  8. data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
  9. data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
  10. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
  11. data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
  12. data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
  13. data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
  14. data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
  15. data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
  16. data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
  17. data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
  18. data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
  19. data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
  20. data/lib/ruby_llm/contract/contract/validator.rb +9 -0
  21. data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
  22. data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
  23. data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
  24. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
  25. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
  26. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
  27. data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
  28. data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
  29. data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
  30. data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
  31. data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
  32. data/lib/ruby_llm/contract/eval/report.rb +19 -191
  33. data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
  34. data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
  35. data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
  36. data/lib/ruby_llm/contract/eval/runner.rb +30 -207
  37. data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
  38. data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
  39. data/lib/ruby_llm/contract/eval.rb +13 -0
  40. data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
  41. data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
  42. data/lib/ruby_llm/contract/rspec.rb +5 -0
  43. data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
  44. data/lib/ruby_llm/contract/step/base.rb +93 -38
  45. data/lib/ruby_llm/contract/step/dsl.rb +10 -0
  46. data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
  47. data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
  48. data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
  49. data/lib/ruby_llm/contract/step/result.rb +3 -2
  50. data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
  51. data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
  52. data/lib/ruby_llm/contract/step/runner.rb +46 -85
  53. data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
  54. data/lib/ruby_llm/contract/step.rb +5 -0
  55. data/lib/ruby_llm/contract/version.rb +1 -1
  56. metadata +28 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 502a22f4a2c8f88416bac904fb2ca370f25ba70076b3257700ae296705320314
4
- data.tar.gz: '096dd32146b497b400984185185b9e2e81e6b5b53169896946a43545e368b25c'
3
+ metadata.gz: 2e6861a314beadad8064d5fe08b85dc0f94032987ba41c27fc9a640788f10c28
4
+ data.tar.gz: 7efb88142ef8ac8287ed58f6cc9bc93ca78130084e14f8310981e7f90faf9943
5
5
  SHA512:
6
- metadata.gz: 2111cd0c66eee5c1bec53ae4e5278aa9a79643304f3812bba65113ded58b7a42fa56b4d612461e1e5553e4cebd529417760bc07c919a52b1462498ca3ececbf3
7
- data.tar.gz: 61e8112e9ec2c577d675458d53ecbae303da8db31351803d6e0758b7b7f8b6566587147efa8b889d93a955b85217ebe7d1883d6c506f53d04490a50b6448cf2a
6
+ metadata.gz: 1bd259ab22d13b2e7cc9848c401b91ebfaa135dc7502b72074fd28e71e6120f7ea183c09361948bf8d68df9cd5190e9450a3f03954bbd2927f8091c987368bb7
7
+ data.tar.gz: 904185a06d1def6513268033cbe87b30e3af9b92e0a26ec3cf89c93ad88450c4d49aa13d79c2428b1f7f6cc1cbffefeccf146b5a5c7ddf0057ab3db2f1b7dc8d
data/.rubycritic.yml ADDED
@@ -0,0 +1,8 @@
1
+ paths:
2
+ - lib
3
+
4
+ formats:
5
+ - console
6
+
7
+ minimum_score: 80
8
+ no_browser: true
data/.simplecov ADDED
@@ -0,0 +1,22 @@
1
# frozen_string_literal: true

require "simplecov"

# Coverage configuration for the RSpec suite.
#
# Branch coverage is enabled and used as the primary criterion. Spec, example,
# internal and tmp paths are filtered out of the report, and every file under
# lib/ is tracked.
SimpleCov.start do
  enable_coverage :branch
  primary_coverage :branch

  add_filter "/spec/"
  add_filter "/examples/"
  add_filter "/internal/"
  add_filter "/tmp/"

  track_files "lib/**/*.rb"

  # Thresholds are enforced only on CI or when SIMPLECOV_STRICT=1 is set.
  if ENV["CI"] == "true" || ENV["SIMPLECOV_STRICT"] == "1"
    # Both criteria must be passed in ONE call: minimum_coverage replaces the
    # stored thresholds on every call, so two separate calls would keep only
    # the last one (branch) and silently drop the line threshold.
    minimum_coverage line: 89, branch: 75
  end

  command_name "RSpec"
end
data/CHANGELOG.md CHANGED
@@ -1,5 +1,24 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.5.0 (2026-03-25)
4
+
5
+ Data-Driven Prompt Engineering — see ADR-0015.
6
+
7
+ ### Features
8
+
9
+ - **`observe` DSL** — soft observations that log but never fail. `observe("scores differ") { |o| o[:a] != o[:b] }`. Results in `result.observations`. Logged via `Contract.logger` when they fail. Runs only when validation passes.
10
+ - **`compare_with`** — prompt A/B testing. `StepV2.compare_with(StepV1, eval: "regression", model: "nano")` returns `PromptDiff` with `improvements`, `regressions`, `score_delta`, `safe_to_switch?`. Reuses `BaselineDiff` internally.
11
+ - **RSpec `compared_with` chain** — `expect(StepV2).to pass_eval("x").compared_with(StepV1).without_regressions` blocks merge if new prompt regresses any case.
12
+
13
+ ### Game changer continuity
14
+
15
+ ```
16
+ v0.2: "Which model?" → compare_models (snapshot)
17
+ v0.3: "Did it change?" → baseline regression (binary)
18
+ v0.4: "Show me the trend" → eval history (time series)
19
+ v0.5: "Which prompt is better?" → compare_with (A/B testing)
20
+ ```
21
+
3
22
  ## 0.4.5 (2026-03-24)
4
23
 
5
24
  Audit hardening — 18 bugs fixed across 4 audit rounds.
data/Gemfile CHANGED
@@ -8,4 +8,6 @@ group :development, :test do
8
8
  gem "rake", "~> 13.0"
9
9
  gem "rspec", "~> 3.13"
10
10
  gem "rubocop", "~> 1.75"
11
+ gem "rubycritic", "~> 4.9"
12
+ gem "simplecov", "~> 0.22"
11
13
  end
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ruby_llm-contract (0.4.5)
4
+ ruby_llm-contract (0.5.0)
5
5
  dry-types (~> 1.7)
6
6
  ruby_llm (~> 1.0)
7
7
  ruby_llm-schema (~> 0.3)
@@ -12,20 +12,43 @@ GEM
12
12
  addressable (2.8.9)
13
13
  public_suffix (>= 2.0.2, < 8.0)
14
14
  ast (2.4.3)
15
+ axiom-types (0.1.1)
16
+ descendants_tracker (~> 0.0.4)
17
+ ice_nine (~> 0.11.0)
18
+ thread_safe (~> 0.3, >= 0.3.1)
15
19
  base64 (0.3.0)
16
20
  bigdecimal (4.0.1)
21
+ childprocess (5.1.0)
22
+ logger (~> 1.5)
23
+ coercible (1.0.0)
24
+ descendants_tracker (~> 0.0.1)
17
25
  concurrent-ruby (1.3.6)
26
+ descendants_tracker (0.0.4)
27
+ thread_safe (~> 0.3, >= 0.3.1)
18
28
  diff-lcs (1.6.2)
29
+ docile (1.4.1)
30
+ dry-configurable (1.3.0)
31
+ dry-core (~> 1.1)
32
+ zeitwerk (~> 2.6)
19
33
  dry-core (1.2.0)
20
34
  concurrent-ruby (~> 1.0)
21
35
  logger
22
36
  zeitwerk (~> 2.6)
23
37
  dry-inflector (1.3.1)
38
+ dry-initializer (3.2.0)
24
39
  dry-logic (1.6.0)
25
40
  bigdecimal
26
41
  concurrent-ruby (~> 1.0)
27
42
  dry-core (~> 1.1)
28
43
  zeitwerk (~> 2.6)
44
+ dry-schema (1.16.0)
45
+ concurrent-ruby (~> 1.0)
46
+ dry-configurable (~> 1.0, >= 1.0.1)
47
+ dry-core (~> 1.1)
48
+ dry-initializer (~> 3.2)
49
+ dry-logic (~> 1.6)
50
+ dry-types (~> 1.9, >= 1.9.1)
51
+ zeitwerk (~> 2.6)
29
52
  dry-types (1.9.1)
30
53
  bigdecimal (>= 3.0)
31
54
  concurrent-ruby (~> 1.0)
@@ -33,6 +56,7 @@ GEM
33
56
  dry-inflector (~> 1.0)
34
57
  dry-logic (~> 1.4)
35
58
  zeitwerk (~> 2.6)
59
+ erubi (1.13.1)
36
60
  event_stream_parser (1.0.0)
37
61
  faraday (2.14.1)
38
62
  faraday-net_http (>= 2.0, < 3.5)
@@ -44,11 +68,25 @@ GEM
44
68
  net-http (~> 0.5)
45
69
  faraday-retry (2.4.0)
46
70
  faraday (~> 2.0)
71
+ flay (2.14.3)
72
+ erubi (~> 1.10)
73
+ path_expander (~> 2.0)
74
+ prism (~> 1.7)
75
+ sexp_processor (~> 4.0)
76
+ flog (4.9.4)
77
+ path_expander (~> 2.0)
78
+ prism (~> 1.7)
79
+ sexp_processor (~> 4.8)
80
+ ice_nine (0.11.2)
47
81
  json (2.19.2)
48
82
  json-schema (6.2.0)
49
83
  addressable (~> 2.8)
50
84
  bigdecimal (>= 3.1, < 5)
51
85
  language_server-protocol (3.17.0.5)
86
+ launchy (3.1.1)
87
+ addressable (~> 2.8)
88
+ childprocess (~> 5.0)
89
+ logger (~> 1.6)
52
90
  lint_roller (1.1.0)
53
91
  logger (1.7.0)
54
92
  marcel (1.1.0)
@@ -61,12 +99,20 @@ GEM
61
99
  parser (3.3.10.2)
62
100
  ast (~> 2.4.1)
63
101
  racc
102
+ path_expander (2.0.1)
64
103
  prism (1.9.0)
65
104
  public_suffix (7.0.5)
66
105
  racc (1.8.1)
67
106
  rainbow (3.1.1)
68
107
  rake (13.3.1)
108
+ reek (6.5.0)
109
+ dry-schema (~> 1.13)
110
+ logger (~> 1.6)
111
+ parser (~> 3.3.0)
112
+ rainbow (>= 2.0, < 4.0)
113
+ rexml (~> 3.1)
69
114
  regexp_parser (2.11.3)
115
+ rexml (3.4.4)
70
116
  rspec (3.13.2)
71
117
  rspec-core (~> 3.13.0)
72
118
  rspec-expectations (~> 3.13.0)
@@ -107,10 +153,39 @@ GEM
107
153
  ruby_llm-schema (~> 0)
108
154
  zeitwerk (~> 2)
109
155
  ruby_llm-schema (0.3.0)
156
+ ruby_parser (3.22.0)
157
+ racc (~> 1.5)
158
+ sexp_processor (~> 4.16)
159
+ rubycritic (4.12.0)
160
+ flay (~> 2.13)
161
+ flog (~> 4.7)
162
+ launchy (>= 2.5.2)
163
+ parser (>= 3.3.0.5)
164
+ prism (>= 1.6.0)
165
+ rainbow (~> 3.1.1)
166
+ reek (~> 6.5.0, < 7.0)
167
+ rexml
168
+ ruby_parser (~> 3.21)
169
+ simplecov (>= 0.22.0)
170
+ tty-which (~> 0.5.0)
171
+ virtus (~> 2.0)
172
+ sexp_processor (4.17.5)
173
+ simplecov (0.22.0)
174
+ docile (~> 1.1)
175
+ simplecov-html (~> 0.11)
176
+ simplecov_json_formatter (~> 0.1)
177
+ simplecov-html (0.13.2)
178
+ simplecov_json_formatter (0.1.4)
179
+ thread_safe (0.3.6)
180
+ tty-which (0.5.0)
110
181
  unicode-display_width (3.2.0)
111
182
  unicode-emoji (~> 4.1)
112
183
  unicode-emoji (4.2.0)
113
184
  uri (1.1.1)
185
+ virtus (2.0.0)
186
+ axiom-types (~> 0.1)
187
+ coercible (~> 1.0)
188
+ descendants_tracker (~> 0.0, >= 0.0.3)
114
189
  zeitwerk (2.7.5)
115
190
 
116
191
  PLATFORMS
@@ -122,26 +197,41 @@ DEPENDENCIES
122
197
  rspec (~> 3.13)
123
198
  rubocop (~> 1.75)
124
199
  ruby_llm-contract!
200
+ rubycritic (~> 4.9)
201
+ simplecov (~> 0.22)
125
202
 
126
203
  CHECKSUMS
127
204
  addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
128
205
  ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
206
+ axiom-types (0.1.1) sha256=c1ff113f3de516fa195b2db7e0a9a95fd1b08475a502ff660d04507a09980383
129
207
  base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
130
208
  bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
209
+ childprocess (5.1.0) sha256=9a8d484be2fd4096a0e90a0cd3e449a05bc3aa33f8ac9e4d6dcef6ac1455b6ec
210
+ coercible (1.0.0) sha256=5081ad24352cc8435ce5472bc2faa30260c7ea7f2102cc6a9f167c4d9bffaadc
131
211
  concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
212
+ descendants_tracker (0.0.4) sha256=e9c41dd4cfbb85829a9301ea7e7c48c2a03b26f09319db230e6479ccdc780897
132
213
  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
214
+ docile (1.4.1) sha256=96159be799bfa73cdb721b840e9802126e4e03dfc26863db73647204c727f21e
215
+ dry-configurable (1.3.0) sha256=882d862858567fc1210d2549d4c090f34370fc1bb7c5c1933de3fe792e18afa8
133
216
  dry-core (1.2.0) sha256=0cc5a7da88df397f153947eeeae42e876e999c1e30900f3c536fb173854e96a1
134
217
  dry-inflector (1.3.1) sha256=7fb0c2bb04f67638f25c52e7ba39ab435d922a3a5c3cd196120f63accb682dcc
218
+ dry-initializer (3.2.0) sha256=37d59798f912dc0a1efe14a4db4a9306989007b302dcd5f25d0a2a20c166c4e3
135
219
  dry-logic (1.6.0) sha256=da6fedbc0f90fc41f9b0cc7e6f05f5d529d1efaef6c8dcc8e0733f685745cea2
220
+ dry-schema (1.16.0) sha256=cd3aaeabc0f1af66ec82a29096d4c4fb92a0a58b9dae29a22b1bbceb78985727
136
221
  dry-types (1.9.1) sha256=baebeecdb9f8395d6c9d227b62011279440943e3ef2468fe8ccc1ba11467f178
222
+ erubi (1.13.1) sha256=a082103b0885dbc5ecf1172fede897f9ebdb745a4b97a5e8dc63953db1ee4ad9
137
223
  event_stream_parser (1.0.0) sha256=a2683bab70126286f8184dc88f7968ffc4028f813161fb073ec90d171f7de3c8
138
224
  faraday (2.14.1) sha256=a43cceedc1e39d188f4d2cdd360a8aaa6a11da0c407052e426ba8d3fb42ef61c
139
225
  faraday-multipart (1.2.0) sha256=7d89a949693714176f612323ca13746a2ded204031a6ba528adee788694ef757
140
226
  faraday-net_http (3.4.2) sha256=f147758260d3526939bf57ecf911682f94926a3666502e24c69992765875906c
141
227
  faraday-retry (2.4.0) sha256=7b79c48fb7e56526faf247b12d94a680071ff40c9fda7cf1ec1549439ad11ebe
228
+ flay (2.14.3) sha256=7f96a495f4bde880460e77e7345464e752bd44f09f5cd30c80af02afe0ed3f29
229
+ flog (4.9.4) sha256=12cc054fab7a2cbd2a906514397c4d7788954d530564782d6f14939dc2dfbcbb
230
+ ice_nine (0.11.2) sha256=5d506a7d2723d5592dc121b9928e4931742730131f22a1a37649df1c1e2e63db
142
231
  json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
143
232
  json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
144
233
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
234
+ launchy (3.1.1) sha256=72b847b5cc961589dde2c395af0108c86ff0119f42d4648d25b5440ebb10059e
145
235
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
146
236
  logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
147
237
  marcel (1.1.0) sha256=fdcfcfa33cc52e93c4308d40e4090a5d4ea279e160a7f6af988260fa970e0bee
@@ -150,12 +240,15 @@ CHECKSUMS
150
240
  net-http (0.9.1) sha256=25ba0b67c63e89df626ed8fac771d0ad24ad151a858af2cc8e6a716ca4336996
151
241
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
152
242
  parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
243
+ path_expander (2.0.1) sha256=2de201164bff4719cc4d0b3767286e9977cc832a59c4d70abab571ec86cb41e4
153
244
  prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
154
245
  public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
155
246
  racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
156
247
  rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
157
248
  rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
249
+ reek (6.5.0) sha256=d26d3a492773b2bbc228888067a21afe33ac07954a17dbd64cdeae42c4c69be1
158
250
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
251
+ rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142
159
252
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
160
253
  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
161
254
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
@@ -165,11 +258,20 @@ CHECKSUMS
165
258
  rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
166
259
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
167
260
  ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
168
- ruby_llm-contract (0.4.5)
261
+ ruby_llm-contract (0.5.0)
169
262
  ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
263
+ ruby_parser (3.22.0) sha256=1eb4937cd9eb220aa2d194e352a24dba90aef00751e24c8dfffdb14000f15d23
264
+ rubycritic (4.12.0) sha256=024fed90fe656fa939f6ea80aab17569699ac3863d0b52fd72cb99892247abc8
265
+ sexp_processor (4.17.5) sha256=ae2b48ba98353d5d465ce8759836b7a05f2e12c5879fcd14d7815b026de32f0e
266
+ simplecov (0.22.0) sha256=fe2622c7834ff23b98066bb0a854284b2729a569ac659f82621fc22ef36213a5
267
+ simplecov-html (0.13.2) sha256=bd0b8e54e7c2d7685927e8d6286466359b6f16b18cb0df47b508e8d73c777246
268
+ simplecov_json_formatter (0.1.4) sha256=529418fbe8de1713ac2b2d612aa3daa56d316975d307244399fa4838c601b428
269
+ thread_safe (0.3.6) sha256=9ed7072821b51c57e8d6b7011a8e282e25aeea3a4065eab326e43f66f063b05a
270
+ tty-which (0.5.0) sha256=5824055f0d6744c97e7c4426544f01d519c40d1806ef2ef47d9854477993f466
170
271
  unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
171
272
  unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
172
273
  uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
274
+ virtus (2.0.0) sha256=8841dae4eb7fcc097320ba5ea516bf1839e5d056c61ee27138aa4bddd6e3d1c2
173
275
  zeitwerk (2.7.5) sha256=d8da92128c09ea6ec62c949011b00ed4a20242b255293dd66bf41545398f73dd
174
276
 
175
277
  BUNDLED WITH
data/README.md CHANGED
@@ -157,6 +157,44 @@ report = ClassifyTicket.run_eval("regression",
157
157
  concurrency: 4)
158
158
  ```
159
159
 
160
+ ## Prompt A/B testing
161
+
162
+ Changed a prompt? Compare old vs new with regression safety:
163
+
164
+ ```ruby
165
+ diff = ClassifyTicketV2.compare_with(ClassifyTicketV1,
166
+ eval: "regression", model: "gpt-4.1-mini")
167
+
168
+ diff.safe_to_switch? # => true (no regressions, no per-case score drops)
169
+ diff.improvements # => [{case: "outage", ...}]
170
+ diff.score_delta # => +0.33
171
+ ```
172
+
173
+ Requires `model:` or `context: { adapter: ... }`.
174
+ `compare_with` ignores `sample_response`; without a real model/adapter both sides are skipped and the A/B result is not meaningful.
175
+
176
+ CI gate:
177
+ ```ruby
178
+ expect(ClassifyTicketV2).to pass_eval("regression")
179
+ .compared_with(ClassifyTicketV1)
180
+ .with_minimum_score(0.8)
181
+ ```
182
+
183
+ ## Soft observations
184
+
185
+ Log suspicious-but-not-invalid output without failing the contract:
186
+
187
+ ```ruby
188
+ class EvaluateComparative < RubyLLM::Contract::Step::Base
189
+ validate("scores in range") { |o| (1..10).include?(o[:score_a]) }
190
+ observe("scores should differ") { |o| o[:score_a] != o[:score_b] }
191
+ end
192
+
193
+ result = EvaluateComparative.run(input)
194
+ result.ok? # => true (observe never fails)
195
+ result.observations # => [{description: "scores should differ", passed: false}]
196
+ ```
197
+
160
198
  ## Predict cost before running
161
199
 
162
200
  ```ruby
@@ -190,11 +228,13 @@ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc).
190
228
 
191
229
  ## Roadmap
192
230
 
193
- **v0.4 (current):** Observability & scale — eval history with trending, batch eval with concurrency, pipeline per-step eval, Minitest support, structured logging.
231
+ **v0.5 (current):** Data-driven prompt engineering — `compare_with(OtherStep)` for prompt A/B testing with regression safety. `observe` DSL for soft observations that log but never fail.
232
+
233
+ **v0.4:** Observability & scale — eval history with trending, batch eval with concurrency, pipeline per-step eval, Minitest support, structured logging. Audit hardening (18 bugfixes).
194
234
 
195
235
  **v0.3:** Baseline regression detection, migration guide, production hardening.
196
236
 
197
- **v0.5:** Prompt A/B testing — `compare_with(OtherStep)` for data-driven prompt engineering with regression safety. Cross-provider comparison docs.
237
+ **v0.6:** Model recommendation based on eval history data. Cross-provider comparison docs.
198
238
 
199
239
  ## License
200
240
 
@@ -9,22 +9,23 @@ module RubyLLM
9
9
  private
10
10
 
11
11
  def safe_context(context)
12
- (context || {}).transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }
12
+ (context || {}).transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
13
13
  end
14
14
 
15
15
  def isolate_context(context)
16
- context.transform_values do |v|
17
- if v.respond_to?(:clone_for_concurrency)
18
- v.clone_for_concurrency
19
- elsif v.respond_to?(:dup)
20
- v.dup
21
- else
22
- v
23
- end
16
+ context.transform_values do |value|
17
+ duplicate_context_value(value)
24
18
  rescue TypeError
25
- v
19
+ value
26
20
  end
27
21
  end
22
+
23
+ def duplicate_context_value(value)
24
+ return value.clone_for_concurrency if value.respond_to?(:clone_for_concurrency)
25
+ return value.dup if value.respond_to?(:dup)
26
+
27
+ value
28
+ end
28
29
  end
29
30
  end
30
31
  end
@@ -8,15 +8,21 @@ module RubyLLM
8
8
  module DeepFreeze
9
9
  private
10
10
 
11
- def deep_dup_freeze(obj)
12
- case obj
13
- when NilClass, Integer, Float, Symbol, TrueClass, FalseClass then obj
14
- when Hash then obj.transform_values { |v| deep_dup_freeze(v) }.freeze
15
- when Array then obj.map { |v| deep_dup_freeze(v) }.freeze
16
- when String then obj.frozen? ? obj : obj.dup.freeze
17
- else obj.frozen? ? obj : obj.dup.freeze
11
+ IMMUTABLE_TYPES = [NilClass, Integer, Float, Symbol, TrueClass, FalseClass].freeze
12
+
13
+ def deep_dup_freeze(object)
14
+ case object
15
+ when *IMMUTABLE_TYPES then object
16
+ when Hash then object.transform_values { |value| deep_dup_freeze(value) }.freeze
17
+ when Array then object.map { |value| deep_dup_freeze(value) }.freeze
18
+ else
19
+ frozen_copy(object)
18
20
  end
19
21
  end
22
+
23
+ def frozen_copy(object)
24
+ object.frozen? ? object : object.dup.freeze
25
+ end
20
26
  end
21
27
  end
22
28
  end
@@ -3,12 +3,22 @@
3
3
  module RubyLLM
4
4
  module Contract
5
5
  module Concerns
6
+ # Recursively converts Hash keys to symbols while preserving array shape.
6
7
  module DeepSymbolize
7
- def deep_symbolize(obj)
8
- case obj
9
- when Hash then obj.transform_keys(&:to_sym).transform_values { |val| deep_symbolize(val) }
10
- when Array then obj.map { |val| deep_symbolize(val) }
11
- else obj
8
+ def deep_symbolize(object)
9
+ case object
10
+ when Hash then symbolize_hash(object)
11
+ when Array then object.map { |value| deep_symbolize(value) }
12
+ else
13
+ object
14
+ end
15
+ end
16
+
17
+ private
18
+
19
+ def symbolize_hash(hash)
20
+ hash.each_with_object({}) do |(key, value), symbolized|
21
+ symbolized[key.to_sym] = deep_symbolize(value)
12
22
  end
13
23
  end
14
24
  end
@@ -5,6 +5,11 @@ module RubyLLM
5
5
  module Concerns
6
6
  module EvalHost
7
7
  include ContextHelpers
8
+
9
+ SAMPLE_RESPONSE_COMPARE_WARNING = "[ruby_llm-contract] compare_with ignores sample_response. " \
10
+ "Without model: or context: { adapter: ... }, both sides will be skipped " \
11
+ "and the A/B comparison is not meaningful.".freeze
12
+
8
13
  def define_eval(name, &)
9
14
  @eval_definitions ||= {}
10
15
  @file_sourced_evals ||= Set.new
@@ -45,6 +50,26 @@ module RubyLLM
45
50
  end
46
51
  end
47
52
 
53
+ # Compare this step (candidate) with another step (baseline) using the
54
+ # baseline's eval definition as single source of truth.
55
+ #
56
+ # Requires a real adapter or model in context. sample_response is
57
+ # intentionally NOT used, because A/B testing with canned data
58
+ # gives identical results for both sides rather than a real comparison.
59
+ def compare_with(other_step, eval:, model: nil, context: {})
60
+ ctx = comparison_context(context, model)
61
+ baseline_defn = baseline_eval_definition(other_step, eval)
62
+ raise ArgumentError, "No eval '#{eval}' on baseline step #{other_step}" unless baseline_defn
63
+
64
+ dataset = baseline_defn.build_dataset
65
+ warn_sample_response_compare(ctx, baseline_defn)
66
+
67
+ my_report = Eval::Runner.run(step: self, dataset: dataset, context: isolate_context(ctx))
68
+ other_report = Eval::Runner.run(step: other_step, dataset: dataset, context: isolate_context(ctx))
69
+
70
+ Eval::PromptDiff.new(candidate: my_report, baseline: other_report)
71
+ end
72
+
48
73
  def compare_models(eval_name, models:, context: {})
49
74
  context = safe_context(context)
50
75
  models = models.uniq
@@ -57,6 +82,21 @@ module RubyLLM
57
82
 
58
83
  private
59
84
 
85
+ def comparison_context(context, model)
86
+ base_context = safe_context(context)
87
+ model ? base_context.merge(model: model) : base_context
88
+ end
89
+
90
+ def baseline_eval_definition(other_step, eval_name)
91
+ other_step.send(:all_eval_definitions)[eval_name.to_s]
92
+ end
93
+
94
+ def warn_sample_response_compare(context, baseline_defn)
95
+ return if context[:adapter] || context[:model] || !baseline_defn.build_adapter
96
+
97
+ warn SAMPLE_RESPONSE_COMPARE_WARNING
98
+ end
99
+
60
100
  def all_eval_definitions
61
101
  inherited = if superclass.respond_to?(:all_eval_definitions, true)
62
102
  superclass.send(:all_eval_definitions)
@@ -71,20 +111,24 @@ module RubyLLM
71
111
  defn = all_eval_definitions[name.to_s]
72
112
  raise ArgumentError, "No eval '#{name}' defined. Available: #{all_eval_definitions.keys}" unless defn
73
113
 
74
- effective_context = eval_context(defn, context)
75
- Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
76
- concurrency: concurrency)
114
+ run_eval_definition(defn, context, concurrency: concurrency)
77
115
  end
78
116
 
79
117
  def run_all_own_evals(context, concurrency: nil)
80
118
  all_eval_definitions.transform_values do |defn|
81
- isolated_context = isolate_context(context)
82
- effective_context = eval_context(defn, isolated_context)
83
- Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
84
- concurrency: concurrency)
119
+ run_eval_definition(defn, isolate_context(context), concurrency: concurrency)
85
120
  end
86
121
  end
87
122
 
123
+ def run_eval_definition(defn, context, concurrency: nil)
124
+ Eval::Runner.run(
125
+ step: self,
126
+ dataset: defn.build_dataset,
127
+ context: eval_context(defn, context),
128
+ concurrency: concurrency
129
+ )
130
+ end
131
+
88
132
  def eval_context(defn, context)
89
133
  context = safe_context(context)
90
134
  return context if context[:adapter]
@@ -0,0 +1,85 @@
1
# frozen_string_literal: true

module RubyLLM
  module Contract
    class SchemaValidator
      # Checks a node's value against the range constraints found in its schema.
      #
      # Numeric values are compared with the schema's :minimum / :maximum
      # entries. Strings and arrays are compared by length against the schema
      # keys named in SchemaValidator::SIZE_BOUNDS. Every violation is appended
      # to the errors collection handed to the constructor.
      class BoundRule
        NUMERIC_LIMITS = [
          { key: :minimum, label: "minimum", relation: "below", invalid: ->(actual, limit) { actual < limit } },
          { key: :maximum, label: "maximum", relation: "above", invalid: ->(actual, limit) { actual > limit } }
        ].freeze
        SIZE_LIMITS = [
          { bound: :min_key, relation: "below", invalid: ->(actual, limit) { actual < limit } },
          { bound: :max_key, relation: "above", invalid: ->(actual, limit) { actual > limit } }
        ].freeze

        # @param errors [#<<] collection that receives one message per violation
        def initialize(errors)
          @errors = errors
        end

        # Runs the numeric check (for Numeric values only) and then the size check.
        #
        # @param node [#value, #schema, #path] the node under validation
        def validate(node)
          value, schema, path = node.value, node.schema, node.path

          report_numeric_violations(path, value, schema) if value.is_a?(Numeric)
          report_size_violations(path, value, schema)
        end

        private

        # Compares a numeric value with each limit listed in NUMERIC_LIMITS.
        # A limit that is absent from the schema is skipped.
        def report_numeric_violations(path, value, schema)
          NUMERIC_LIMITS.each do |rule|
            limit = schema[rule[:key]]
            next unless limit && rule[:invalid].call(value, limit)

            @errors << "#{path}: #{value} is #{rule[:relation]} #{rule[:label]} #{limit}"
          end
        end

        # Compares the length of a string or array with each limit listed in
        # SIZE_LIMITS. Values of any other type have no size bounds and are ignored.
        def report_size_violations(path, value, schema)
          bounds = size_bounds_for(value)
          return unless bounds

          size = value.length
          metric = bounds[:metric]

          SIZE_LIMITS.each do |rule|
            # The bounds entry maps :min_key / :max_key to the schema key to read.
            schema_key = bounds[rule[:bound]]
            limit = schema[schema_key]
            next unless limit && rule[:invalid].call(size, limit)

            subject = metric ? "#{metric} #{size}" : size
            @errors << "#{path}: #{subject} is #{rule[:relation]} #{schema_key} #{limit}"
          end
        end

        # Picks the SIZE_BOUNDS entry matching the value's type, or nil.
        def size_bounds_for(value)
          if value.is_a?(String)
            SchemaValidator::SIZE_BOUNDS[:string]
          elsif value.is_a?(Array)
            SchemaValidator::SIZE_BOUNDS[:array]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
# frozen_string_literal: true

module RubyLLM
  module Contract
    class SchemaValidator
      # Reports a node whose value is not among the values listed under the
      # schema's :enum key. A node whose schema has no :enum entry is ignored.
      class EnumRule
        # @param errors [#<<] collection that receives one message per violation
        def initialize(errors)
          @errors = errors
        end

        # @param node [#value, #schema, #path] the node under validation
        def validate(node)
          permitted = node.schema[:enum]
          actual = node.value
          # Nothing to report when no enum is declared or the value is a member.
          return if !permitted || permitted.include?(actual)

          @errors << "#{node.path}: #{actual.inspect} is not in enum #{permitted.inspect}"
        end
      end
    end
  end
end