ruby_llm-contract 0.4.5 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubycritic.yml +8 -0
- data/.simplecov +22 -0
- data/CHANGELOG.md +25 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +104 -2
- data/README.md +55 -2
- data/lib/ruby_llm/contract/adapters/ruby_llm.rb +4 -1
- data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
- data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
- data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
- data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
- data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
- data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
- data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
- data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
- data/lib/ruby_llm/contract/contract/validator.rb +9 -0
- data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
- data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
- data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
- data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
- data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
- data/lib/ruby_llm/contract/eval/report.rb +19 -191
- data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
- data/lib/ruby_llm/contract/eval/runner.rb +30 -207
- data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
- data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
- data/lib/ruby_llm/contract/eval.rb +13 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
- data/lib/ruby_llm/contract/rspec.rb +5 -0
- data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
- data/lib/ruby_llm/contract/step/base.rb +94 -39
- data/lib/ruby_llm/contract/step/dsl.rb +10 -0
- data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
- data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
- data/lib/ruby_llm/contract/step/result.rb +3 -2
- data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
- data/lib/ruby_llm/contract/step/runner.rb +46 -85
- data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
- data/lib/ruby_llm/contract/step.rb +5 -0
- data/lib/ruby_llm/contract/version.rb +1 -1
- metadata +28 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 359d08f8cf1e31b84f308c47c7f93c7cee7663054de3ab538a34c1f67873554f
|
|
4
|
+
data.tar.gz: 60d8728bed042277d40ec1d231b6712e258b658fd893a73afc6ed1f8e9cff8c8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4bd4d7cea9fde7281bf84e1283c4201f8c5e9425cb8357e40b85e5184f19f51eb57a88a35901eddf571defd93ff33ef790e24b5e2eb90add8ef6371e791d37e5
|
|
7
|
+
data.tar.gz: e68ca27fc2225224cd900b1afb2180cfd43929e0461420c7fd2987706a2ebaa282b1e659c8b5c14e69e30d1250ede547061e2d2ab74b5c9cc0bb7fdb77109f0a
|
data/.rubycritic.yml
ADDED
data/.simplecov
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "simplecov"
|
|
4
|
+
|
|
5
|
+
SimpleCov.start do
|
|
6
|
+
enable_coverage :branch
|
|
7
|
+
primary_coverage :branch
|
|
8
|
+
|
|
9
|
+
add_filter "/spec/"
|
|
10
|
+
add_filter "/examples/"
|
|
11
|
+
add_filter "/internal/"
|
|
12
|
+
add_filter "/tmp/"
|
|
13
|
+
|
|
14
|
+
track_files "lib/**/*.rb"
|
|
15
|
+
|
|
16
|
+
if ENV["CI"] == "true" || ENV["SIMPLECOV_STRICT"] == "1"
|
|
17
|
+
minimum_coverage line: 89
|
|
18
|
+
minimum_coverage branch: 75
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
command_name "RSpec"
|
|
22
|
+
end
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,30 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.5.2 (2026-04-06)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
|
|
7
|
+
- **`reasoning_effort` forwarded to provider** — `context: { reasoning_effort: "low" }` now passed through `with_params` to the LLM. Previously accepted as a known context key but silently ignored by the RubyLLM adapter.
|
|
8
|
+
|
|
9
|
+
## 0.5.0 (2026-03-25)
|
|
10
|
+
|
|
11
|
+
Data-Driven Prompt Engineering — see ADR-0015.
|
|
12
|
+
|
|
13
|
+
### Features
|
|
14
|
+
|
|
15
|
+
- **`observe` DSL** — soft observations that log but never fail. `observe("scores differ") { |o| o[:a] != o[:b] }`. Results in `result.observations`. Logged via `Contract.logger` when they fail. Runs only when validation passes.
|
|
16
|
+
- **`compare_with`** — prompt A/B testing. `StepV2.compare_with(StepV1, eval: "regression", model: "nano")` returns `PromptDiff` with `improvements`, `regressions`, `score_delta`, `safe_to_switch?`. Reuses `BaselineDiff` internally.
|
|
17
|
+
- **RSpec `compared_with` chain** — `expect(StepV2).to pass_eval("x").compared_with(StepV1).without_regressions` blocks merge if new prompt regresses any case.
|
|
18
|
+
|
|
19
|
+
### Game changer continuity
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
v0.2: "Which model?" → compare_models (snapshot)
|
|
23
|
+
v0.3: "Did it change?" → baseline regression (binary)
|
|
24
|
+
v0.4: "Show me the trend" → eval history (time series)
|
|
25
|
+
v0.5: "Which prompt is better?" → compare_with (A/B testing)
|
|
26
|
+
```
|
|
27
|
+
|
|
3
28
|
## 0.4.5 (2026-03-24)
|
|
4
29
|
|
|
5
30
|
Audit hardening — 18 bugs fixed across 4 audit rounds.
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
ruby_llm-contract (0.4.5)
|
|
4
|
+
ruby_llm-contract (0.5.2)
|
|
5
5
|
dry-types (~> 1.7)
|
|
6
6
|
ruby_llm (~> 1.0)
|
|
7
7
|
ruby_llm-schema (~> 0.3)
|
|
@@ -12,20 +12,43 @@ GEM
|
|
|
12
12
|
addressable (2.8.9)
|
|
13
13
|
public_suffix (>= 2.0.2, < 8.0)
|
|
14
14
|
ast (2.4.3)
|
|
15
|
+
axiom-types (0.1.1)
|
|
16
|
+
descendants_tracker (~> 0.0.4)
|
|
17
|
+
ice_nine (~> 0.11.0)
|
|
18
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
|
15
19
|
base64 (0.3.0)
|
|
16
20
|
bigdecimal (4.0.1)
|
|
21
|
+
childprocess (5.1.0)
|
|
22
|
+
logger (~> 1.5)
|
|
23
|
+
coercible (1.0.0)
|
|
24
|
+
descendants_tracker (~> 0.0.1)
|
|
17
25
|
concurrent-ruby (1.3.6)
|
|
26
|
+
descendants_tracker (0.0.4)
|
|
27
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
|
18
28
|
diff-lcs (1.6.2)
|
|
29
|
+
docile (1.4.1)
|
|
30
|
+
dry-configurable (1.3.0)
|
|
31
|
+
dry-core (~> 1.1)
|
|
32
|
+
zeitwerk (~> 2.6)
|
|
19
33
|
dry-core (1.2.0)
|
|
20
34
|
concurrent-ruby (~> 1.0)
|
|
21
35
|
logger
|
|
22
36
|
zeitwerk (~> 2.6)
|
|
23
37
|
dry-inflector (1.3.1)
|
|
38
|
+
dry-initializer (3.2.0)
|
|
24
39
|
dry-logic (1.6.0)
|
|
25
40
|
bigdecimal
|
|
26
41
|
concurrent-ruby (~> 1.0)
|
|
27
42
|
dry-core (~> 1.1)
|
|
28
43
|
zeitwerk (~> 2.6)
|
|
44
|
+
dry-schema (1.16.0)
|
|
45
|
+
concurrent-ruby (~> 1.0)
|
|
46
|
+
dry-configurable (~> 1.0, >= 1.0.1)
|
|
47
|
+
dry-core (~> 1.1)
|
|
48
|
+
dry-initializer (~> 3.2)
|
|
49
|
+
dry-logic (~> 1.6)
|
|
50
|
+
dry-types (~> 1.9, >= 1.9.1)
|
|
51
|
+
zeitwerk (~> 2.6)
|
|
29
52
|
dry-types (1.9.1)
|
|
30
53
|
bigdecimal (>= 3.0)
|
|
31
54
|
concurrent-ruby (~> 1.0)
|
|
@@ -33,6 +56,7 @@ GEM
|
|
|
33
56
|
dry-inflector (~> 1.0)
|
|
34
57
|
dry-logic (~> 1.4)
|
|
35
58
|
zeitwerk (~> 2.6)
|
|
59
|
+
erubi (1.13.1)
|
|
36
60
|
event_stream_parser (1.0.0)
|
|
37
61
|
faraday (2.14.1)
|
|
38
62
|
faraday-net_http (>= 2.0, < 3.5)
|
|
@@ -44,11 +68,25 @@ GEM
|
|
|
44
68
|
net-http (~> 0.5)
|
|
45
69
|
faraday-retry (2.4.0)
|
|
46
70
|
faraday (~> 2.0)
|
|
71
|
+
flay (2.14.3)
|
|
72
|
+
erubi (~> 1.10)
|
|
73
|
+
path_expander (~> 2.0)
|
|
74
|
+
prism (~> 1.7)
|
|
75
|
+
sexp_processor (~> 4.0)
|
|
76
|
+
flog (4.9.4)
|
|
77
|
+
path_expander (~> 2.0)
|
|
78
|
+
prism (~> 1.7)
|
|
79
|
+
sexp_processor (~> 4.8)
|
|
80
|
+
ice_nine (0.11.2)
|
|
47
81
|
json (2.19.2)
|
|
48
82
|
json-schema (6.2.0)
|
|
49
83
|
addressable (~> 2.8)
|
|
50
84
|
bigdecimal (>= 3.1, < 5)
|
|
51
85
|
language_server-protocol (3.17.0.5)
|
|
86
|
+
launchy (3.1.1)
|
|
87
|
+
addressable (~> 2.8)
|
|
88
|
+
childprocess (~> 5.0)
|
|
89
|
+
logger (~> 1.6)
|
|
52
90
|
lint_roller (1.1.0)
|
|
53
91
|
logger (1.7.0)
|
|
54
92
|
marcel (1.1.0)
|
|
@@ -61,12 +99,20 @@ GEM
|
|
|
61
99
|
parser (3.3.10.2)
|
|
62
100
|
ast (~> 2.4.1)
|
|
63
101
|
racc
|
|
102
|
+
path_expander (2.0.1)
|
|
64
103
|
prism (1.9.0)
|
|
65
104
|
public_suffix (7.0.5)
|
|
66
105
|
racc (1.8.1)
|
|
67
106
|
rainbow (3.1.1)
|
|
68
107
|
rake (13.3.1)
|
|
108
|
+
reek (6.5.0)
|
|
109
|
+
dry-schema (~> 1.13)
|
|
110
|
+
logger (~> 1.6)
|
|
111
|
+
parser (~> 3.3.0)
|
|
112
|
+
rainbow (>= 2.0, < 4.0)
|
|
113
|
+
rexml (~> 3.1)
|
|
69
114
|
regexp_parser (2.11.3)
|
|
115
|
+
rexml (3.4.4)
|
|
70
116
|
rspec (3.13.2)
|
|
71
117
|
rspec-core (~> 3.13.0)
|
|
72
118
|
rspec-expectations (~> 3.13.0)
|
|
@@ -107,10 +153,39 @@ GEM
|
|
|
107
153
|
ruby_llm-schema (~> 0)
|
|
108
154
|
zeitwerk (~> 2)
|
|
109
155
|
ruby_llm-schema (0.3.0)
|
|
156
|
+
ruby_parser (3.22.0)
|
|
157
|
+
racc (~> 1.5)
|
|
158
|
+
sexp_processor (~> 4.16)
|
|
159
|
+
rubycritic (4.12.0)
|
|
160
|
+
flay (~> 2.13)
|
|
161
|
+
flog (~> 4.7)
|
|
162
|
+
launchy (>= 2.5.2)
|
|
163
|
+
parser (>= 3.3.0.5)
|
|
164
|
+
prism (>= 1.6.0)
|
|
165
|
+
rainbow (~> 3.1.1)
|
|
166
|
+
reek (~> 6.5.0, < 7.0)
|
|
167
|
+
rexml
|
|
168
|
+
ruby_parser (~> 3.21)
|
|
169
|
+
simplecov (>= 0.22.0)
|
|
170
|
+
tty-which (~> 0.5.0)
|
|
171
|
+
virtus (~> 2.0)
|
|
172
|
+
sexp_processor (4.17.5)
|
|
173
|
+
simplecov (0.22.0)
|
|
174
|
+
docile (~> 1.1)
|
|
175
|
+
simplecov-html (~> 0.11)
|
|
176
|
+
simplecov_json_formatter (~> 0.1)
|
|
177
|
+
simplecov-html (0.13.2)
|
|
178
|
+
simplecov_json_formatter (0.1.4)
|
|
179
|
+
thread_safe (0.3.6)
|
|
180
|
+
tty-which (0.5.0)
|
|
110
181
|
unicode-display_width (3.2.0)
|
|
111
182
|
unicode-emoji (~> 4.1)
|
|
112
183
|
unicode-emoji (4.2.0)
|
|
113
184
|
uri (1.1.1)
|
|
185
|
+
virtus (2.0.0)
|
|
186
|
+
axiom-types (~> 0.1)
|
|
187
|
+
coercible (~> 1.0)
|
|
188
|
+
descendants_tracker (~> 0.0, >= 0.0.3)
|
|
114
189
|
zeitwerk (2.7.5)
|
|
115
190
|
|
|
116
191
|
PLATFORMS
|
|
@@ -122,26 +197,41 @@ DEPENDENCIES
|
|
|
122
197
|
rspec (~> 3.13)
|
|
123
198
|
rubocop (~> 1.75)
|
|
124
199
|
ruby_llm-contract!
|
|
200
|
+
rubycritic (~> 4.9)
|
|
201
|
+
simplecov (~> 0.22)
|
|
125
202
|
|
|
126
203
|
CHECKSUMS
|
|
127
204
|
addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
|
|
128
205
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
206
|
+
axiom-types (0.1.1) sha256=c1ff113f3de516fa195b2db7e0a9a95fd1b08475a502ff660d04507a09980383
|
|
129
207
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
130
208
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
209
|
+
childprocess (5.1.0) sha256=9a8d484be2fd4096a0e90a0cd3e449a05bc3aa33f8ac9e4d6dcef6ac1455b6ec
|
|
210
|
+
coercible (1.0.0) sha256=5081ad24352cc8435ce5472bc2faa30260c7ea7f2102cc6a9f167c4d9bffaadc
|
|
131
211
|
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
212
|
+
descendants_tracker (0.0.4) sha256=e9c41dd4cfbb85829a9301ea7e7c48c2a03b26f09319db230e6479ccdc780897
|
|
132
213
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
214
|
+
docile (1.4.1) sha256=96159be799bfa73cdb721b840e9802126e4e03dfc26863db73647204c727f21e
|
|
215
|
+
dry-configurable (1.3.0) sha256=882d862858567fc1210d2549d4c090f34370fc1bb7c5c1933de3fe792e18afa8
|
|
133
216
|
dry-core (1.2.0) sha256=0cc5a7da88df397f153947eeeae42e876e999c1e30900f3c536fb173854e96a1
|
|
134
217
|
dry-inflector (1.3.1) sha256=7fb0c2bb04f67638f25c52e7ba39ab435d922a3a5c3cd196120f63accb682dcc
|
|
218
|
+
dry-initializer (3.2.0) sha256=37d59798f912dc0a1efe14a4db4a9306989007b302dcd5f25d0a2a20c166c4e3
|
|
135
219
|
dry-logic (1.6.0) sha256=da6fedbc0f90fc41f9b0cc7e6f05f5d529d1efaef6c8dcc8e0733f685745cea2
|
|
220
|
+
dry-schema (1.16.0) sha256=cd3aaeabc0f1af66ec82a29096d4c4fb92a0a58b9dae29a22b1bbceb78985727
|
|
136
221
|
dry-types (1.9.1) sha256=baebeecdb9f8395d6c9d227b62011279440943e3ef2468fe8ccc1ba11467f178
|
|
222
|
+
erubi (1.13.1) sha256=a082103b0885dbc5ecf1172fede897f9ebdb745a4b97a5e8dc63953db1ee4ad9
|
|
137
223
|
event_stream_parser (1.0.0) sha256=a2683bab70126286f8184dc88f7968ffc4028f813161fb073ec90d171f7de3c8
|
|
138
224
|
faraday (2.14.1) sha256=a43cceedc1e39d188f4d2cdd360a8aaa6a11da0c407052e426ba8d3fb42ef61c
|
|
139
225
|
faraday-multipart (1.2.0) sha256=7d89a949693714176f612323ca13746a2ded204031a6ba528adee788694ef757
|
|
140
226
|
faraday-net_http (3.4.2) sha256=f147758260d3526939bf57ecf911682f94926a3666502e24c69992765875906c
|
|
141
227
|
faraday-retry (2.4.0) sha256=7b79c48fb7e56526faf247b12d94a680071ff40c9fda7cf1ec1549439ad11ebe
|
|
228
|
+
flay (2.14.3) sha256=7f96a495f4bde880460e77e7345464e752bd44f09f5cd30c80af02afe0ed3f29
|
|
229
|
+
flog (4.9.4) sha256=12cc054fab7a2cbd2a906514397c4d7788954d530564782d6f14939dc2dfbcbb
|
|
230
|
+
ice_nine (0.11.2) sha256=5d506a7d2723d5592dc121b9928e4931742730131f22a1a37649df1c1e2e63db
|
|
142
231
|
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
143
232
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
144
233
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
234
|
+
launchy (3.1.1) sha256=72b847b5cc961589dde2c395af0108c86ff0119f42d4648d25b5440ebb10059e
|
|
145
235
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
146
236
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
147
237
|
marcel (1.1.0) sha256=fdcfcfa33cc52e93c4308d40e4090a5d4ea279e160a7f6af988260fa970e0bee
|
|
@@ -150,12 +240,15 @@ CHECKSUMS
|
|
|
150
240
|
net-http (0.9.1) sha256=25ba0b67c63e89df626ed8fac771d0ad24ad151a858af2cc8e6a716ca4336996
|
|
151
241
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
152
242
|
parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
|
|
243
|
+
path_expander (2.0.1) sha256=2de201164bff4719cc4d0b3767286e9977cc832a59c4d70abab571ec86cb41e4
|
|
153
244
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
154
245
|
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
|
155
246
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
156
247
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
157
248
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
249
|
+
reek (6.5.0) sha256=d26d3a492773b2bbc228888067a21afe33ac07954a17dbd64cdeae42c4c69be1
|
|
158
250
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
251
|
+
rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142
|
|
159
252
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
160
253
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
161
254
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
@@ -165,11 +258,20 @@ CHECKSUMS
|
|
|
165
258
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
166
259
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
167
260
|
ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
|
|
168
|
-
ruby_llm-contract (0.4.5)
|
|
261
|
+
ruby_llm-contract (0.5.2)
|
|
169
262
|
ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
|
|
263
|
+
ruby_parser (3.22.0) sha256=1eb4937cd9eb220aa2d194e352a24dba90aef00751e24c8dfffdb14000f15d23
|
|
264
|
+
rubycritic (4.12.0) sha256=024fed90fe656fa939f6ea80aab17569699ac3863d0b52fd72cb99892247abc8
|
|
265
|
+
sexp_processor (4.17.5) sha256=ae2b48ba98353d5d465ce8759836b7a05f2e12c5879fcd14d7815b026de32f0e
|
|
266
|
+
simplecov (0.22.0) sha256=fe2622c7834ff23b98066bb0a854284b2729a569ac659f82621fc22ef36213a5
|
|
267
|
+
simplecov-html (0.13.2) sha256=bd0b8e54e7c2d7685927e8d6286466359b6f16b18cb0df47b508e8d73c777246
|
|
268
|
+
simplecov_json_formatter (0.1.4) sha256=529418fbe8de1713ac2b2d612aa3daa56d316975d307244399fa4838c601b428
|
|
269
|
+
thread_safe (0.3.6) sha256=9ed7072821b51c57e8d6b7011a8e282e25aeea3a4065eab326e43f66f063b05a
|
|
270
|
+
tty-which (0.5.0) sha256=5824055f0d6744c97e7c4426544f01d519c40d1806ef2ef47d9854477993f466
|
|
170
271
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
171
272
|
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
|
172
273
|
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
|
|
274
|
+
virtus (2.0.0) sha256=8841dae4eb7fcc097320ba5ea516bf1839e5d056c61ee27138aa4bddd6e3d1c2
|
|
173
275
|
zeitwerk (2.7.5) sha256=d8da92128c09ea6ec62c949011b00ed4a20242b255293dd66bf41545398f73dd
|
|
174
276
|
|
|
175
277
|
BUNDLED WITH
|
data/README.md
CHANGED
|
@@ -38,6 +38,18 @@ result.trace[:model] # => "gpt-4.1-nano"
|
|
|
38
38
|
|
|
39
39
|
Bad JSON? Auto-retry. Wrong value? Escalate to a smarter model. Schema violated? Caught client-side even if the provider ignores it. All with cost tracking.
|
|
40
40
|
|
|
41
|
+
## Start Here: Eval-First
|
|
42
|
+
|
|
43
|
+
The most powerful way to use this gem is simple:
|
|
44
|
+
|
|
45
|
+
- define evals before changing prompts
|
|
46
|
+
- compare prompt versions on the same dataset
|
|
47
|
+
- merge only when the eval stays green
|
|
48
|
+
|
|
49
|
+
Read: [Eval-First](docs/guide/eval_first.md)
|
|
50
|
+
|
|
51
|
+
This is the workflow that gives prompt engineering teeth. No vibes, no cherry-picked examples, no "it felt better in the playground". Just cases, regressions, baselines, and measured wins.
|
|
52
|
+
|
|
41
53
|
## Which model should I use?
|
|
42
54
|
|
|
43
55
|
Define test cases. Compare models. Get data.
|
|
@@ -157,6 +169,44 @@ report = ClassifyTicket.run_eval("regression",
|
|
|
157
169
|
concurrency: 4)
|
|
158
170
|
```
|
|
159
171
|
|
|
172
|
+
## Prompt A/B testing
|
|
173
|
+
|
|
174
|
+
Changed a prompt? Compare old vs new with regression safety:
|
|
175
|
+
|
|
176
|
+
```ruby
|
|
177
|
+
diff = ClassifyTicketV2.compare_with(ClassifyTicketV1,
|
|
178
|
+
eval: "regression", model: "gpt-4.1-mini")
|
|
179
|
+
|
|
180
|
+
diff.safe_to_switch? # => true (no regressions, no per-case score drops)
|
|
181
|
+
diff.improvements # => [{case: "outage", ...}]
|
|
182
|
+
diff.score_delta # => +0.33
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Requires `model:` or `context: { adapter: ... }`.
|
|
186
|
+
`compare_with` ignores `sample_response`; without a real model/adapter both sides are skipped and the A/B result is not meaningful.
|
|
187
|
+
|
|
188
|
+
CI gate:
|
|
189
|
+
```ruby
|
|
190
|
+
expect(ClassifyTicketV2).to pass_eval("regression")
|
|
191
|
+
.compared_with(ClassifyTicketV1)
|
|
192
|
+
.with_minimum_score(0.8)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Soft observations
|
|
196
|
+
|
|
197
|
+
Log suspicious-but-not-invalid output without failing the contract:
|
|
198
|
+
|
|
199
|
+
```ruby
|
|
200
|
+
class EvaluateComparative < RubyLLM::Contract::Step::Base
|
|
201
|
+
validate("scores in range") { |o| (1..10).include?(o[:score_a]) }
|
|
202
|
+
observe("scores should differ") { |o| o[:score_a] != o[:score_b] }
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
result = EvaluateComparative.run(input)
|
|
206
|
+
result.ok? # => true (observe never fails)
|
|
207
|
+
result.observations # => [{description: "scores should differ", passed: false}]
|
|
208
|
+
```
|
|
209
|
+
|
|
160
210
|
## Predict cost before running
|
|
161
211
|
|
|
162
212
|
```ruby
|
|
@@ -182,6 +232,7 @@ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc).
|
|
|
182
232
|
| Guide | |
|
|
183
233
|
|-------|-|
|
|
184
234
|
| [Getting Started](docs/guide/getting_started.md) | Features walkthrough, model escalation, eval |
|
|
235
|
+
| [Eval-First](docs/guide/eval_first.md) | Practical workflow for prompt engineering with datasets, baselines, and A/B gates |
|
|
185
236
|
| [Best Practices](docs/guide/best_practices.md) | 6 patterns for bulletproof validates |
|
|
186
237
|
| [Output Schema](docs/guide/output_schema.md) | Full schema reference + constraints |
|
|
187
238
|
| [Pipeline](docs/guide/pipeline.md) | Multi-step composition, timeout, fail-fast |
|
|
@@ -190,11 +241,13 @@ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc).
|
|
|
190
241
|
|
|
191
242
|
## Roadmap
|
|
192
243
|
|
|
193
|
-
**v0.
|
|
244
|
+
**v0.5 (current):** Data-driven prompt engineering — `compare_with(OtherStep)` for prompt A/B testing with regression safety. `observe` DSL for soft observations that log but never fail.
|
|
245
|
+
|
|
246
|
+
**v0.4:** Observability & scale — eval history with trending, batch eval with concurrency, pipeline per-step eval, Minitest support, structured logging. Audit hardening (18 bugfixes).
|
|
194
247
|
|
|
195
248
|
**v0.3:** Baseline regression detection, migration guide, production hardening.
|
|
196
249
|
|
|
197
|
-
**v0.
|
|
250
|
+
**v0.6:** Model recommendation based on eval history data. Cross-provider comparison docs.
|
|
198
251
|
|
|
199
252
|
## License
|
|
200
253
|
|
|
@@ -52,7 +52,10 @@ module RubyLLM
|
|
|
52
52
|
CHAT_OPTION_METHODS.each do |key, method_name|
|
|
53
53
|
chat.public_send(method_name, options[key]) if options[key]
|
|
54
54
|
end
|
|
55
|
-
|
|
55
|
+
params = {}
|
|
56
|
+
params[:max_tokens] = options[:max_tokens] if options[:max_tokens]
|
|
57
|
+
params[:reasoning_effort] = options[:reasoning_effort] if options[:reasoning_effort]
|
|
58
|
+
chat.with_params(**params) if params.any?
|
|
56
59
|
end
|
|
57
60
|
|
|
58
61
|
def build_response(response)
|
|
@@ -9,22 +9,23 @@ module RubyLLM
|
|
|
9
9
|
private
|
|
10
10
|
|
|
11
11
|
def safe_context(context)
|
|
12
|
-
(context || {}).transform_keys { |
|
|
12
|
+
(context || {}).transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
def isolate_context(context)
|
|
16
|
-
context.transform_values do |
|
|
17
|
-
|
|
18
|
-
v.clone_for_concurrency
|
|
19
|
-
elsif v.respond_to?(:dup)
|
|
20
|
-
v.dup
|
|
21
|
-
else
|
|
22
|
-
v
|
|
23
|
-
end
|
|
16
|
+
context.transform_values do |value|
|
|
17
|
+
duplicate_context_value(value)
|
|
24
18
|
rescue TypeError
|
|
25
|
-
|
|
19
|
+
value
|
|
26
20
|
end
|
|
27
21
|
end
|
|
22
|
+
|
|
23
|
+
def duplicate_context_value(value)
|
|
24
|
+
return value.clone_for_concurrency if value.respond_to?(:clone_for_concurrency)
|
|
25
|
+
return value.dup if value.respond_to?(:dup)
|
|
26
|
+
|
|
27
|
+
value
|
|
28
|
+
end
|
|
28
29
|
end
|
|
29
30
|
end
|
|
30
31
|
end
|
|
@@ -8,15 +8,21 @@ module RubyLLM
|
|
|
8
8
|
module DeepFreeze
|
|
9
9
|
private
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
when
|
|
16
|
-
when
|
|
17
|
-
|
|
11
|
+
IMMUTABLE_TYPES = [NilClass, Integer, Float, Symbol, TrueClass, FalseClass].freeze
|
|
12
|
+
|
|
13
|
+
def deep_dup_freeze(object)
|
|
14
|
+
case object
|
|
15
|
+
when *IMMUTABLE_TYPES then object
|
|
16
|
+
when Hash then object.transform_values { |value| deep_dup_freeze(value) }.freeze
|
|
17
|
+
when Array then object.map { |value| deep_dup_freeze(value) }.freeze
|
|
18
|
+
else
|
|
19
|
+
frozen_copy(object)
|
|
18
20
|
end
|
|
19
21
|
end
|
|
22
|
+
|
|
23
|
+
def frozen_copy(object)
|
|
24
|
+
object.frozen? ? object : object.dup.freeze
|
|
25
|
+
end
|
|
20
26
|
end
|
|
21
27
|
end
|
|
22
28
|
end
|
|
@@ -3,12 +3,22 @@
|
|
|
3
3
|
module RubyLLM
|
|
4
4
|
module Contract
|
|
5
5
|
module Concerns
|
|
6
|
+
# Recursively converts Hash keys to symbols while preserving array shape.
|
|
6
7
|
module DeepSymbolize
|
|
7
|
-
def deep_symbolize(
|
|
8
|
-
case
|
|
9
|
-
when Hash then
|
|
10
|
-
when Array then
|
|
11
|
-
else
|
|
8
|
+
def deep_symbolize(object)
|
|
9
|
+
case object
|
|
10
|
+
when Hash then symbolize_hash(object)
|
|
11
|
+
when Array then object.map { |value| deep_symbolize(value) }
|
|
12
|
+
else
|
|
13
|
+
object
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
private
|
|
18
|
+
|
|
19
|
+
def symbolize_hash(hash)
|
|
20
|
+
hash.each_with_object({}) do |(key, value), symbolized|
|
|
21
|
+
symbolized[key.to_sym] = deep_symbolize(value)
|
|
12
22
|
end
|
|
13
23
|
end
|
|
14
24
|
end
|
|
@@ -5,6 +5,11 @@ module RubyLLM
|
|
|
5
5
|
module Concerns
|
|
6
6
|
module EvalHost
|
|
7
7
|
include ContextHelpers
|
|
8
|
+
|
|
9
|
+
SAMPLE_RESPONSE_COMPARE_WARNING = "[ruby_llm-contract] compare_with ignores sample_response. " \
|
|
10
|
+
"Without model: or context: { adapter: ... }, both sides will be skipped " \
|
|
11
|
+
"and the A/B comparison is not meaningful.".freeze
|
|
12
|
+
|
|
8
13
|
def define_eval(name, &)
|
|
9
14
|
@eval_definitions ||= {}
|
|
10
15
|
@file_sourced_evals ||= Set.new
|
|
@@ -45,6 +50,26 @@ module RubyLLM
|
|
|
45
50
|
end
|
|
46
51
|
end
|
|
47
52
|
|
|
53
|
+
# Compare this step (candidate) with another step (baseline) using the
|
|
54
|
+
# baseline's eval definition as single source of truth.
|
|
55
|
+
#
|
|
56
|
+
# Requires a real adapter or model in context. sample_response is
|
|
57
|
+
# intentionally NOT used, because A/B testing with canned data
|
|
58
|
+
# gives identical results for both sides rather than a real comparison.
|
|
59
|
+
def compare_with(other_step, eval:, model: nil, context: {})
|
|
60
|
+
ctx = comparison_context(context, model)
|
|
61
|
+
baseline_defn = baseline_eval_definition(other_step, eval)
|
|
62
|
+
raise ArgumentError, "No eval '#{eval}' on baseline step #{other_step}" unless baseline_defn
|
|
63
|
+
|
|
64
|
+
dataset = baseline_defn.build_dataset
|
|
65
|
+
warn_sample_response_compare(ctx, baseline_defn)
|
|
66
|
+
|
|
67
|
+
my_report = Eval::Runner.run(step: self, dataset: dataset, context: isolate_context(ctx))
|
|
68
|
+
other_report = Eval::Runner.run(step: other_step, dataset: dataset, context: isolate_context(ctx))
|
|
69
|
+
|
|
70
|
+
Eval::PromptDiff.new(candidate: my_report, baseline: other_report)
|
|
71
|
+
end
|
|
72
|
+
|
|
48
73
|
def compare_models(eval_name, models:, context: {})
|
|
49
74
|
context = safe_context(context)
|
|
50
75
|
models = models.uniq
|
|
@@ -57,6 +82,21 @@ module RubyLLM
|
|
|
57
82
|
|
|
58
83
|
private
|
|
59
84
|
|
|
85
|
+
def comparison_context(context, model)
|
|
86
|
+
base_context = safe_context(context)
|
|
87
|
+
model ? base_context.merge(model: model) : base_context
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def baseline_eval_definition(other_step, eval_name)
|
|
91
|
+
other_step.send(:all_eval_definitions)[eval_name.to_s]
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def warn_sample_response_compare(context, baseline_defn)
|
|
95
|
+
return if context[:adapter] || context[:model] || !baseline_defn.build_adapter
|
|
96
|
+
|
|
97
|
+
warn SAMPLE_RESPONSE_COMPARE_WARNING
|
|
98
|
+
end
|
|
99
|
+
|
|
60
100
|
def all_eval_definitions
|
|
61
101
|
inherited = if superclass.respond_to?(:all_eval_definitions, true)
|
|
62
102
|
superclass.send(:all_eval_definitions)
|
|
@@ -71,20 +111,24 @@ module RubyLLM
|
|
|
71
111
|
defn = all_eval_definitions[name.to_s]
|
|
72
112
|
raise ArgumentError, "No eval '#{name}' defined. Available: #{all_eval_definitions.keys}" unless defn
|
|
73
113
|
|
|
74
|
-
|
|
75
|
-
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
|
|
76
|
-
concurrency: concurrency)
|
|
114
|
+
run_eval_definition(defn, context, concurrency: concurrency)
|
|
77
115
|
end
|
|
78
116
|
|
|
79
117
|
def run_all_own_evals(context, concurrency: nil)
|
|
80
118
|
all_eval_definitions.transform_values do |defn|
|
|
81
|
-
|
|
82
|
-
effective_context = eval_context(defn, isolated_context)
|
|
83
|
-
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
|
|
84
|
-
concurrency: concurrency)
|
|
119
|
+
run_eval_definition(defn, isolate_context(context), concurrency: concurrency)
|
|
85
120
|
end
|
|
86
121
|
end
|
|
87
122
|
|
|
123
|
+
def run_eval_definition(defn, context, concurrency: nil)
|
|
124
|
+
Eval::Runner.run(
|
|
125
|
+
step: self,
|
|
126
|
+
dataset: defn.build_dataset,
|
|
127
|
+
context: eval_context(defn, context),
|
|
128
|
+
concurrency: concurrency
|
|
129
|
+
)
|
|
130
|
+
end
|
|
131
|
+
|
|
88
132
|
def eval_context(defn, context)
|
|
89
133
|
context = safe_context(context)
|
|
90
134
|
return context if context[:adapter]
|