ruby_llm-contract 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +96 -0
- data/Gemfile.lock +3 -3
- data/README.md +64 -316
- data/examples/00_basics.rb +110 -428
- data/examples/01_fallback_showcase.rb +208 -0
- data/examples/02_real_llm_minimal.rb +45 -0
- data/examples/03_summarize_with_keywords.rb +128 -0
- data/examples/04_summarize_and_translate.rb +196 -0
- data/examples/05_eval_dataset.rb +144 -0
- data/examples/06_retry_variants.rb +147 -0
- data/examples/README.md +20 -128
- data/lib/ruby_llm/contract/adapters/ruby_llm.rb +22 -1
- data/lib/ruby_llm/contract/cost_calculator.rb +39 -0
- data/lib/ruby_llm/contract/eval/model_comparison.rb +4 -4
- data/lib/ruby_llm/contract/eval/retry_optimizer.rb +7 -3
- data/lib/ruby_llm/contract/step/base.rb +18 -1
- data/lib/ruby_llm/contract/step/dsl.rb +38 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +2 -2
- data/lib/ruby_llm/contract/token_estimator.rb +20 -3
- data/lib/ruby_llm/contract/version.rb +1 -1
- data/ruby_llm-contract.gemspec +6 -5
- metadata +14 -16
- data/examples/01_classify_threads.rb +0 -220
- data/examples/02_generate_comment.rb +0 -203
- data/examples/03_target_audience.rb +0 -201
- data/examples/04_real_llm.rb +0 -410
- data/examples/05_output_schema.rb +0 -258
- data/examples/07_keyword_extraction.rb +0 -239
- data/examples/08_translation.rb +0 -353
- data/examples/09_eval_dataset.rb +0 -287
- data/examples/10_reddit_full_showcase.rb +0 -363
data/examples/08_translation.rb
DELETED
|
@@ -1,353 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
# =============================================================================
|
|
4
|
-
# EXAMPLE 8: Translation pipeline with quality checks
|
|
5
|
-
#
|
|
6
|
-
# Real-world case: translate product page segments preserving tone,
|
|
7
|
-
# length constraints, and key terms. Pipeline:
|
|
8
|
-
#
|
|
9
|
-
# 1. Extract — find translatable segments with context and max length
|
|
10
|
-
# 2. Translate — translate each segment respecting constraints
|
|
11
|
-
# 3. Review — quality-check translations (detect untranslated terms,
|
|
12
|
-
# length violations, tone mismatches)
|
|
13
|
-
#
|
|
14
|
-
# Shows:
|
|
15
|
-
# - Pipeline where each step has a fundamentally different LLM skill
|
|
16
|
-
# (analysis → creative writing → evaluation)
|
|
17
|
-
# - Cross-validation: all segment keys from step 1 must appear in step 2
|
|
18
|
-
# - 2-arity invariant: max_length from extraction enforced on translations
|
|
19
|
-
# - Content quality: detect untranslated source terms left in output
|
|
20
|
-
# - Why 3 steps can't be 1: same model evaluating its own translation
|
|
21
|
-
# has self-evaluation bias — step 3 should ideally use a different model
|
|
22
|
-
# =============================================================================
|
|
23
|
-
|
|
24
|
-
require_relative "../lib/ruby_llm/contract"
|
|
25
|
-
|
|
26
|
-
# =============================================================================
|
|
27
|
-
# STEP 1: Extract translatable segments
|
|
28
|
-
#
|
|
29
|
-
# Input: raw product page text
|
|
30
|
-
# Output: structured segments with context, importance, and max length
|
|
31
|
-
# =============================================================================
|
|
32
|
-
|
|
33
|
-
class ExtractSegments < RubyLLM::Contract::Step::Base
|
|
34
|
-
input_type RubyLLM::Contract::Types::Hash.schema(
|
|
35
|
-
page_text: RubyLLM::Contract::Types::String,
|
|
36
|
-
target_lang: RubyLLM::Contract::Types::String
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
output_schema do
|
|
40
|
-
string :source_lang
|
|
41
|
-
string :target_lang
|
|
42
|
-
array :segments, min_items: 1 do
|
|
43
|
-
string :key, description: "Unique identifier like hero_headline, cta_button"
|
|
44
|
-
string :text, description: "Original text to translate"
|
|
45
|
-
string :context, enum: %w[headline subheadline description cta legal testimonial]
|
|
46
|
-
integer :max_length, description: "Max character count for the translation"
|
|
47
|
-
string :tone, enum: %w[punchy professional casual formal technical]
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
prompt do
|
|
52
|
-
system "Extract translatable text segments from a product page."
|
|
53
|
-
rule "Assign each segment a unique key based on its role (e.g., hero_headline, cta_primary)."
|
|
54
|
-
rule "Determine context type and appropriate tone for translation."
|
|
55
|
-
rule "Set max_length based on UI constraints — headlines short, descriptions longer."
|
|
56
|
-
|
|
57
|
-
example input: "Ship faster. The deployment platform for modern teams. Try free →",
|
|
58
|
-
output: '{"source_lang":"en","target_lang":"fr","segments":[' \
|
|
59
|
-
'{"key":"hero_headline","text":"Ship faster","context":"headline","max_length":20,"tone":"punchy"},' \
|
|
60
|
-
'{"key":"hero_sub","text":"The deployment platform for modern teams","context":"subheadline","max_length":60,"tone":"professional"},' \
|
|
61
|
-
'{"key":"cta_primary","text":"Try free →","context":"cta","max_length":15,"tone":"punchy"}]}'
|
|
62
|
-
|
|
63
|
-
user "Target language: {target_lang}\n\nPage text:\n{page_text}"
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
validate("target_lang preserved") do |output, input|
|
|
67
|
-
output[:target_lang] == input[:target_lang]
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
validate("unique segment keys") do |o|
|
|
71
|
-
keys = o[:segments].map { |s| s[:key] }
|
|
72
|
-
keys.uniq.length == keys.length
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
# =============================================================================
|
|
77
|
-
# STEP 2: Translate segments
|
|
78
|
-
#
|
|
79
|
-
# Input: extracted segments with context and constraints
|
|
80
|
-
# Output: translated segments preserving keys and respecting max_length
|
|
81
|
-
# =============================================================================
|
|
82
|
-
|
|
83
|
-
class TranslateSegments < RubyLLM::Contract::Step::Base
|
|
84
|
-
input_type Hash
|
|
85
|
-
|
|
86
|
-
output_schema do
|
|
87
|
-
string :source_lang
|
|
88
|
-
string :target_lang
|
|
89
|
-
array :translations, min_items: 1 do
|
|
90
|
-
string :key
|
|
91
|
-
string :original
|
|
92
|
-
string :translated
|
|
93
|
-
string :context, enum: %w[headline subheadline description cta legal testimonial]
|
|
94
|
-
integer :max_length, description: "Carried through from extraction for downstream validation"
|
|
95
|
-
integer :original_length
|
|
96
|
-
integer :translated_length
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
prompt do
|
|
101
|
-
system "Translate product page segments to the target language."
|
|
102
|
-
rule "Preserve tone: headlines punchy, CTAs action-oriented, descriptions natural."
|
|
103
|
-
rule "Respect max_length — abbreviate naturally if needed, never truncate mid-word."
|
|
104
|
-
rule "Keep brand names, product names, and URLs untranslated."
|
|
105
|
-
rule "Carry through max_length from the input segments."
|
|
106
|
-
rule "Include original and translated length for quality tracking."
|
|
107
|
-
user "Source: {source_lang} → Target: {target_lang}\n\nSegments:\n{segments}"
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
validate("all segments translated") do |output, input|
|
|
111
|
-
output[:translations].map { |t| t[:key] }.sort ==
|
|
112
|
-
(input[:segments] || []).map { |s| s[:key] }.sort
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
validate("translations within max_length") do |output, input|
|
|
116
|
-
segments_by_key = (input[:segments] || []).to_h { |s| [s[:key], s] }
|
|
117
|
-
output[:translations].all? do |t|
|
|
118
|
-
max = segments_by_key.dig(t[:key], :max_length)
|
|
119
|
-
max.nil? || t[:translated].to_s.length <= max
|
|
120
|
-
end
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
validate("translations differ from originals") do |o|
|
|
124
|
-
o[:translations].all? { |t| t[:translated] != t[:original] }
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
retry_policy models: %w[gpt-4.1-nano gpt-4.1-mini]
|
|
128
|
-
end
|
|
129
|
-
|
|
130
|
-
# =============================================================================
|
|
131
|
-
# STEP 3: Review translation quality
|
|
132
|
-
#
|
|
133
|
-
# Input: original segments + translations
|
|
134
|
-
# Output: quality report with per-segment scores and issues
|
|
135
|
-
#
|
|
136
|
-
# This step uses a DIFFERENT LLM skill (evaluation, not generation).
|
|
137
|
-
# A model reviewing its own translations has bias — in production,
|
|
138
|
-
# you'd use a different model or temperature for this step.
|
|
139
|
-
# =============================================================================
|
|
140
|
-
|
|
141
|
-
class ReviewTranslations < RubyLLM::Contract::Step::Base
|
|
142
|
-
input_type Hash
|
|
143
|
-
|
|
144
|
-
output_schema do
|
|
145
|
-
string :target_lang
|
|
146
|
-
integer :total_segments
|
|
147
|
-
integer :passed_segments
|
|
148
|
-
array :reviews, min_items: 1 do
|
|
149
|
-
string :key
|
|
150
|
-
string :verdict, enum: %w[pass warning fail]
|
|
151
|
-
string :issue, description: "Empty if pass, description if warning/fail"
|
|
152
|
-
end
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
prompt do
|
|
156
|
-
system "Review translations for quality. You are a professional translator and editor."
|
|
157
|
-
rule "Check each translation for: accuracy, natural phrasing, tone match, length vs max_length."
|
|
158
|
-
rule "Pass: translation is accurate, natural, and within max_length."
|
|
159
|
-
rule "Warning: minor issue (slightly awkward phrasing, could be improved)."
|
|
160
|
-
rule "Fail: wrong meaning, untranslated text left in, or translated_length exceeds max_length."
|
|
161
|
-
user "Target language: {target_lang}\n\nTranslations:\n{translations}"
|
|
162
|
-
end
|
|
163
|
-
|
|
164
|
-
validate("all translations reviewed") do |output, input|
|
|
165
|
-
output[:reviews].map { |r| r[:key] }.sort ==
|
|
166
|
-
(input[:translations] || []).map { |t| t[:key] }.sort
|
|
167
|
-
end
|
|
168
|
-
|
|
169
|
-
validate("counts are consistent") do |o|
|
|
170
|
-
o[:passed_segments] == o[:reviews].count { |r| r[:verdict] == "pass" }
|
|
171
|
-
end
|
|
172
|
-
|
|
173
|
-
validate("failed reviews have issues") do |o|
|
|
174
|
-
o[:reviews].reject { |r| r[:verdict] == "pass" }.all? do |r|
|
|
175
|
-
!r[:issue].to_s.strip.empty?
|
|
176
|
-
end
|
|
177
|
-
end
|
|
178
|
-
|
|
179
|
-
validate("fail verdict for over-limit translations") do |output, input|
|
|
180
|
-
translations_by_key = (input[:translations] || []).to_h { |t| [t[:key], t] }
|
|
181
|
-
output[:reviews].all? do |r|
|
|
182
|
-
t = translations_by_key[r[:key]]
|
|
183
|
-
next true unless t && t[:max_length] && t[:translated_length]
|
|
184
|
-
next true if t[:translated_length] <= t[:max_length]
|
|
185
|
-
|
|
186
|
-
%w[warning fail].include?(r[:verdict])
|
|
187
|
-
end
|
|
188
|
-
end
|
|
189
|
-
end
|
|
190
|
-
|
|
191
|
-
# =============================================================================
|
|
192
|
-
# PIPELINE
|
|
193
|
-
# =============================================================================
|
|
194
|
-
|
|
195
|
-
class TranslationPipeline < RubyLLM::Contract::Pipeline::Base
|
|
196
|
-
step ExtractSegments, as: :extract
|
|
197
|
-
step TranslateSegments, as: :translate
|
|
198
|
-
step ReviewTranslations, as: :review
|
|
199
|
-
end
|
|
200
|
-
|
|
201
|
-
# =============================================================================
|
|
202
|
-
# TEST WITH CANNED RESPONSES
|
|
203
|
-
# =============================================================================
|
|
204
|
-
|
|
205
|
-
page_text = <<~PAGE
|
|
206
|
-
Ship faster with Acme Deploy
|
|
207
|
-
|
|
208
|
-
The deployment platform built for modern engineering teams.
|
|
209
|
-
Push to production in seconds, not hours. Zero-downtime deploys,
|
|
210
|
-
instant rollbacks, and real-time logs.
|
|
211
|
-
|
|
212
|
-
Start free — no credit card required.
|
|
213
|
-
|
|
214
|
-
"Acme Deploy cut our deployment time from 45 minutes to 30 seconds."
|
|
215
|
-
— Sarah Chen, CTO at Widgets Inc.
|
|
216
|
-
PAGE
|
|
217
|
-
|
|
218
|
-
input = { page_text: page_text, target_lang: "fr" }
|
|
219
|
-
|
|
220
|
-
extract_response = {
|
|
221
|
-
source_lang: "en", target_lang: "fr",
|
|
222
|
-
segments: [
|
|
223
|
-
{ key: "hero_headline", text: "Ship faster with Acme Deploy", context: "headline", max_length: 40, tone: "punchy" },
|
|
224
|
-
{ key: "hero_sub", text: "The deployment platform built for modern engineering teams", context: "subheadline",
|
|
225
|
-
max_length: 80, tone: "professional" },
|
|
226
|
-
{ key: "feature_1", text: "Push to production in seconds, not hours", context: "description", max_length: 60,
|
|
227
|
-
tone: "punchy" },
|
|
228
|
-
{ key: "feature_2", text: "Zero-downtime deploys, instant rollbacks, and real-time logs", context: "description",
|
|
229
|
-
max_length: 80, tone: "technical" },
|
|
230
|
-
{ key: "cta_primary", text: "Start free — no credit card required", context: "cta", max_length: 50,
|
|
231
|
-
tone: "punchy" },
|
|
232
|
-
{ key: "testimonial", text: "Acme Deploy cut our deployment time from 45 minutes to 30 seconds.",
|
|
233
|
-
context: "testimonial", max_length: 100, tone: "formal" }
|
|
234
|
-
]
|
|
235
|
-
}.to_json
|
|
236
|
-
|
|
237
|
-
translate_response = {
|
|
238
|
-
source_lang: "en", target_lang: "fr",
|
|
239
|
-
translations: [
|
|
240
|
-
{ key: "hero_headline", original: "Ship faster with Acme Deploy",
|
|
241
|
-
translated: "Déployez plus vite avec Acme Deploy", context: "headline", max_length: 40, original_length: 29, translated_length: 36 },
|
|
242
|
-
{ key: "hero_sub", original: "The deployment platform built for modern engineering teams",
|
|
243
|
-
translated: "La plateforme de déploiement pour les équipes d'ingénierie modernes", context: "subheadline", max_length: 80, original_length: 57, translated_length: 67 },
|
|
244
|
-
{ key: "feature_1", original: "Push to production in seconds, not hours",
|
|
245
|
-
translated: "En production en secondes, pas en heures", context: "description", max_length: 60, original_length: 41, translated_length: 41 },
|
|
246
|
-
{ key: "feature_2", original: "Zero-downtime deploys, instant rollbacks, and real-time logs",
|
|
247
|
-
translated: "Déploiements sans interruption, rollbacks instantanés et logs en temps réel", context: "description", max_length: 80, original_length: 60, translated_length: 75 },
|
|
248
|
-
{ key: "cta_primary", original: "Start free — no credit card required",
|
|
249
|
-
translated: "Essai gratuit — sans carte bancaire", context: "cta", max_length: 50, original_length: 36, translated_length: 36 },
|
|
250
|
-
{ key: "testimonial", original: "Acme Deploy cut our deployment time from 45 minutes to 30 seconds.",
|
|
251
|
-
translated: "Acme Deploy a réduit notre temps de déploiement de 45 minutes à 30 secondes.", context: "testimonial", max_length: 100, original_length: 66, translated_length: 76 }
|
|
252
|
-
]
|
|
253
|
-
}.to_json
|
|
254
|
-
|
|
255
|
-
review_response = {
|
|
256
|
-
target_lang: "fr", total_segments: 6, passed_segments: 5,
|
|
257
|
-
reviews: [
|
|
258
|
-
{ key: "hero_headline", verdict: "pass", issue: "" },
|
|
259
|
-
{ key: "hero_sub", verdict: "pass", issue: "" },
|
|
260
|
-
{ key: "feature_1", verdict: "pass", issue: "" },
|
|
261
|
-
{ key: "feature_2", verdict: "warning", issue: "Slightly long — consider shorter phrasing for mobile" },
|
|
262
|
-
{ key: "cta_primary", verdict: "pass", issue: "" },
|
|
263
|
-
{ key: "testimonial", verdict: "pass", issue: "" }
|
|
264
|
-
]
|
|
265
|
-
}.to_json
|
|
266
|
-
|
|
267
|
-
puts "=" * 60
|
|
268
|
-
puts "TRANSLATION PIPELINE: en → fr"
|
|
269
|
-
puts "=" * 60
|
|
270
|
-
|
|
271
|
-
# Run each step with its own adapter
|
|
272
|
-
puts "\n--- Step 1: Extract segments ---"
|
|
273
|
-
r1 = ExtractSegments.run(input, context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: extract_response) })
|
|
274
|
-
puts "Status: #{r1.status} | Segments: #{r1.parsed_output[:segments].length}"
|
|
275
|
-
r1.parsed_output[:segments].each do |s|
|
|
276
|
-
puts " #{s[:key].ljust(16)} [#{s[:context].ljust(12)}] #{s[:text][0..50]}... (max: #{s[:max_length]})"
|
|
277
|
-
end
|
|
278
|
-
|
|
279
|
-
puts "\n--- Step 2: Translate ---"
|
|
280
|
-
r2 = TranslateSegments.run(r1.parsed_output,
|
|
281
|
-
context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: translate_response) })
|
|
282
|
-
puts "Status: #{r2.status}"
|
|
283
|
-
r2.parsed_output[:translations].each do |t|
|
|
284
|
-
len_ok = t[:translated_length] <= 80 ? "✓" : "⚠"
|
|
285
|
-
puts " #{len_ok} #{t[:key].ljust(16)} #{t[:translated][0..60]}"
|
|
286
|
-
end
|
|
287
|
-
|
|
288
|
-
puts "\n--- Step 3: Review ---"
|
|
289
|
-
r3 = ReviewTranslations.run(r2.parsed_output,
|
|
290
|
-
context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: review_response) })
|
|
291
|
-
puts "Status: #{r3.status} | Passed: #{r3.parsed_output[:passed_segments]}/#{r3.parsed_output[:total_segments]}"
|
|
292
|
-
r3.parsed_output[:reviews].each do |r|
|
|
293
|
-
icon = { "pass" => "✓", "warning" => "⚠", "fail" => "✗" }[r[:verdict]]
|
|
294
|
-
line = " #{icon} #{r[:key]}"
|
|
295
|
-
line += " — #{r[:issue]}" unless r[:issue].to_s.empty?
|
|
296
|
-
puts line
|
|
297
|
-
end
|
|
298
|
-
|
|
299
|
-
# =============================================================================
|
|
300
|
-
# INVARIANT CATCHES
|
|
301
|
-
# =============================================================================
|
|
302
|
-
|
|
303
|
-
puts "\n\n--- Invariant catches: missing translation ---"
|
|
304
|
-
incomplete = {
|
|
305
|
-
source_lang: "en", target_lang: "fr",
|
|
306
|
-
translations: [
|
|
307
|
-
{ key: "hero_headline", original: "Ship faster", translated: "Déployez vite", context: "headline", original_length: 11, translated_length: 13 }
|
|
308
|
-
# Missing 5 other segments!
|
|
309
|
-
]
|
|
310
|
-
}.to_json
|
|
311
|
-
|
|
312
|
-
r_bad = TranslateSegments.run(r1.parsed_output,
|
|
313
|
-
context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: incomplete) })
|
|
314
|
-
puts "Status: #{r_bad.status}"
|
|
315
|
-
puts "Errors: #{r_bad.validation_errors}"
|
|
316
|
-
|
|
317
|
-
puts "\n--- Invariant catches: translation too long ---"
|
|
318
|
-
too_long = translate_response.gsub(
|
|
319
|
-
"Déployez plus vite avec Acme Deploy",
|
|
320
|
-
"Déployez beaucoup plus rapidement et efficacement avec la plateforme Acme Deploy"
|
|
321
|
-
)
|
|
322
|
-
r_long = TranslateSegments.run(r1.parsed_output,
|
|
323
|
-
context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: too_long) })
|
|
324
|
-
puts "Status: #{r_long.status}"
|
|
325
|
-
puts "Errors: #{r_long.validation_errors}"
|
|
326
|
-
|
|
327
|
-
puts "\n--- Invariant catches: untranslated (echoed back) ---"
|
|
328
|
-
echoed = translate_response.gsub("Essai gratuit — sans carte bancaire", "Start free — no credit card required")
|
|
329
|
-
r_echo = TranslateSegments.run(r1.parsed_output,
|
|
330
|
-
context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: echoed) })
|
|
331
|
-
puts "Status: #{r_echo.status}"
|
|
332
|
-
puts "Errors: #{r_echo.validation_errors}"
|
|
333
|
-
|
|
334
|
-
# =============================================================================
|
|
335
|
-
# SUMMARY
|
|
336
|
-
#
|
|
337
|
-
# 3 steps, 3 different LLM skills:
|
|
338
|
-
# 1. Extract (analysis) — find segments, assign context and constraints
|
|
339
|
-
# 2. Translate (creative) — translate respecting tone and length
|
|
340
|
-
# 3. Review (evaluation) — quality-check each translation
|
|
341
|
-
#
|
|
342
|
-
# Why 3 steps, not 1:
|
|
343
|
-
# - Each step has focused attention and its own schema
|
|
344
|
-
# - Step 3 evaluates step 2's work (shouldn't self-evaluate)
|
|
345
|
-
# - If extraction fails, no tokens wasted on translation
|
|
346
|
-
# - Each step independently testable and retryable
|
|
347
|
-
#
|
|
348
|
-
# Invariants catch:
|
|
349
|
-
# - Missing translations (not all segments covered)
|
|
350
|
-
# - Translation too long (exceeds max_length from step 1)
|
|
351
|
-
# - Untranslated text (model echoed back original)
|
|
352
|
-
# - Review inconsistency (counts don't match verdicts)
|
|
353
|
-
# =============================================================================
|
data/examples/09_eval_dataset.rb
DELETED
|
@@ -1,287 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
# =============================================================================
|
|
4
|
-
# EXAMPLE 9: Dataset-based prompt evaluation
|
|
5
|
-
#
|
|
6
|
-
# Define test cases with expected outputs, run a step against all of them,
|
|
7
|
-
# and get an aggregate quality score. Like unit tests for your prompts.
|
|
8
|
-
#
|
|
9
|
-
# Shows:
|
|
10
|
-
# - Dataset DSL with cases (input + expected)
|
|
11
|
-
# - 4 evaluator types: exact, json_includes, regex, custom proc
|
|
12
|
-
# - expected_traits for multi-property checks
|
|
13
|
-
# - Aggregate scoring (0.0–1.0)
|
|
14
|
-
# - eval_case convenience for inline testing
|
|
15
|
-
# - Eval detecting quality regression
|
|
16
|
-
# =============================================================================
|
|
17
|
-
|
|
18
|
-
require_relative "../lib/ruby_llm/contract"
|
|
19
|
-
|
|
20
|
-
# =============================================================================
|
|
21
|
-
# STEP TO EVALUATE
|
|
22
|
-
# =============================================================================
|
|
23
|
-
|
|
24
|
-
class ClassifyIntent < RubyLLM::Contract::Step::Base
|
|
25
|
-
input_type String
|
|
26
|
-
|
|
27
|
-
output_schema do
|
|
28
|
-
string :intent, enum: %w[sales support billing other]
|
|
29
|
-
number :confidence, minimum: 0.0, maximum: 1.0
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
prompt do
|
|
33
|
-
system "Classify the user's intent."
|
|
34
|
-
user "{input}"
|
|
35
|
-
end
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
# =============================================================================
|
|
39
|
-
# STEP 1: Define a dataset — your "golden set" of test cases
|
|
40
|
-
# =============================================================================
|
|
41
|
-
|
|
42
|
-
puts "=" * 60
|
|
43
|
-
puts "STEP 1: Define a dataset"
|
|
44
|
-
puts "=" * 60
|
|
45
|
-
|
|
46
|
-
dataset = RubyLLM::Contract::Eval::Dataset.define("intent_classification") do
|
|
47
|
-
# Case with exact expected output
|
|
48
|
-
add_case "billing inquiry",
|
|
49
|
-
input: "I need help with my invoice",
|
|
50
|
-
expected: { intent: "billing" }
|
|
51
|
-
|
|
52
|
-
# Case with multiple expected fields
|
|
53
|
-
add_case "sales inquiry",
|
|
54
|
-
input: "I want to upgrade my plan",
|
|
55
|
-
expected: { intent: "sales" }
|
|
56
|
-
|
|
57
|
-
# Case with expected_traits (regex, ranges)
|
|
58
|
-
add_case "support with confidence",
|
|
59
|
-
input: "My app is crashing",
|
|
60
|
-
expected_traits: { intent: "support" }
|
|
61
|
-
|
|
62
|
-
# Case with custom evaluator (proc)
|
|
63
|
-
add_case "high confidence expected",
|
|
64
|
-
input: "URGENT: billing error!!!",
|
|
65
|
-
evaluator: ->(output) { output[:confidence] >= 0.8 }
|
|
66
|
-
|
|
67
|
-
# Case with no expected — just checks contract passes
|
|
68
|
-
add_case "contract smoke test",
|
|
69
|
-
input: "random text here"
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
puts "Dataset: #{dataset.name}"
|
|
73
|
-
puts "Cases: #{dataset.cases.length}"
|
|
74
|
-
dataset.cases.each { |c| puts " - #{c.name}" }
|
|
75
|
-
|
|
76
|
-
# =============================================================================
|
|
77
|
-
# STEP 2: Run the eval — good model (all pass)
|
|
78
|
-
# =============================================================================
|
|
79
|
-
|
|
80
|
-
puts "\n\n#{"=" * 60}"
|
|
81
|
-
puts "STEP 2: Run eval — good model (all cases pass)"
|
|
82
|
-
puts "=" * 60
|
|
83
|
-
|
|
84
|
-
# Simulate a good model that returns correct intents
|
|
85
|
-
good_responses = {
|
|
86
|
-
"I need help with my invoice" => '{"intent": "billing", "confidence": 0.92}',
|
|
87
|
-
"I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}',
|
|
88
|
-
"My app is crashing" => '{"intent": "support", "confidence": 0.95}',
|
|
89
|
-
"URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.97}',
|
|
90
|
-
"random text here" => '{"intent": "other", "confidence": 0.6}'
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
# Custom adapter that returns different responses per input
|
|
94
|
-
good_adapter = Object.new
|
|
95
|
-
good_adapter.define_singleton_method(:call) do |messages:, **_opts|
|
|
96
|
-
user_msg = messages.find { |m| m[:role] == :user }
|
|
97
|
-
response = good_responses[user_msg[:content]] || '{"intent": "other", "confidence": 0.5}'
|
|
98
|
-
RubyLLM::Contract::Adapters::Response.new(content: response, usage: { input_tokens: 0, output_tokens: 0 })
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
report = RubyLLM::Contract::Eval::Runner.run(
|
|
102
|
-
step: ClassifyIntent,
|
|
103
|
-
dataset: dataset,
|
|
104
|
-
context: { adapter: good_adapter }
|
|
105
|
-
)
|
|
106
|
-
|
|
107
|
-
puts "\nScore: #{report.score.round(2)}"
|
|
108
|
-
puts "Pass rate: #{report.pass_rate}"
|
|
109
|
-
puts "All passed: #{report.passed?}"
|
|
110
|
-
puts
|
|
111
|
-
report.each do |r|
|
|
112
|
-
icon = r.passed? ? "✓" : "✗"
|
|
113
|
-
puts " #{icon} #{r.name.ljust(30)} score=#{r.score} #{r.details}"
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
# =============================================================================
|
|
117
|
-
# STEP 3: Run eval — bad model (some fail)
|
|
118
|
-
# =============================================================================
|
|
119
|
-
|
|
120
|
-
puts "\n\n#{"=" * 60}"
|
|
121
|
-
puts "STEP 3: Run eval — bad model (quality regression)"
|
|
122
|
-
puts "=" * 60
|
|
123
|
-
|
|
124
|
-
# Simulate a worse model that misclassifies some intents
|
|
125
|
-
bad_responses = {
|
|
126
|
-
"I need help with my invoice" => '{"intent": "support", "confidence": 0.7}', # WRONG: billing → support
|
|
127
|
-
"I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}', # correct
|
|
128
|
-
"My app is crashing" => '{"intent": "other", "confidence": 0.4}', # WRONG: support → other
|
|
129
|
-
"URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.55}', # low confidence
|
|
130
|
-
"random text here" => '{"intent": "other", "confidence": 0.6}' # correct
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
bad_adapter = Object.new
|
|
134
|
-
bad_adapter.define_singleton_method(:call) do |messages:, **_opts|
|
|
135
|
-
user_msg = messages.find { |m| m[:role] == :user }
|
|
136
|
-
response = bad_responses[user_msg[:content]] || '{"intent": "other", "confidence": 0.5}'
|
|
137
|
-
RubyLLM::Contract::Adapters::Response.new(content: response, usage: { input_tokens: 0, output_tokens: 0 })
|
|
138
|
-
end
|
|
139
|
-
|
|
140
|
-
bad_report = RubyLLM::Contract::Eval::Runner.run(
|
|
141
|
-
step: ClassifyIntent,
|
|
142
|
-
dataset: dataset,
|
|
143
|
-
context: { adapter: bad_adapter }
|
|
144
|
-
)
|
|
145
|
-
|
|
146
|
-
puts "\nScore: #{bad_report.score.round(2)}"
|
|
147
|
-
puts "Pass rate: #{bad_report.pass_rate}"
|
|
148
|
-
puts "All passed: #{bad_report.passed?}"
|
|
149
|
-
puts
|
|
150
|
-
bad_report.each do |r|
|
|
151
|
-
icon = r.passed? ? "✓" : "✗"
|
|
152
|
-
puts " #{icon} #{r.name.ljust(30)} score=#{r.score} #{r.details}"
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
puts "\nRegression detected:"
|
|
156
|
-
puts " Score dropped: #{report.score.round(2)} → #{bad_report.score.round(2)} " \
|
|
157
|
-
"(#{((report.score - bad_report.score) * 100).round(1)}% drop)"
|
|
158
|
-
|
|
159
|
-
# =============================================================================
|
|
160
|
-
# STEP 4: eval_case — quick inline check
|
|
161
|
-
# =============================================================================
|
|
162
|
-
|
|
163
|
-
puts "\n\n#{"=" * 60}"
|
|
164
|
-
puts "STEP 4: eval_case — inline single-case eval"
|
|
165
|
-
puts "=" * 60
|
|
166
|
-
|
|
167
|
-
# No dataset needed — just check one case
|
|
168
|
-
result = ClassifyIntent.eval_case(
|
|
169
|
-
input: "I want to cancel my subscription",
|
|
170
|
-
expected: { intent: "billing" },
|
|
171
|
-
context: { adapter: good_adapter }
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
puts "Passed: #{result[:passed]}"
|
|
175
|
-
puts "Score: #{result[:score]}"
|
|
176
|
-
puts "Output: #{result[:output]}"
|
|
177
|
-
puts "Details: #{result[:details]}"
|
|
178
|
-
|
|
179
|
-
# With expected_traits
|
|
180
|
-
result2 = ClassifyIntent.eval_case(
|
|
181
|
-
input: "URGENT: server down!!!",
|
|
182
|
-
expected_traits: { intent: "support" },
|
|
183
|
-
context: {
|
|
184
|
-
adapter: RubyLLM::Contract::Adapters::Test.new(
|
|
185
|
-
response: '{"intent": "support", "confidence": 0.99}'
|
|
186
|
-
)
|
|
187
|
-
}
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
puts "\nTraits check:"
|
|
191
|
-
puts "Passed: #{result2[:passed]}"
|
|
192
|
-
puts "Details: #{result2[:details]}"
|
|
193
|
-
|
|
194
|
-
# With custom proc evaluator
|
|
195
|
-
result3 = ClassifyIntent.eval_case(
|
|
196
|
-
input: "test",
|
|
197
|
-
evaluator: ->(output) { output[:confidence] > 0.9 },
|
|
198
|
-
context: {
|
|
199
|
-
adapter: RubyLLM::Contract::Adapters::Test.new(
|
|
200
|
-
response: '{"intent": "other", "confidence": 0.95}'
|
|
201
|
-
)
|
|
202
|
-
}
|
|
203
|
-
)
|
|
204
|
-
|
|
205
|
-
puts "\nCustom proc:"
|
|
206
|
-
puts "Passed: #{result3[:passed]} (confidence > 0.9)"
|
|
207
|
-
|
|
208
|
-
# =============================================================================
|
|
209
|
-
# STEP 5: Evaluating a pipeline
|
|
210
|
-
# =============================================================================
|
|
211
|
-
|
|
212
|
-
puts "\n\n#{"=" * 60}"
|
|
213
|
-
puts "STEP 5: Evaluate a pipeline end-to-end"
|
|
214
|
-
puts "=" * 60
|
|
215
|
-
|
|
216
|
-
class SuggestAction < RubyLLM::Contract::Step::Base
|
|
217
|
-
input_type Hash
|
|
218
|
-
|
|
219
|
-
output_schema do
|
|
220
|
-
string :action
|
|
221
|
-
string :priority, enum: %w[low medium high urgent]
|
|
222
|
-
end
|
|
223
|
-
|
|
224
|
-
prompt do
|
|
225
|
-
system "Suggest an action based on the classified intent."
|
|
226
|
-
user "Intent: {intent}, Confidence: {confidence}"
|
|
227
|
-
end
|
|
228
|
-
end
|
|
229
|
-
|
|
230
|
-
class SupportPipeline < RubyLLM::Contract::Pipeline::Base
|
|
231
|
-
step ClassifyIntent, as: :classify
|
|
232
|
-
step SuggestAction, as: :action
|
|
233
|
-
end
|
|
234
|
-
|
|
235
|
-
pipeline_dataset = RubyLLM::Contract::Eval::Dataset.define("support_pipeline") do
|
|
236
|
-
add_case "billing → action",
|
|
237
|
-
input: "I need help with my invoice",
|
|
238
|
-
expected: { priority: "medium" }
|
|
239
|
-
|
|
240
|
-
add_case "urgent → action",
|
|
241
|
-
input: "URGENT: server is down!",
|
|
242
|
-
expected: { priority: "urgent" }
|
|
243
|
-
end
|
|
244
|
-
|
|
245
|
-
pipeline_adapter = RubyLLM::Contract::Adapters::Test.new(
|
|
246
|
-
response: '{"intent": "billing", "confidence": 0.9, "action": "Review invoice", "priority": "medium"}'
|
|
247
|
-
)
|
|
248
|
-
|
|
249
|
-
pipeline_report = RubyLLM::Contract::Eval::Runner.run(
|
|
250
|
-
step: SupportPipeline,
|
|
251
|
-
dataset: pipeline_dataset,
|
|
252
|
-
context: { adapter: pipeline_adapter }
|
|
253
|
-
)
|
|
254
|
-
|
|
255
|
-
puts "\nPipeline eval:"
|
|
256
|
-
puts "Score: #{pipeline_report.score.round(2)}"
|
|
257
|
-
puts "Pass rate: #{pipeline_report.pass_rate}"
|
|
258
|
-
pipeline_report.each do |r|
|
|
259
|
-
icon = r.passed? ? "✓" : "✗"
|
|
260
|
-
puts " #{icon} #{r.name.ljust(25)} #{r.details}"
|
|
261
|
-
end
|
|
262
|
-
|
|
263
|
-
# =============================================================================
|
|
264
|
-
# SUMMARY
|
|
265
|
-
#
|
|
266
|
-
# Dataset eval answers: "Is my prompt good?"
|
|
267
|
-
#
|
|
268
|
-
# Define cases:
|
|
269
|
-
# - expected: exact output match (or json_includes for partial)
|
|
270
|
-
# - expected_traits: multi-property checks (regex, values)
|
|
271
|
-
# - evaluator: custom proc for complex logic
|
|
272
|
-
# - no expected: just check contract passes
|
|
273
|
-
#
|
|
274
|
-
# Run eval:
|
|
275
|
-
# - report.score → 0.0-1.0 aggregate
|
|
276
|
-
# - report.pass_rate → "4/5"
|
|
277
|
-
# - report.each → per-case details
|
|
278
|
-
#
|
|
279
|
-
# Quick check:
|
|
280
|
-
# - MyStep.eval_case(input: ..., expected: ...) → single result
|
|
281
|
-
#
|
|
282
|
-
# Regression detection:
|
|
283
|
-
# - Compare report.score before/after prompt change
|
|
284
|
-
# - Drop from 1.0 to 0.6 → something broke
|
|
285
|
-
#
|
|
286
|
-
# Next: GH-8 adds Regression::Baseline to automate this comparison
|
|
287
|
-
# =============================================================================
|