ruby_llm-contract 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,353 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # =============================================================================
4
- # EXAMPLE 8: Translation pipeline with quality checks
5
- #
6
- # Real-world case: translate product page segments preserving tone,
7
- # length constraints, and key terms. Pipeline:
8
- #
9
- # 1. Extract — find translatable segments with context and max length
10
- # 2. Translate — translate each segment respecting constraints
11
- # 3. Review — quality-check translations (detect untranslated terms,
12
- # length violations, tone mismatches)
13
- #
14
- # Shows:
15
- # - Pipeline where each step has a fundamentally different LLM skill
16
- # (analysis → creative writing → evaluation)
17
- # - Cross-validation: all segment keys from step 1 must appear in step 2
18
- # - 2-arity invariant: max_length from extraction enforced on translations
19
- # - Content quality: detect untranslated source terms left in output
20
- # - Why 3 steps can't be 1: same model evaluating its own translation
21
- # has self-evaluation bias — step 3 should ideally use a different model
22
- # =============================================================================
23
-
24
- require_relative "../lib/ruby_llm/contract"
25
-
26
- # =============================================================================
27
- # STEP 1: Extract translatable segments
28
- #
29
- # Input: raw product page text
30
- # Output: structured segments with context, importance, and max length
31
- # =============================================================================
32
-
33
# Pipeline step 1: pulls translatable segments out of raw product page text.
#
# Input is a hash with :page_text and :target_lang (both strings). The output
# schema asks for the source and target language plus at least one segment;
# every segment carries a key, the original text, a context type, a
# max_length and a tone.
class ExtractSegments < RubyLLM::Contract::Step::Base
  input_type RubyLLM::Contract::Types::Hash.schema(
    page_text: RubyLLM::Contract::Types::String,
    target_lang: RubyLLM::Contract::Types::String
  )

  output_schema do
    string :source_lang
    string :target_lang
    array :segments, min_items: 1 do
      string :key, description: "Unique identifier like hero_headline, cta_button"
      string :text, description: "Original text to translate"
      string :context, enum: %w[headline subheadline description cta legal testimonial]
      integer :max_length, description: "Max character count for the translation"
      string :tone, enum: %w[punchy professional casual formal technical]
    end
  end

  prompt do
    system "Extract translatable text segments from a product page."
    rule "Assign each segment a unique key based on its role (e.g., hero_headline, cta_primary)."
    rule "Determine context type and appropriate tone for translation."
    rule "Set max_length based on UI constraints — headlines short, descriptions longer."

    # Few-shot example: one JSON object per segment, joined into the full
    # response document below.
    sample_segments = [
      '{"key":"hero_headline","text":"Ship faster","context":"headline","max_length":20,"tone":"punchy"}',
      '{"key":"hero_sub","text":"The deployment platform for modern teams","context":"subheadline","max_length":60,"tone":"professional"}',
      '{"key":"cta_primary","text":"Try free →","context":"cta","max_length":15,"tone":"punchy"}'
    ]
    example input: "Ship faster. The deployment platform for modern teams. Try free →",
            output: %({"source_lang":"en","target_lang":"fr","segments":[#{sample_segments.join(",")}]})

    user "Target language: {target_lang}\n\nPage text:\n{page_text}"
  end

  # Two-argument invariant: the output must name the same target language
  # that was requested in the input.
  validate("target_lang preserved") do |result, request|
    result[:target_lang] == request[:target_lang]
  end

  # One-argument invariant: no segment key may occur more than once.
  validate("unique segment keys") do |result|
    occurrences = result[:segments].map { |segment| segment[:key] }.tally
    occurrences.none? { |_key, count| count > 1 }
  end
end
75
-
76
- # =============================================================================
77
- # STEP 2: Translate segments
78
- #
79
- # Input: extracted segments with context and constraints
80
- # Output: translated segments preserving keys and respecting max_length
81
- # =============================================================================
82
-
83
# Pipeline step 2: translates the segments produced by the extraction step.
#
# Input is a plain Hash (the prompt interpolates source_lang, target_lang and
# segments). The output schema asks for one translation entry per segment,
# including the max_length carried through and the reported lengths.
class TranslateSegments < RubyLLM::Contract::Step::Base
  input_type Hash

  output_schema do
    string :source_lang
    string :target_lang
    array :translations, min_items: 1 do
      string :key
      string :original
      string :translated
      string :context, enum: %w[headline subheadline description cta legal testimonial]
      integer :max_length, description: "Carried through from extraction for downstream validation"
      integer :original_length
      integer :translated_length
    end
  end

  prompt do
    system "Translate product page segments to the target language."
    rule "Preserve tone: headlines punchy, CTAs action-oriented, descriptions natural."
    rule "Respect max_length — abbreviate naturally if needed, never truncate mid-word."
    rule "Keep brand names, product names, and URLs untranslated."
    rule "Carry through max_length from the input segments."
    rule "Include original and translated length for quality tracking."
    user "Source: {source_lang} → Target: {target_lang}\n\nSegments:\n{segments}"
  end

  # Cross-validation: the set of translated keys must equal the set of input
  # segment keys (compared after sorting, so order does not matter).
  validate("all segments translated") do |result, request|
    translated_keys = result[:translations].map { |entry| entry[:key] }.sort
    requested_keys = (request[:segments] || []).map { |segment| segment[:key] }.sort
    translated_keys == requested_keys
  end

  # Enforces the input segment's max_length on the actual translated text.
  # Translations whose key has no known limit are not checked.
  validate("translations within max_length") do |result, request|
    limits = (request[:segments] || []).to_h { |segment| [segment[:key], segment[:max_length]] }
    result[:translations].none? do |entry|
      limit = limits[entry[:key]]
      !limit.nil? && entry[:translated].to_s.length > limit
    end
  end

  # Catches a model that simply echoes the source text back.
  validate("translations differ from originals") do |result|
    result[:translations].none? { |entry| entry[:translated] == entry[:original] }
  end

  retry_policy models: %w[gpt-4.1-nano gpt-4.1-mini]
end
129
-
130
- # =============================================================================
131
- # STEP 3: Review translation quality
132
- #
133
- # Input: original segments + translations
134
- # Output: quality report with per-segment scores and issues
135
- #
136
- # This step uses a DIFFERENT LLM skill (evaluation, not generation).
137
- # A model reviewing its own translations has bias — in production,
138
- # you'd use a different model or temperature for this step.
139
- # =============================================================================
140
-
141
# Pipeline step 3: reviews the translations produced by the translate step.
#
# Input is a plain Hash (the prompt interpolates target_lang and
# translations). The output is a report with a total, a pass count and one
# review (key, verdict, issue) per translation.
class ReviewTranslations < RubyLLM::Contract::Step::Base
  input_type Hash

  output_schema do
    string :target_lang
    integer :total_segments
    integer :passed_segments
    array :reviews, min_items: 1 do
      string :key
      string :verdict, enum: %w[pass warning fail]
      string :issue, description: "Empty if pass, description if warning/fail"
    end
  end

  prompt do
    system "Review translations for quality. You are a professional translator and editor."
    rule "Check each translation for: accuracy, natural phrasing, tone match, length vs max_length."
    rule "Pass: translation is accurate, natural, and within max_length."
    rule "Warning: minor issue (slightly awkward phrasing, could be improved)."
    rule "Fail: wrong meaning, untranslated text left in, or translated_length exceeds max_length."
    user "Target language: {target_lang}\n\nTranslations:\n{translations}"
  end

  # Every input translation must have a review, and no extra keys may appear
  # (compared after sorting, so order does not matter).
  validate("all translations reviewed") do |output, input|
    output[:reviews].map { |r| r[:key] }.sort ==
      (input[:translations] || []).map { |t| t[:key] }.sort
  end

  # Both summary counters must agree with the review list itself:
  # total_segments with the number of reviews, passed_segments with the number
  # of "pass" verdicts.
  validate("counts are consistent") do |o|
    o[:total_segments] == o[:reviews].length &&
      o[:passed_segments] == o[:reviews].count { |r| r[:verdict] == "pass" }
  end

  # Any verdict other than "pass" needs a non-blank issue description.
  validate("failed reviews have issues") do |o|
    o[:reviews].reject { |r| r[:verdict] == "pass" }.all? do |r|
      !r[:issue].to_s.strip.empty?
    end
  end

  # A translation whose reported translated_length exceeds its max_length must
  # be reviewed as "fail". This matches the validation's name and the "Fail"
  # rule in the prompt; a mere "warning" is not enough. Reviews whose
  # translation is unknown or lacks either length field are not checked.
  validate("fail verdict for over-limit translations") do |output, input|
    translations_by_key = (input[:translations] || []).to_h { |t| [t[:key], t] }
    output[:reviews].all? do |r|
      t = translations_by_key[r[:key]]
      next true unless t && t[:max_length] && t[:translated_length]
      next true if t[:translated_length] <= t[:max_length]

      r[:verdict] == "fail"
    end
  end
end
190
-
191
- # =============================================================================
192
- # PIPELINE
193
- # =============================================================================
194
-
195
# Chains the three steps in order — extract, translate, review — registering
# each under its alias.
class TranslationPipeline < RubyLLM::Contract::Pipeline::Base
  {
    extract: ExtractSegments,
    translate: TranslateSegments,
    review: ReviewTranslations
  }.each do |step_alias, step_class|
    step step_class, as: step_alias
  end
end
200
-
201
- # =============================================================================
202
- # TEST WITH CANNED RESPONSES
203
- # =============================================================================
204
-
205
# Sample product page used as input for the extraction step.
page_text = <<~PAGE
  Ship faster with Acme Deploy

  The deployment platform built for modern engineering teams.
  Push to production in seconds, not hours. Zero-downtime deploys,
  instant rollbacks, and real-time logs.

  Start free — no credit card required.

  "Acme Deploy cut our deployment time from 45 minutes to 30 seconds."
  — Sarah Chen, CTO at Widgets Inc.
PAGE

input = { page_text: page_text, target_lang: "fr" }

# Canned JSON response for the extraction step.
extract_response = {
  source_lang: "en", target_lang: "fr",
  segments: [
    { key: "hero_headline", text: "Ship faster with Acme Deploy", context: "headline", max_length: 40, tone: "punchy" },
    { key: "hero_sub", text: "The deployment platform built for modern engineering teams", context: "subheadline",
      max_length: 80, tone: "professional" },
    { key: "feature_1", text: "Push to production in seconds, not hours", context: "description", max_length: 60,
      tone: "punchy" },
    { key: "feature_2", text: "Zero-downtime deploys, instant rollbacks, and real-time logs", context: "description",
      max_length: 80, tone: "technical" },
    { key: "cta_primary", text: "Start free — no credit card required", context: "cta", max_length: 50,
      tone: "punchy" },
    { key: "testimonial", text: "Acme Deploy cut our deployment time from 45 minutes to 30 seconds.",
      context: "testimonial", max_length: 100, tone: "formal" }
  ]
}.to_json

# Canned translations. original_length and translated_length are derived from
# the strings themselves so the reported lengths can never drift from the text
# (hand-typed values are easy to get off by one).
translations = [
  { key: "hero_headline", original: "Ship faster with Acme Deploy",
    translated: "Déployez plus vite avec Acme Deploy", context: "headline", max_length: 40 },
  { key: "hero_sub", original: "The deployment platform built for modern engineering teams",
    translated: "La plateforme de déploiement pour les équipes d'ingénierie modernes", context: "subheadline",
    max_length: 80 },
  { key: "feature_1", original: "Push to production in seconds, not hours",
    translated: "En production en secondes, pas en heures", context: "description", max_length: 60 },
  { key: "feature_2", original: "Zero-downtime deploys, instant rollbacks, and real-time logs",
    translated: "Déploiements sans interruption, rollbacks instantanés et logs en temps réel",
    context: "description", max_length: 80 },
  { key: "cta_primary", original: "Start free — no credit card required",
    translated: "Essai gratuit — sans carte bancaire", context: "cta", max_length: 50 },
  { key: "testimonial", original: "Acme Deploy cut our deployment time from 45 minutes to 30 seconds.",
    translated: "Acme Deploy a réduit notre temps de déploiement de 45 minutes à 30 secondes.",
    context: "testimonial", max_length: 100 }
].map do |entry|
  entry.merge(original_length: entry[:original].length, translated_length: entry[:translated].length)
end

# Canned JSON response for the translation step.
translate_response = { source_lang: "en", target_lang: "fr", translations: translations }.to_json

# Canned JSON response for the review step: five passes and one warning.
review_response = {
  target_lang: "fr", total_segments: 6, passed_segments: 5,
  reviews: [
    { key: "hero_headline", verdict: "pass", issue: "" },
    { key: "hero_sub", verdict: "pass", issue: "" },
    { key: "feature_1", verdict: "pass", issue: "" },
    { key: "feature_2", verdict: "warning", issue: "Slightly long — consider shorter phrasing for mobile" },
    { key: "cta_primary", verdict: "pass", issue: "" },
    { key: "testimonial", verdict: "pass", issue: "" }
  ]
}.to_json
266
-
267
puts "=" * 60
puts "TRANSLATION PIPELINE: en → fr"
puts "=" * 60

# Each step is run on its own, with a Test adapter that returns that step's
# canned response.
puts "\n--- Step 1: Extract segments ---"
r1 = ExtractSegments.run(input, context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: extract_response) })
puts "Status: #{r1.status} | Segments: #{r1.parsed_output[:segments].length}"
r1.parsed_output[:segments].each do |s|
  # Show at most 51 characters; only add an ellipsis when text was actually cut.
  preview = s[:text].length > 51 ? "#{s[:text][0..50]}..." : s[:text]
  puts " #{s[:key].ljust(16)} [#{s[:context].ljust(12)}] #{preview} (max: #{s[:max_length]})"
end

puts "\n--- Step 2: Translate ---"
r2 = TranslateSegments.run(r1.parsed_output,
                           context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: translate_response) })
puts "Status: #{r2.status}"
r2.parsed_output[:translations].each do |t|
  # Compare against this segment's own limit rather than one fixed number, so
  # the marker is meaningful for short-limit segments such as headlines.
  len_ok = t[:translated_length] <= t[:max_length] ? "✓" : "⚠"
  puts " #{len_ok} #{t[:key].ljust(16)} #{t[:translated][0..60]}"
end

puts "\n--- Step 3: Review ---"
r3 = ReviewTranslations.run(r2.parsed_output,
                            context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: review_response) })
puts "Status: #{r3.status} | Passed: #{r3.parsed_output[:passed_segments]}/#{r3.parsed_output[:total_segments]}"
r3.parsed_output[:reviews].each do |r|
  icon = { "pass" => "✓", "warning" => "⚠", "fail" => "✗" }[r[:verdict]]
  line = " #{icon} #{r[:key]}"
  # Only reviews with a non-empty issue get the trailing explanation.
  line += " — #{r[:issue]}" unless r[:issue].to_s.empty?
  puts line
end
298
-
299
- # =============================================================================
300
- # INVARIANT CATCHES
301
- # =============================================================================
302
-
303
# Runs the translate step against the extracted segments with a canned
# response, then prints the resulting status and validation errors.
run_translate_with = lambda do |canned_response|
  result = TranslateSegments.run(
    r1.parsed_output,
    context: { adapter: RubyLLM::Contract::Adapters::Test.new(response: canned_response) }
  )
  puts "Status: #{result.status}"
  puts "Errors: #{result.validation_errors}"
  result
end

puts "\n\n--- Invariant catches: missing translation ---"
# Only one of the six segments is translated. The entry carries every field
# the step's output schema declares (including max_length), so the response is
# otherwise well-formed and the only thing wrong with it is the missing
# segments. NOTE(review): presumably a missing field would be reported as a
# schema error first — confirm against the library.
incomplete = {
  source_lang: "en", target_lang: "fr",
  translations: [
    { key: "hero_headline", original: "Ship faster", translated: "Déployez vite", context: "headline",
      max_length: 40, original_length: 11, translated_length: 13 }
    # Missing 5 other segments!
  ]
}.to_json

r_bad = run_translate_with.call(incomplete)

puts "\n--- Invariant catches: translation too long ---"
# Swap the headline translation for one far longer than its max_length of 40.
too_long = translate_response.gsub(
  "Déployez plus vite avec Acme Deploy",
  "Déployez beaucoup plus rapidement et efficacement avec la plateforme Acme Deploy"
)
r_long = run_translate_with.call(too_long)

puts "\n--- Invariant catches: untranslated (echoed back) ---"
# Swap the CTA translation for its own source text, as if the model echoed it.
echoed = translate_response.gsub("Essai gratuit — sans carte bancaire", "Start free — no credit card required")
r_echo = run_translate_with.call(echoed)
333
-
334
- # =============================================================================
335
- # SUMMARY
336
- #
337
- # 3 steps, 3 different LLM skills:
338
- # 1. Extract (analysis) — find segments, assign context and constraints
339
- # 2. Translate (creative) — translate respecting tone and length
340
- # 3. Review (evaluation) — quality-check each translation
341
- #
342
- # Why 3 steps, not 1:
343
- # - Each step has focused attention and its own schema
344
- # - Step 3 evaluates step 2's work (shouldn't self-evaluate)
345
- # - If extraction fails, no tokens wasted on translation
346
- # - Each step independently testable and retryable
347
- #
348
- # Invariants catch:
349
- # - Missing translations (not all segments covered)
350
- # - Translation too long (exceeds max_length from step 1)
351
- # - Untranslated text (model echoed back original)
352
- # - Review inconsistency (counts don't match verdicts)
353
- # =============================================================================
@@ -1,287 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # =============================================================================
4
- # EXAMPLE 9: Dataset-based prompt evaluation
5
- #
6
- # Define test cases with expected outputs, run a step against all of them,
7
- # and get an aggregate quality score. Like unit tests for your prompts.
8
- #
9
- # Shows:
10
- # - Dataset DSL with cases (input + expected)
11
- # - 4 evaluator types: exact, json_includes, regex, custom proc
12
- # - expected_traits for multi-property checks
13
- # - Aggregate scoring (0.0–1.0)
14
- # - eval_case convenience for inline testing
15
- # - Eval detecting quality regression
16
- # =============================================================================
17
-
18
- require_relative "../lib/ruby_llm/contract"
19
-
20
- # =============================================================================
21
- # STEP TO EVALUATE
22
- # =============================================================================
23
-
24
# Step under evaluation: takes a plain String and returns an intent (one of
# four labels) together with a confidence between 0.0 and 1.0.
class ClassifyIntent < RubyLLM::Contract::Step::Base
  input_type String

  output_schema do
    string :intent, enum: ["sales", "support", "billing", "other"]
    number :confidence, minimum: 0.0, maximum: 1.0
  end

  prompt do
    system "Classify the user's intent."
    user "{input}"
  end
end
37
-
38
- # =============================================================================
39
- # STEP 1: Define a dataset — your "golden set" of test cases
40
- # =============================================================================
41
-
42
puts "=" * 60
puts "STEP 1: Define a dataset"
puts "=" * 60

dataset = RubyLLM::Contract::Eval::Dataset.define("intent_classification") do
  # `expected:` — the output is compared against the given intent.
  add_case "billing inquiry",
           input: "I need help with my invoice",
           expected: { intent: "billing" }

  # Same evaluator, different intent.
  add_case "sales inquiry",
           input: "I want to upgrade my plan",
           expected: { intent: "sales" }

  # `expected_traits:` — here a single trait, the intent value.
  add_case "support with confidence",
           input: "My app is crashing",
           expected_traits: { intent: "support" }

  # `evaluator:` — a custom lambda; passes when confidence is at least 0.8.
  add_case "high confidence expected",
           input: "URGENT: billing error!!!",
           evaluator: lambda { |output| output[:confidence] >= 0.8 }

  # Neither expected nor evaluator — only the step's contract is exercised.
  add_case "contract smoke test",
           input: "random text here"
end

puts "Dataset: #{dataset.name}"
puts "Cases: #{dataset.cases.length}"
dataset.cases.each do |eval_case|
  puts " - #{eval_case.name}"
end
75
-
76
- # =============================================================================
77
- # STEP 2: Run the eval — good model (all pass)
78
- # =============================================================================
79
-
80
puts "\n\n#{"=" * 60}"
puts "STEP 2: Run eval — good model (all cases pass)"
puts "=" * 60

# Builds a stand-in adapter whose #call looks up the :user message content in
# `responses` and returns the matching canned JSON. Inputs without an entry
# get a low-confidence "other" answer.
build_lookup_adapter = lambda do |responses|
  adapter = Object.new
  adapter.define_singleton_method(:call) do |messages:, **_opts|
    user_msg = messages.find { |m| m[:role] == :user }
    response = responses[user_msg[:content]] || '{"intent": "other", "confidence": 0.5}'
    RubyLLM::Contract::Adapters::Response.new(content: response, usage: { input_tokens: 0, output_tokens: 0 })
  end
  adapter
end

# Prints the aggregate numbers of an eval report followed by one line per case.
print_report = lambda do |eval_report|
  puts "\nScore: #{eval_report.score.round(2)}"
  puts "Pass rate: #{eval_report.pass_rate}"
  puts "All passed: #{eval_report.passed?}"
  puts
  eval_report.each do |r|
    icon = r.passed? ? "✓" : "✗"
    puts " #{icon} #{r.name.ljust(30)} score=#{r.score} #{r.details}"
  end
end

# Simulate a good model that returns correct intents
good_responses = {
  "I need help with my invoice" => '{"intent": "billing", "confidence": 0.92}',
  "I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}',
  "My app is crashing" => '{"intent": "support", "confidence": 0.95}',
  "URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.97}',
  "random text here" => '{"intent": "other", "confidence": 0.6}'
}

good_adapter = build_lookup_adapter.call(good_responses)

report = RubyLLM::Contract::Eval::Runner.run(
  step: ClassifyIntent,
  dataset: dataset,
  context: { adapter: good_adapter }
)

print_report.call(report)

# =============================================================================
# STEP 3: Run eval — bad model (some fail)
# =============================================================================

puts "\n\n#{"=" * 60}"
puts "STEP 3: Run eval — bad model (quality regression)"
puts "=" * 60

# Simulate a worse model that misclassifies some intents
bad_responses = {
  "I need help with my invoice" => '{"intent": "support", "confidence": 0.7}', # WRONG: billing → support
  "I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}', # correct
  "My app is crashing" => '{"intent": "other", "confidence": 0.4}', # WRONG: support → other
  "URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.55}', # low confidence
  "random text here" => '{"intent": "other", "confidence": 0.6}' # correct
}

bad_adapter = build_lookup_adapter.call(bad_responses)

bad_report = RubyLLM::Contract::Eval::Runner.run(
  step: ClassifyIntent,
  dataset: dataset,
  context: { adapter: bad_adapter }
)

print_report.call(bad_report)

# The figure is the absolute difference between the two scores, scaled by 100,
# so it is labelled as percentage points rather than as a relative "% drop".
score_drop_points = ((report.score - bad_report.score) * 100).round(1)
puts "\nRegression detected:"
puts " Score dropped: #{report.score.round(2)} → #{bad_report.score.round(2)} " \
     "(#{score_drop_points} percentage points)"
158
-
159
- # =============================================================================
160
- # STEP 4: eval_case — quick inline check
161
- # =============================================================================
162
-
163
puts "\n\n#{"=" * 60}"
puts "STEP 4: eval_case — inline single-case eval"
puts "=" * 60

# Single-case check without a dataset.
# NOTE(review): good_responses has no entry for this input, so good_adapter
# answers with its fallback ("other") — confirm whether a failing result is
# the intended demonstration here.
result = ClassifyIntent.eval_case(
  input: "I want to cancel my subscription",
  expected: { intent: "billing" },
  context: { adapter: good_adapter }
)

{ "Passed" => :passed, "Score" => :score, "Output" => :output, "Details" => :details }.each do |label, field|
  puts "#{label}: #{result[field]}"
end

# Same helper, checked through expected_traits against a fixed canned answer.
support_adapter = RubyLLM::Contract::Adapters::Test.new(
  response: '{"intent": "support", "confidence": 0.99}'
)
result2 = ClassifyIntent.eval_case(
  input: "URGENT: server down!!!",
  expected_traits: { intent: "support" },
  context: { adapter: support_adapter }
)

puts "\nTraits check:"
puts "Passed: #{result2[:passed]}"
puts "Details: #{result2[:details]}"

# Same helper, checked through a custom lambda on the confidence value.
confident_adapter = RubyLLM::Contract::Adapters::Test.new(
  response: '{"intent": "other", "confidence": 0.95}'
)
result3 = ClassifyIntent.eval_case(
  input: "test",
  evaluator: lambda { |output| output[:confidence] > 0.9 },
  context: { adapter: confident_adapter }
)

puts "\nCustom proc:"
puts "Passed: #{result3[:passed]} (confidence > 0.9)"

# =============================================================================
# STEP 5: Evaluating a pipeline
# =============================================================================

puts "\n\n#{"=" * 60}"
puts "STEP 5: Evaluate a pipeline end-to-end"
puts "=" * 60
215
-
216
# Second step of the support pipeline: takes a Hash (the prompt interpolates
# intent and confidence) and returns an action with one of four priorities.
class SuggestAction < RubyLLM::Contract::Step::Base
  input_type Hash

  output_schema do
    string :action
    string :priority, enum: ["low", "medium", "high", "urgent"]
  end

  prompt do
    system "Suggest an action based on the classified intent."
    user "Intent: {intent}, Confidence: {confidence}"
  end
end
229
-
230
# Two-step pipeline: classify the intent, then suggest an action for it.
class SupportPipeline < RubyLLM::Contract::Pipeline::Base
  { classify: ClassifyIntent, action: SuggestAction }.each do |step_alias, step_class|
    step step_class, as: step_alias
  end
end
234
-
235
# Two end-to-end cases; each one only checks the priority of the final output.
pipeline_dataset = RubyLLM::Contract::Eval::Dataset.define("support_pipeline") do
  add_case "billing → action",
           input: "I need help with my invoice",
           expected: { priority: "medium" }

  add_case "urgent → action",
           input: "URGENT: server is down!",
           expected: { priority: "urgent" }
end

# One canned JSON document carrying the fields of both steps' schemas; the
# same adapter instance is handed to the runner for the whole pipeline.
canned_pipeline_json = '{"intent": "billing", "confidence": 0.9, "action": "Review invoice", "priority": "medium"}'
pipeline_adapter = RubyLLM::Contract::Adapters::Test.new(response: canned_pipeline_json)

pipeline_report = RubyLLM::Contract::Eval::Runner.run(
  step: SupportPipeline,
  dataset: pipeline_dataset,
  context: { adapter: pipeline_adapter }
)

puts "\nPipeline eval:"
puts "Score: #{pipeline_report.score.round(2)}"
puts "Pass rate: #{pipeline_report.pass_rate}"
pipeline_report.each do |case_result|
  marker = case_result.passed? ? "✓" : "✗"
  puts " #{marker} #{case_result.name.ljust(25)} #{case_result.details}"
end
262
-
263
- # =============================================================================
264
- # SUMMARY
265
- #
266
- # Dataset eval answers: "Is my prompt good?"
267
- #
268
- # Define cases:
269
- # - expected: exact output match (or json_includes for partial)
270
- # - expected_traits: multi-property checks (regex, values)
271
- # - evaluator: custom proc for complex logic
272
- # - no expected: just check contract passes
273
- #
274
- # Run eval:
275
- # - report.score → 0.0-1.0 aggregate
276
- # - report.pass_rate → "4/5"
277
- # - report.each → per-case details
278
- #
279
- # Quick check:
280
- # - MyStep.eval_case(input: ..., expected: ...) → single result
281
- #
282
- # Regression detection:
283
- # - Compare report.score before/after prompt change
284
- # - Drop from 1.0 to 0.6 → something broke
285
- #
286
- # Next: GH-8 adds Regression::Baseline to automate this comparison
287
- # =============================================================================