ruby_llm-contract 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # EXAMPLE 1: Fallback showcase — see contracts work in 30 seconds
5
+ #
6
+ # This is the "why does this gem exist" demo, runnable with zero API keys.
7
+ # Uses the Test adapter to simulate a real production failure mode:
8
+ #
9
+ # 1. gpt-5-nano/mini and o-series run with temperature=1.0 server-side.
10
+ # The SAME prompt on the SAME model returns different outputs across
11
+ # calls. That is sampling variance, not a bug — it is the published
12
+ # behaviour of these models.
13
+ # 2. One unlucky sample can flip a correct tone to an incorrect one
14
+ # ("negative" → "positive" for an outage article). Schema passes
15
+ # both; the wrong answer silently ships.
16
+ # 3. A validate block that cross-checks fields against each other turns
17
+ # a flaky output into a deterministic rejection, and retry_policy
18
+ # escalates to a stronger model for the retry.
19
+ # 4. The caller gets valid output plus a trace showing exactly what
20
+ # happened across attempts.
21
+ #
22
+ # Run:
23
+ # ruby examples/01_fallback_showcase.rb
24
+ #
25
+ # Expected output:
26
+ #
27
+ # ======================================================================
28
+ # A — Schema-only (no cross-check, no retry):
29
+ # ======================================================================
30
+ # status: :ok # schema passes — no guard
31
+ # tone shipped: "positive"
32
+ # takeaway 1: "Mesh networking hardware failed under load"
33
+ # ^^ takeaways describe a failure; tone says positive
34
+ # ^^ customer-success "critical feedback" filter misses this case
35
+ #
36
+ # ======================================================================
37
+ # B — Full contract (cross-check validate + retry_policy fallback):
38
+ # ======================================================================
39
+ # status: :ok
40
+ # final model: "gpt-5-mini"
41
+ # total attempts: 2
42
+ #
43
+ # Per-attempt trace:
44
+ # attempt 1 model=gpt-5-nano status=validation_failed
45
+ # attempt 2 model=gpt-5-mini status=ok
46
+ #
47
+ # Final parsed_output:
48
+ # tldr: "Mesh networking hardware failed under load; ..."
49
+ # takeaways: 3 items
50
+ # tone: "negative"
51
+ #
52
+ # See also: examples/06_retry_variants.rb — same-model retry, reasoning_effort
53
+ # escalation, and cross-provider fallback (Ollama → Anthropic → OpenAI).
54
+ # =============================================================================
55
+
56
+ require_relative "../lib/ruby_llm/contract"
57
+
58
+ # The article being summarized — an outage complaint. The correct tone is
59
+ # "negative" (customer success routes these to a human).
60
+ ARTICLE = <<~ARTICLE
61
+ The mesh networking hardware failed under load during the product launch.
62
+ Two features crashed, the recovery took eight hours, and three enterprise
63
+ customers threatened to churn. The post-incident review identified a
64
+ single regression in the firmware update as the root cause.
65
+ ARTICLE
66
+
67
+ # What gpt-5-nano returns on an unlucky sample (temperature=1.0 cannot be
68
+ # lowered). Every field is schema-valid. Tone disagrees with the takeaways.
69
+ VARIANCE_RESPONSE = {
70
+ tldr: "Product launch covered mesh networking hardware with three enterprise customers.",
71
+ takeaways: [
72
+ "Mesh networking hardware failed under load",
73
+ "Two features crashed and recovery took eight hours",
74
+ "Firmware regression identified as root cause"
75
+ ],
76
+ tone: "positive"
77
+ }.freeze
78
+
79
+ # What gpt-5-mini returns on retry — a consistent sample where tone matches
80
+ # the severity keywords in the takeaways.
81
+ GOOD_RESPONSE = {
82
+ tldr: "Mesh networking hardware failed under load; firmware regression was the root cause.",
83
+ takeaways: [
84
+ "Mesh networking hardware failed under load during launch",
85
+ "Two features crashed and recovery took eight hours",
86
+ "Firmware regression identified as root cause; three customers threatened churn"
87
+ ],
88
+ tone: "negative"
89
+ }.freeze
90
+
91
+ # =============================================================================
92
+ # STEP 1 — Define the contract exactly as a production Rails app would
93
+ # =============================================================================
94
+
95
# Contract step for the showcase: summarizes an article into a TL;DR, three
# to five takeaways, and a tone label, then cross-checks the tone against the
# takeaways. retry_policy lists the models to try, in order, when an attempt
# is rejected.
class SummarizeArticle < RubyLLM::Contract::Step::Base
  prompt <<~PROMPT
    Summarize this article for a UI card. Return a short TL;DR,
    3 to 5 key takeaways, and a tone label.

    {input}
  PROMPT

  output_schema do
    string :tldr
    array :takeaways, of: :string, min_items: 3, max_items: 5
    string :tone, enum: %w[neutral positive negative analytical]
  end

  validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }

  # The key cross-check: if takeaways mention severity / failure keywords,
  # tone must reflect that. This catches tone/takeaways mismatch when the
  # model's sample drifts between calls. Expand the keyword list from your
  # own production failures; this is a demo.
  #
  # The leading \b anchors each keyword to the start of a word, so inflected
  # forms still match ("failed", "crashes", "bugs") while unrelated words that
  # merely contain a keyword ("debugging", "unbroken") no longer trigger a
  # rejection and a needless escalation to a more expensive model.
  SEVERITY_PATTERN = /\b(?:fail|crash|outage|broken|bug|error|regression)/i.freeze
  validate("tone matches severity keywords") do |o, _|
    flagged = o[:takeaways].any? { |t| t.match?(SEVERITY_PATTERN) }
    # No severity keyword means any tone is acceptable; otherwise the tone
    # must be one of the labels that can describe a failure.
    !flagged || %w[negative analytical].include?(o[:tone])
  end

  retry_policy models: %w[gpt-5-nano gpt-5-mini gpt-5]
end
124
+
125
+ # =============================================================================
126
+ # PART A — SCHEMA-ONLY (no cross-check, no retry)
127
+ #
128
+ # Demonstrates what a "schema is enough" mindset gets you: the tone/takeaways
129
+ # mismatch passes every shape check and would be persisted by the caller,
130
+ # breaking the customer-success routing filter downstream.
131
+ # =============================================================================
132
+
133
# Deliberately naive variant used by Part A: it declares the same output
# shape as the full contract but has no validate rules and no retry_policy,
# so any schema-valid response is accepted as-is.
class SummarizeArticleSchemaOnly < RubyLLM::Contract::Step::Base
  prompt("Summarize: {input}")

  output_schema do
    string(:tldr)
    array(:takeaways, of: :string, min_items: 3, max_items: 5)
    string(:tone, enum: ["neutral", "positive", "negative", "analytical"])
  end
end
142
+
143
# Part A driver: feed the schema-only step the mismatched sample and print
# what would have been shipped to the caller.
divider_a = "=" * 70
puts divider_a
puts "A — Schema-only (no cross-check, no retry):"
puts divider_a

naive_adapter = RubyLLM::Contract::Adapters::Test.new(response: VARIANCE_RESPONSE)
naive_result = SummarizeArticleSchemaOnly.run(
  ARTICLE,
  context: { adapter: naive_adapter }
)

puts "status: #{naive_result.status.inspect} # schema passes — no guard"
puts "tone shipped: #{naive_result.parsed_output[:tone].inspect}"
puts "takeaway 1: #{naive_result.parsed_output[:takeaways].first.inspect}"
puts ' ^^ takeaways describe a failure; tone says positive'
puts ' ^^ customer-success "critical feedback" filter misses this case'
puts
156
+
157
+ # =============================================================================
158
+ # PART B — FULL CONTRACT: cross-check validate + retry_policy fallback
159
+ #
160
+ # The Test adapter returns:
161
+ # attempt 1 (gpt-5-nano) — tone/takeaways mismatch from variance → rejected
162
+ # attempt 2 (gpt-5-mini) — consistent sample → passes
163
+ #
164
+ # retry_policy handles the escalation automatically.
165
+ # =============================================================================
166
+
167
# Part B driver: the Test adapter replays the mismatched sample first and the
# consistent sample second; the full contract is run against that sequence
# and the outcome plus the per-attempt trace are printed.
divider_b = "=" * 70
puts divider_b
puts "B — Full contract (cross-check validate + retry_policy fallback):"
puts divider_b

canned_responses = [VARIANCE_RESPONSE, GOOD_RESPONSE]
adapter = RubyLLM::Contract::Adapters::Test.new(responses: canned_responses)
result = SummarizeArticle.run(ARTICLE, context: { adapter: adapter })

puts "status: #{result.status.inspect}"
puts "final model: #{result.trace[:model].inspect}"
puts "total attempts: #{result.trace[:attempts].size}"
puts

puts "Per-attempt trace:"
result.trace[:attempts].each do |attempt|
  # Model name is padded to a fixed width so the status column lines up.
  padded_model = attempt[:model].ljust(12)
  puts " attempt #{attempt[:attempt]} model=#{padded_model} status=#{attempt[:status]}"
end
puts

puts "Final parsed_output:"
puts " tldr: #{result.parsed_output[:tldr].inspect}"
puts " takeaways: #{result.parsed_output[:takeaways].size} items"
puts " tone: #{result.parsed_output[:tone].inspect}"
puts
190
+
191
+ # =============================================================================
192
+ # TAKEAWAYS
193
+ #
194
+ # 1. gpt-5 / o-series force temperature=1.0. Output variance is the published
195
+ # behavior of these models — not a bug to fix.
196
+ # 2. Schema cannot catch a tone/takeaways mismatch — every field is the
197
+ # right type. Only a cross-field validate can express "these fields
198
+ # must agree".
199
+ # 3. retry_policy turns that rejection into an automatic escalation. Variance
200
+ # is absorbed before the caller (or a customer-success routing filter)
201
+ # ever sees the flaky sample.
202
+ # 4. result.trace[:attempts] gives you the per-attempt record for free, so
203
+ # you can log retry rate and the cost delta from escalation.
204
+ #
205
+ # Replace the Test adapter with Adapters::RubyLLM (see Step 8 in
206
+ # examples/00_basics.rb for the one-liner) and this exact same code runs
207
+ # against a real provider or a local Ollama server.
208
+ # =============================================================================
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # EXAMPLE 2: Swap the Test adapter for a real LLM — the one-liner
5
+ #
6
+ # Take any contract step from the other examples, point ruby_llm at your
7
+ # provider, and pass Adapters::RubyLLM.new in context. The step itself does
8
+ # not change — same prompt, schema, validates, retry_policy.
9
+ #
10
+ # Requires: gem install ruby_llm; export OPENAI_API_KEY=sk-...
11
+ # (Or an Anthropic / Gemini / Mistral key, or a local Ollama server.)
12
+ #
13
+ # Run: OPENAI_API_KEY=sk-... ruby examples/02_real_llm_minimal.rb
14
+ # =============================================================================
15
+
16
+ require_relative "../lib/ruby_llm/contract"
17
+
18
+ RubyLLM.configure { |c| c.openai_api_key = ENV.fetch("OPENAI_API_KEY") }
19
+
20
# Minimal summarization contract for the real-provider example: one prompt,
# a three-field schema, and an ordered list of models for retry_policy.
class SummarizeArticle < RubyLLM::Contract::Step::Base
  prompt("Summarize for a UI card (short TL;DR, 3-5 takeaways, tone). {input}")

  output_schema do
    string(:tldr, max_length: 200)
    array(:takeaways, of: :string, min_items: 3, max_items: 5)
    string(:tone, enum: ["neutral", "positive", "negative", "analytical"])
  end

  retry_policy(models: ["gpt-5-nano", "gpt-5-mini", "gpt-5"])
end
31
+
32
# Run the step through the RubyLLM adapter and print the outcome together
# with the trace fields (model, latency, token usage, cost).
article = "Ruby 3.4 ships frozen string literals by default, YJIT speedups, parser fixes."
adapter = RubyLLM::Contract::Adapters::RubyLLM.new
result = SummarizeArticle.run(
  article,
  context: { adapter: adapter }
)

# Status is interpolated into the string, so a symbol prints without its colon.
puts "Status: #{result.status}"
# First model in retry_policy unless a fallback was needed.
puts "Final model: #{result.trace[:model]}"
puts "Latency: #{result.trace[:latency_ms]}ms"
puts "Tokens: #{result.trace[:usage]}"
puts "Cost: $#{result.trace[:cost]}"
puts "TL;DR: #{result.parsed_output[:tldr]}"
42
+
43
+ # Switch provider per call — ruby_llm resolves the provider from the model name:
44
+ # SummarizeArticle.run(article, context: { adapter: adapter, model: "claude-sonnet-4-6" })
45
+ # SummarizeArticle.run(article, context: { adapter: adapter, model: "gemma3:4b" }) # local Ollama
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # EXAMPLE 3: SummarizeArticle v2 — growing prompt with a keywords field
5
+ #
6
+ # A common evolution in a real Rails app: the UI card shipped with TL;DR,
7
+ # takeaways, and tone. Marketing now wants a "topic pills" row under the
8
+ # card — a sorted list of keywords with a confidence score so the UI can
9
+ # render stronger keywords larger.
10
+ #
11
+ # You could build a second step, but it is one more LLM call per article
12
+ # and the model already has the full context. Better: add one field to
13
+ # the existing SummarizeArticle step. The prompt grows, the schema grows,
14
+ # the validates grow — the contract keeps all three in lockstep.
15
+ #
16
+ # Run: ruby examples/03_summarize_with_keywords.rb
17
+ #
18
+ # Expected output:
19
+ #
20
+ # Status: ok
21
+ # TL;DR: Ruby 3.4 brings frozen string literals, YJIT speedups, parser fixes.
22
+ # Tone: analytical
23
+ #
24
+ # Keywords (sorted by probability):
25
+ # 0.95 ################### Ruby 3.4
26
+ # 0.9 ################## frozen string literals
27
+ # 0.85 ################# YJIT
28
+ # 0.7 ############## Rails workloads
29
+ # 0.6 ############ parser fixes
30
+ # =============================================================================
31
+
32
+ require_relative "../lib/ruby_llm/contract"
33
+
34
+ ARTICLE = <<~ARTICLE
35
+ Ruby 3.4 ships with frozen string literals on by default, measurable YJIT
36
+ speedups on Rails workloads, and tightened Warning.warn category filtering.
37
+ Parser fixes and faster keyword argument handling land alongside.
38
+ ARTICLE
39
+
40
+ GOOD_RESPONSE = {
41
+ tldr: "Ruby 3.4 brings frozen string literals, YJIT speedups, parser fixes.",
42
+ takeaways: [
43
+ "Frozen string literals are the default in Ruby 3.4",
44
+ "YJIT delivers measurable Rails speedups",
45
+ "Parser fixes and keyword argument handling improve"
46
+ ],
47
+ tone: "analytical",
48
+ keywords: [
49
+ { text: "Ruby 3.4", probability: 0.95 },
50
+ { text: "frozen string literals", probability: 0.90 },
51
+ { text: "YJIT", probability: 0.85 },
52
+ { text: "Rails workloads", probability: 0.70 },
53
+ { text: "parser fixes", probability: 0.60 }
54
+ ]
55
+ }.freeze
56
+
57
+ # =============================================================================
58
+ # SummarizeArticle v2: original three fields + keywords
59
+ # =============================================================================
60
+
61
# SummarizeArticle extended with a ranked keyword list. The schema pins the
# shape of each field; the validate rules add what a schema cannot express:
# ordering, uniqueness, and grounding of the keywords in the input article.
class SummarizeArticleWithKeywords < RubyLLM::Contract::Step::Base
  prompt <<~PROMPT
    Summarize this article for a UI card. Return a short TL;DR,
    3 to 5 key takeaways, a tone label, and a ranked list of keywords.

    For keywords: extract 3 to 8 phrases (1-3 words each) that appear in
    or directly relate to the article. Give each a relevance probability
    between 0.0 and 1.0. Sort by probability descending.

    {input}
  PROMPT

  output_schema do
    string :tldr, min_length: 20, max_length: 200
    array :takeaways, of: :string, min_items: 3, max_items: 5
    string :tone, enum: %w[neutral positive negative analytical]
    array :keywords, min_items: 3, max_items: 8 do
      object do
        string :text, description: "1-3 word keyword or phrase"
        number :probability, minimum: 0.0, maximum: 1.0
      end
    end
  end

  # Lower-cases a phrase and collapses every whitespace run (spaces, tabs,
  # line breaks) into a single space, trimming both ends. Used by both the
  # uniqueness and the grounding rule so they compare text the same way.
  NORMALIZE_PHRASE = ->(str) { str.downcase.split.join(" ") }

  validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }

  validate("keywords sorted by probability descending") do |o, _|
    # Non-increasing check on adjacent pairs; ties are allowed.
    o[:keywords].each_cons(2).all? { |a, b| a[:probability] >= b[:probability] }
  end

  validate("keywords are unique (case-insensitive)") do |o, _|
    words = o[:keywords].map { |k| NORMALIZE_PHRASE.call(k[:text]) }
    words.uniq.size == words.size
  end

  # Cross-validation: catches hallucinated keywords not in the source text.
  # "At least 70% of keywords must appear in the article (case-insensitive)."
  #
  # The article is whitespace-normalised before matching: a multi-line input
  # contains line breaks, and without normalisation a phrase that happens to
  # wrap across two lines would be counted as hallucinated.
  validate("keywords relate to the source article") do |output, input|
    text = NORMALIZE_PHRASE.call(input)
    keywords = output[:keywords]
    grounded = keywords.count { |k| text.include?(NORMALIZE_PHRASE.call(k[:text])) }
    # Integer form of "grounded / total >= 0.7"; avoids float rounding.
    grounded * 10 >= keywords.size * 7
  end
end
105
+
106
# Run the v2 step against the canned response and render each keyword with a
# bar whose length is proportional to its probability.
adapter = RubyLLM::Contract::Adapters::Test.new(response: GOOD_RESPONSE)
result = SummarizeArticleWithKeywords.run(
  ARTICLE,
  context: { adapter: adapter }
)

# Interpolating the status prints it without a leading colon ("ok").
puts "Status: #{result.status}"
puts "TL;DR: #{result.parsed_output[:tldr]}"
puts "Tone: #{result.parsed_output[:tone]}"
puts
puts "Keywords (sorted by probability):"
result.parsed_output[:keywords].each do |keyword|
  # A probability of 1.0 fills the whole 20-character bar.
  bar_length = (keyword[:probability] * 20).round
  bar = "#" * bar_length
  score_column = keyword[:probability].to_s.ljust(5)
  puts " #{score_column} #{bar.ljust(20)} #{keyword[:text]}"
end
118
+
119
+ # =============================================================================
120
+ # What this showcases
121
+ #
122
+ # - One step, growing contract: the original SummarizeArticle schema + three
123
+ # rules, extended with a fourth field and three more rules. The prompt,
124
+ # schema, and validates all grow together and stay in sync.
125
+ # - Array of objects with per-item constraints (probability 0.0-1.0).
126
+ # - Cross-validation against the input (hallucination catch).
127
+ # - Uniqueness rule that schema cannot express on its own.
128
+ # =============================================================================
@@ -0,0 +1,196 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # EXAMPLE 4: SummarizeArticle pipeline — summarize, translate, review
5
+ #
6
+ # Real scenario: the UI card ships summaries in EN, but the product just
7
+ # launched a French region. Rather than re-prompting the LLM to summarise
8
+ # in French (quality drops), split the work:
9
+ #
10
+ # 1. Summarize — SummarizeArticle in English (the case the prompt is already tuned for).
11
+ # 2. Translate — convert the English TL;DR + takeaways to French.
12
+ # 3. Review — quality check: no untranslated terms, length fits UI.
13
+ #
14
+ # Pipeline::Base threads the output of step N into step N+1 automatically,
15
+ # fails fast on any step, and aggregates the trace. Each step uses a
16
+ # different LLM skill (analysis / creative / evaluation) — a single prompt
17
+ # asking the model to do all three at once loses to this chain.
18
+ #
19
+ # Run: ruby examples/04_summarize_and_translate.rb
20
+ # =============================================================================
21
+
22
+ require_relative "../lib/ruby_llm/contract"
23
+
24
+ # =============================================================================
25
+ # Step 1 — SummarizeArticle (English, unchanged from README)
26
+ # =============================================================================
27
+
28
# First pipeline step: produces the English summary (TL;DR, takeaways, tone)
# that the translate step receives as its input.
class SummarizeArticle < RubyLLM::Contract::Step::Base
  prompt <<~TEXT
    Summarize this article for a UI card. Return a short TL;DR,
    3 to 5 key takeaways, and a tone label.

    {input}
  TEXT

  output_schema do
    string(:tldr, max_length: 200)
    array(:takeaways, of: :string, min_items: 3, max_items: 5)
    string(:tone, enum: ["neutral", "positive", "negative", "analytical"])
  end

  validate("TL;DR fits the card") do |summary, _input|
    summary[:tldr].length <= 200
  end
end
44
+
45
+ # =============================================================================
46
+ # Step 2 — Translate the English summary into the target language
47
+ # =============================================================================
48
+
49
# Second pipeline step: takes the summary hash and returns the same three
# fields translated. Two validate rules compare the output with the input:
# the tone label and the number of takeaways must both be unchanged.
class TranslateSummary < RubyLLM::Contract::Step::Base
  input_type(Hash)

  prompt do
    system("Translate a UI summary to the target language. Preserve tone label exactly.")
    rule("Return JSON with translated tldr, translated takeaways, unchanged tone.")
    rule("Keep brand names, product names, and URLs untranslated.")
    rule("TL;DR must stay under 200 characters in the target language.")
    user("Target language: fr\n\nSummary JSON:\n{tldr}\n{takeaways}\n{tone}")
  end

  output_schema do
    string(:tldr, max_length: 200)
    array(:takeaways, of: :string, min_items: 3, max_items: 5)
    string(:tone, enum: ["neutral", "positive", "negative", "analytical"])
  end

  validate("tone preserved") do |translated, source|
    translated[:tone] == source[:tone]
  end

  validate("takeaway count preserved") do |translated, source|
    source[:takeaways].size == translated[:takeaways].size
  end
end
72
+
73
+ # =============================================================================
74
+ # Step 3 — Review the translation: no untranslated terms, verdicts per takeaway
75
+ # =============================================================================
76
+
77
# Third pipeline step: reviews the translated summary and returns an overall
# verdict plus one review entry per checked takeaway. The schema documents
# `issue` as "Empty if pass", so every other verdict (warning or fail) must
# carry an explanation; the validate rule below enforces exactly that.
class ReviewTranslation < RubyLLM::Contract::Step::Base
  input_type Hash

  prompt do
    system "Review a French translation of a UI summary for quality."
    rule "Flag any English words that should have been translated (exclude proper nouns and URLs)."
    rule "Return JSON with overall_verdict (pass/warning/fail) and per-takeaway review."
    user "Translation:\n{tldr}\n{takeaways}"
  end

  output_schema do
    string :overall_verdict, enum: %w[pass warning fail]
    array :reviews, min_items: 1 do
      object do
        integer :takeaway_index, minimum: 0
        string :verdict, enum: %w[pass warning fail]
        string :issue, description: "Empty if pass"
      end
    end
  end

  # The label names the rule as it is actually applied: it covers both
  # "warning" and "fail", not only "fail". `to_s` guards against a nil issue.
  validate("non-pass verdicts include an issue description") do |o, _|
    o[:reviews]
      .reject { |r| r[:verdict] == "pass" }
      .none? { |r| r[:issue].to_s.strip.empty? }
  end
end
102
+
103
+ # =============================================================================
104
+ # Pipeline: summarise → translate → review
105
+ # =============================================================================
106
+
107
# Chains the three steps in declaration order. The `as:` aliases are the keys
# used later to read each step's output from `outputs_by_step`.
class TranslatedSummaryPipeline < RubyLLM::Contract::Pipeline::Base
  step(SummarizeArticle, as: :summarise)
  step(TranslateSummary, as: :translate)
  step(ReviewTranslation, as: :review)
end
112
+
113
+ # =============================================================================
114
+ # Demo with the Test adapter — each step gets its own canned response
115
+ # =============================================================================
116
+
117
# Canned responses for the Test adapter, one per pipeline step and listed in
# step order: summarise, translate, review.
summary_response = {
  tldr: "Ruby 3.4 ships frozen string literals, YJIT speedups, parser fixes.",
  takeaways: ["Frozen string literals default", "YJIT Rails speedups", "Parser fixes"],
  tone: "analytical"
}
translation_response = {
  tldr: "Ruby 3.4 arrive avec les littéraux de chaînes figés, des gains YJIT, des corrections d'analyseur.",
  takeaways: ["Littéraux de chaînes figés par défaut", "YJIT accélère Rails", "Corrections de l'analyseur"],
  tone: "analytical"
}
review_response = {
  overall_verdict: "pass",
  reviews: [
    { takeaway_index: 0, verdict: "pass", issue: "" },
    { takeaway_index: 1, verdict: "pass", issue: "" },
    { takeaway_index: 2, verdict: "pass", issue: "" }
  ]
}

adapter = RubyLLM::Contract::Adapters::Test.new(
  responses: [summary_response, translation_response, review_response]
)

ARTICLE = "Ruby 3.4 ships with frozen string literals on by default, measurable YJIT speedups on Rails workloads, parser fixes, and faster keyword argument handling."

result = TranslatedSummaryPipeline.run(ARTICLE, context: { adapter: adapter })

pipeline_state = result.ok? ? "ok" : "failed"
puts "Pipeline: #{pipeline_state}"
puts "Final TL;DR (FR): #{result.outputs_by_step[:translate][:tldr]}"
puts "Review verdict: #{result.outputs_by_step[:review][:overall_verdict]}"
# Falls back to a placeholder when the trace reports no total cost.
cost_label = result.trace.total_cost || "0.0 (Test adapter)"
puts "Total cost: $#{cost_label}"
140
+
141
+ # Example console output (with Test adapter):
142
+ #
143
+ # Pipeline: ok
144
+ # Final TL;DR (FR): Ruby 3.4 arrive avec les littéraux de chaînes figés, des gains YJIT, ...
145
+ # Review verdict: pass
146
+ # Total cost: $0.0 (Test adapter)
147
+
148
+ # =============================================================================
149
+ # Evaluating the whole pipeline
150
+ #
151
+ # A pipeline can run against a dataset the same way a single step does.
152
+ # The `expected:` hash matches the FINAL step's output — here the review
153
+ # verdict — so a regression anywhere along the chain shows up in one place.
154
+ # =============================================================================
155
+
156
# Register a one-case eval named "smoke" on the pipeline. The expectation is
# stated against the review verdict.
TranslatedSummaryPipeline.define_eval("smoke") do
  add_case(
    "release post",
    input: "Ruby 3.4 ships with frozen string literals, YJIT speedups, parser fixes.",
    expected: { overall_verdict: "pass" }
  )
end

# One Test adapter response per step in order (summarise → translate → review):
eval_responses = [
  {
    tldr: "Ruby 3.4 ships frozen string literals, YJIT speedups, parser fixes.",
    takeaways: ["frozen-strings", "yjit", "parser-fixes"],
    tone: "analytical"
  },
  {
    tldr: "Ruby 3.4 arrive avec les littéraux de chaînes figés, des gains YJIT, ...",
    takeaways: ["lit-figes", "yjit-fr", "parser-fr"],
    tone: "analytical"
  },
  {
    overall_verdict: "pass",
    reviews: [{ takeaway_index: 0, verdict: "pass", issue: "" }]
  }
]
eval_adapter = RubyLLM::Contract::Adapters::Test.new(responses: eval_responses)

report = TranslatedSummaryPipeline.run_eval(
  "smoke",
  context: { adapter: eval_adapter }
)
puts "\nEval score: #{report.score}"
puts "Eval pass rate: #{report.pass_rate}"
puts "Eval passed?: #{report.passed?}"
176
+
177
+ # Example console output (with Test adapter):
178
+ #
179
+ # Eval score: 1.0
180
+ # Eval pass rate: 1/1
181
+ # Eval passed?: true
182
+
183
+ # =============================================================================
184
+ # What this showcases
185
+ #
186
+ # - Pipeline::Base composes steps; data threads automatically from
187
+ # outputs_by_step[:summarise] into the translate step's inputs.
188
+ # - Different LLM skills per step (analysis / creative / evaluation) —
189
+ # one prompt asking for all three at once loses accuracy.
190
+ # - Fail-fast: if SummarizeArticle's "TL;DR fits the card" validate
191
+ # rejects, the translate and review steps never run — no downstream
192
+ # tokens wasted.
193
+ # - A pipeline has its own `define_eval` + `run_eval` pair; expectations
194
+ # match the final step's output, catching end-to-end regressions in one
195
+ # dataset instead of per-step duplicates.
196
+ # =============================================================================