RubyGems - ruby_llm-contract - Versions diffs - 0.7.1 → 0.8.0 - Mend

ruby_llm-contract 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +96 -0
data/Gemfile.lock +3 -3
data/README.md +64 -316
data/examples/00_basics.rb +110 -428
data/examples/01_fallback_showcase.rb +208 -0
data/examples/02_real_llm_minimal.rb +45 -0
data/examples/03_summarize_with_keywords.rb +128 -0
data/examples/04_summarize_and_translate.rb +196 -0
data/examples/05_eval_dataset.rb +144 -0
data/examples/06_retry_variants.rb +147 -0
data/examples/README.md +20 -128
data/lib/ruby_llm/contract/adapters/ruby_llm.rb +22 -1
data/lib/ruby_llm/contract/cost_calculator.rb +39 -0
data/lib/ruby_llm/contract/eval/model_comparison.rb +4 -4
data/lib/ruby_llm/contract/eval/retry_optimizer.rb +7 -3
data/lib/ruby_llm/contract/step/base.rb +18 -1
data/lib/ruby_llm/contract/step/dsl.rb +38 -0
data/lib/ruby_llm/contract/step/limit_checker.rb +2 -2
data/lib/ruby_llm/contract/token_estimator.rb +20 -3
data/lib/ruby_llm/contract/version.rb +1 -1
data/ruby_llm-contract.gemspec +6 -5
metadata +14 -16
data/examples/01_classify_threads.rb +0 -220
data/examples/02_generate_comment.rb +0 -203
data/examples/03_target_audience.rb +0 -201
data/examples/04_real_llm.rb +0 -410
data/examples/05_output_schema.rb +0 -258
data/examples/07_keyword_extraction.rb +0 -239
data/examples/08_translation.rb +0 -353
data/examples/09_eval_dataset.rb +0 -287
data/examples/10_reddit_full_showcase.rb +0 -363

data/examples/01_fallback_showcase.rb ADDED Viewed

@@ -0,0 +1,208 @@
+# frozen_string_literal: true
+# =============================================================================
+# EXAMPLE 1: Fallback showcase — see contracts work in 30 seconds
+#
+# This is the "why does this gem exist" demo, runnable with zero API keys.
+# Uses the Test adapter to simulate a real production failure mode:
+#
+#   1. gpt-5-nano/mini and o-series run with temperature=1.0 server-side.
+#      The SAME prompt on the SAME model returns different outputs across
+#      calls. That is sampling variance, not a bug — it is the published
+#      behaviour of these models.
+#   2. One unlucky sample can flip a correct tone to an incorrect one
+#      ("negative" → "positive" for an outage article). Schema passes
+#      both; the wrong answer silently ships.
+#   3. A validate block that cross-checks fields against each other turns
+#      a flaky output into a deterministic rejection, and retry_policy
+#      escalates to a stronger model for the retry.
+#   4. The caller gets valid output plus a trace showing exactly what
+#      happened across attempts.
+#
+# Run:
+#   ruby examples/01_fallback_showcase.rb
+#
+# Expected output:
+#
+#   ======================================================================
+#   A — Schema-only (no cross-check, no retry):
+#   ======================================================================
+#   status:        :ok            # schema passes — no guard
+#   tone shipped:  "positive"
+#   takeaway 1:    "Mesh networking hardware failed under load"
+#                  ^^ takeaways describe a failure; tone says positive
+#                  ^^ customer-success "critical feedback" filter misses this case
+#
+#   ======================================================================
+#   B — Full contract (cross-check validate + retry_policy fallback):
+#   ======================================================================
+#   status:             :ok
+#   final model:        "gpt-5-mini"
+#   total attempts:     2
+#
+#   Per-attempt trace:
+#     attempt 1  model=gpt-5-nano   status=validation_failed
+#     attempt 2  model=gpt-5-mini   status=ok
+#
+#   Final parsed_output:
+#     tldr:       "Mesh networking hardware failed under load; ..."
+#     takeaways:  3 items
+#     tone:       "negative"
+#
+# See also: examples/06_retry_variants.rb — same-model retry, reasoning_effort
+# escalation, and cross-provider fallback (Ollama → Anthropic → OpenAI).
+# =============================================================================
+require_relative "../lib/ruby_llm/contract"
+# The article being summarized — an outage complaint. The correct tone is
+# "negative" (customer success routes these to a human).
+ARTICLE = <<~ARTICLE
+  The mesh networking hardware failed under load during the product launch.
+  Two features crashed, the recovery took eight hours, and three enterprise
+  customers threatened to churn. The post-incident review identified a
+  single regression in the firmware update as the root cause.
+ARTICLE
+# What gpt-5-nano returns on an unlucky sample (temperature=1.0 cannot be
+# lowered). Every field is schema-valid, but the tone disagrees with the takeaways.
+VARIANCE_RESPONSE = {
+  tldr: "Product launch covered mesh networking hardware with three enterprise customers.",
+  takeaways: [
+    "Mesh networking hardware failed under load", # "fail" matches SEVERITY_PATTERN
+    "Two features crashed and recovery took eight hours", # "crash" matches as well
+    "Firmware regression identified as root cause" # "regression" matches as well
+  ],
+  tone: "positive" # the mismatch: neither "negative" nor "analytical"
+}.freeze
+# What gpt-5-mini returns on retry — a consistent sample where the tone matches
+# the severity keywords in the takeaways.
+GOOD_RESPONSE = {
+  tldr: "Mesh networking hardware failed under load; firmware regression was the root cause.",
+  takeaways: [
+    "Mesh networking hardware failed under load during launch",
+    "Two features crashed and recovery took eight hours",
+    "Firmware regression identified as root cause; three customers threatened churn"
+  ],
+  tone: "negative" # agrees with the failure-focused takeaways
+}.freeze
+# =============================================================================
+# STEP 1 — Define the contract exactly as a production Rails app would
+# =============================================================================
+class SummarizeArticle < RubyLLM::Contract::Step::Base
+  prompt <<~PROMPT
+    Summarize this article for a UI card. Return a short TL;DR,
+    3 to 5 key takeaways, and a tone label.
+    {input}
+  PROMPT
+  output_schema do
+    string :tldr
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical]
+  end
+  validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
+  # The key cross-check: if takeaways mention severity / failure keywords,
+  # tone must reflect that. This catches tone/takeaways mismatch when the
+  # model's sample drifts between calls. Expand the keyword list from your
+  # own production failures; this is a demo.
+  SEVERITY_PATTERN = /fail|crash|outage|broken|bug|error|regression/i.freeze
+  validate("tone matches severity keywords") do |o, _|
+    flagged = o[:takeaways].any? { |t| t.match?(SEVERITY_PATTERN) }
+    next true unless flagged
+    %w[negative analytical].include?(o[:tone])
+  end
+  retry_policy models: %w[gpt-5-nano gpt-5-mini gpt-5]
+end
+# =============================================================================
+# PART A — SCHEMA-ONLY (no cross-check, no retry)
+#
+# Demonstrates what a "schema is enough" mindset gets you: the tone/takeaways
+# mismatch passes every shape check and would be persisted by the caller,
+# breaking the customer-success routing filter downstream.
+# =============================================================================
+class SummarizeArticleSchemaOnly < RubyLLM::Contract::Step::Base # shape checks only: declares no validate and no retry_policy
+  prompt "Summarize: {input}"
+  output_schema do
+    string :tldr
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical] # nothing here ties the tone to the takeaways
+  end
+end
+puts "=" * 70
+puts "A — Schema-only (no cross-check, no retry):"
+puts "=" * 70
+naive_adapter = RubyLLM::Contract::Adapters::Test.new(response: VARIANCE_RESPONSE)
+naive_result = SummarizeArticleSchemaOnly.run(ARTICLE, context: { adapter: naive_adapter })
+puts "status:        #{naive_result.status.inspect}            # schema passes — no guard"
+puts "tone shipped:  #{naive_result.parsed_output[:tone].inspect}"
+puts "takeaway 1:    #{naive_result.parsed_output[:takeaways].first.inspect}"
+puts "               ^^ takeaways describe a failure; tone says positive"
+puts "               ^^ customer-success \"critical feedback\" filter misses this case"
+puts
+# =============================================================================
+# PART B — FULL CONTRACT: cross-check validate + retry_policy fallback
+#
+# The Test adapter returns:
+#   attempt 1 (gpt-5-nano) — tone/takeaways mismatch from variance → rejected
+#   attempt 2 (gpt-5-mini) — consistent sample             → passes
+#
+# retry_policy handles the escalation automatically.
+# =============================================================================
+puts "=" * 70
+puts "B — Full contract (cross-check validate + retry_policy fallback):"
+puts "=" * 70
+adapter = RubyLLM::Contract::Adapters::Test.new(responses: [VARIANCE_RESPONSE, GOOD_RESPONSE])
+result = SummarizeArticle.run(ARTICLE, context: { adapter: adapter })
+puts "status:             #{result.status.inspect}"
+puts "final model:        #{result.trace[:model].inspect}"
+puts "total attempts:     #{result.trace[:attempts].size}"
+puts
+puts "Per-attempt trace:"
+result.trace[:attempts].each do |a|
+  puts "  attempt #{a[:attempt]}  model=#{a[:model].ljust(12)} status=#{a[:status]}"
+end
+puts
+puts "Final parsed_output:"
+puts "  tldr:       #{result.parsed_output[:tldr].inspect}"
+puts "  takeaways:  #{result.parsed_output[:takeaways].size} items"
+puts "  tone:       #{result.parsed_output[:tone].inspect}"
+puts
+# =============================================================================
+# TAKEAWAYS
+#
+# 1. gpt-5 / o-series force temperature=1.0. Output variance is the published
+#    behavior of these models — not a bug to fix.
+# 2. Schema cannot catch a tone/takeaways mismatch — every field is the
+#    right type. Only a cross-field validate can express "these fields
+#    must agree".
+# 3. retry_policy turns that rejection into an automatic escalation. Variance
+#    is absorbed before the caller (or a customer-success routing filter)
+#    ever sees the flaky sample.
+# 4. result.trace[:attempts] gives you the per-attempt record for free, so
+#    you can log retry rate and the cost delta from escalation.
+#
+# Replace the Test adapter with Adapters::RubyLLM (see Step 8 in
+# examples/00_basics.rb for the one-liner) and this exact same code runs
+# against a real provider or a local Ollama server.
+# =============================================================================

data/examples/02_real_llm_minimal.rb ADDED Viewed

@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+# =============================================================================
+# EXAMPLE 2: Swap the Test adapter for a real LLM — the one-liner
+#
+# Take any contract step from the other examples, point ruby_llm at your
+# provider, and pass Adapters::RubyLLM.new in context. The step itself does
+# not change — same prompt, schema, validates, retry_policy.
+#
+# Requires: gem install ruby_llm; export OPENAI_API_KEY=sk-...
+# (Or an Anthropic / Gemini / Mistral key, or a local Ollama server.)
+#
+# Run: OPENAI_API_KEY=sk-... ruby examples/02_real_llm_minimal.rb
+# =============================================================================
+require_relative "../lib/ruby_llm/contract"
+RubyLLM.configure { |c| c.openai_api_key = ENV.fetch("OPENAI_API_KEY") }
+class SummarizeArticle < RubyLLM::Contract::Step::Base
+  prompt "Summarize for a UI card (short TL;DR, 3-5 takeaways, tone). {input}"
+  output_schema do
+    string :tldr, max_length: 200
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical]
+  end
+  retry_policy models: %w[gpt-5-nano gpt-5-mini gpt-5]
+end
+article = "Ruby 3.4 ships frozen string literals by default, YJIT speedups, parser fixes."
+adapter = RubyLLM::Contract::Adapters::RubyLLM.new
+result  = SummarizeArticle.run(article, context: { adapter: adapter })
+puts "Status:      #{result.status}"                 # => ok
+puts "Final model: #{result.trace[:model]}"          # => "gpt-5-nano" (or mini/gpt-5 after fallback)
+puts "Latency:     #{result.trace[:latency_ms]}ms"   # real network time
+puts "Tokens:      #{result.trace[:usage]}"          # real usage
+puts "Cost:        $#{result.trace[:cost]}"          # sum across retries
+puts "TL;DR:       #{result.parsed_output[:tldr]}"
+# Switch provider per call — ruby_llm resolves the provider from the model name:
+#   SummarizeArticle.run(article, context: { adapter: adapter, model: "claude-sonnet-4-6" })
+#   SummarizeArticle.run(article, context: { adapter: adapter, model: "gemma3:4b" })  # local Ollama

data/examples/03_summarize_with_keywords.rb ADDED Viewed

@@ -0,0 +1,128 @@
+# frozen_string_literal: true
+# =============================================================================
+# EXAMPLE 3: SummarizeArticle v2 — growing prompt with a keywords field
+#
+# A common evolution in a real Rails app: the UI card shipped with TL;DR,
+# takeaways, and tone. Marketing now wants a "topic pills" row under the
+# card — a sorted list of keywords with a confidence score so the UI can
+# render stronger keywords larger.
+#
+# You could build a second step, but it is one more LLM call per article
+# and the model already has the full context. Better: add one field to
+# the existing SummarizeArticle step. The prompt grows, the schema grows,
+# the validates grow — the contract keeps all three in lockstep.
+#
+# Run: ruby examples/03_summarize_with_keywords.rb
+#
+# Expected output:
+#
+#   Status:    ok
+#   TL;DR:     Ruby 3.4 brings frozen string literals, YJIT speedups, parser fixes.
+#   Tone:      analytical
+#
+#   Keywords (sorted by probability):
+#     0.95  ###################  Ruby 3.4
+#     0.9   ##################   frozen string literals
+#     0.85  #################    YJIT
+#     0.7   ##############       Rails workloads
+#     0.6   ############         parser fixes
+# =============================================================================
+require_relative "../lib/ruby_llm/contract"
+ARTICLE = <<~ARTICLE # source text; the grounding validate searches a downcased copy of it
+  Ruby 3.4 ships with frozen string literals on by default, measurable YJIT
+  speedups on Rails workloads, and tightened Warning.warn category filtering.
+  Parser fixes and faster keyword argument handling land alongside.
+ARTICLE
+GOOD_RESPONSE = {
+  tldr: "Ruby 3.4 brings frozen string literals, YJIT speedups, parser fixes.",
+  takeaways: [
+    "Frozen string literals are the default in Ruby 3.4",
+    "YJIT delivers measurable Rails speedups",
+    "Parser fixes and keyword argument handling improve"
+  ],
+  tone: "analytical",
+  keywords: [ # listed in descending probability, as the sort validate requires
+    { text: "Ruby 3.4",              probability: 0.95 },
+    { text: "frozen string literals", probability: 0.90 }, # prints as 0.9 in the demo output
+    { text: "YJIT",                   probability: 0.85 },
+    { text: "Rails workloads",        probability: 0.70 }, # prints as 0.7
+    { text: "parser fixes",           probability: 0.60 } # prints as 0.6; the article spells it "Parser fixes"
+  ]
+}.freeze
+# =============================================================================
+# SummarizeArticle v2: original three fields + keywords
+# =============================================================================
+class SummarizeArticleWithKeywords < RubyLLM::Contract::Step::Base
+  prompt <<~PROMPT
+    Summarize this article for a UI card. Return a short TL;DR,
+    3 to 5 key takeaways, a tone label, and a ranked list of keywords.
+    For keywords: extract 3 to 8 phrases (1-3 words each) that appear in
+    or directly relate to the article. Give each a relevance probability
+    between 0.0 and 1.0. Sort by probability descending.
+    {input}
+  PROMPT
+  output_schema do
+    string :tldr, min_length: 20, max_length: 200
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical]
+    array :keywords, min_items: 3, max_items: 8 do
+      object do
+        string :text, description: "1-3 word keyword or phrase"
+        number :probability, minimum: 0.0, maximum: 1.0
+      end
+    end
+  end
+  validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
+  validate("keywords sorted by probability descending") do |o, _|
+    probs = o[:keywords].map { |k| k[:probability] }
+    probs == probs.sort.reverse
+  end
+  validate("keywords are unique (case-insensitive)") do |o, _|
+    words = o[:keywords].map { |k| k[:text].downcase.strip }
+    words.uniq.size == words.size
+  end
+  # Cross-validation: catches hallucinated keywords not in the source text.
+  # "At least 70% of keywords must appear in the article (case-insensitive)."
+  validate("keywords relate to the source article") do |output, input|
+    text = input.downcase
+    grounded = output[:keywords].count { |k| text.include?(k[:text].downcase) }
+    grounded >= (output[:keywords].size * 0.7).ceil
+  end
+end
+adapter = RubyLLM::Contract::Adapters::Test.new(response: GOOD_RESPONSE)
+result = SummarizeArticleWithKeywords.run(ARTICLE, context: { adapter: adapter })
+puts "Status:    #{result.status}"                      # => ok (interpolation prints the status without a leading colon)
+puts "TL;DR:     #{result.parsed_output[:tldr]}"
+puts "Tone:      #{result.parsed_output[:tone]}"
+puts
+puts "Keywords (sorted by probability):"
+result.parsed_output[:keywords].each do |k|
+  bar = "#" * (k[:probability] * 20).round # one "#" per 0.05 of probability; 20 at the schema maximum of 1.0
+  puts "  #{k[:probability].to_s.ljust(5)} #{bar.ljust(20)} #{k[:text]}" # padded so the three columns line up
+end
+# =============================================================================
+# What this showcases
+#
+# - One step, growing contract: the original SummarizeArticle schema + three
+#   rules, extended with a fourth field and three more rules. The prompt,
+#   schema, and validates all grow together and stay in sync.
+# - Array of objects with per-item constraints (probability 0.0-1.0).
+# - Cross-validation against the input (hallucination catch).
+# - Uniqueness rule that schema cannot express on its own.
+# =============================================================================

data/examples/04_summarize_and_translate.rb ADDED Viewed

@@ -0,0 +1,196 @@
+# frozen_string_literal: true
+# =============================================================================
+# EXAMPLE 4: SummarizeArticle pipeline — summarize, translate, review
+#
+# Real scenario: the UI card ships summaries in EN, but the product just
+# launched a French region. Rather than re-prompting the LLM to summarise
+# in French (quality drops), split the work:
+#
+#   1. Summarize — SummarizeArticle in English (the case the prompt is already tuned for).
+#   2. Translate — convert the English TL;DR + takeaways to French.
+#   3. Review    — quality check: no untranslated terms, length fits UI.
+#
+# Pipeline::Base threads the output of step N into step N+1 automatically,
+# fails fast on any step, and aggregates the trace. Each step uses a
+# different LLM skill (analysis / creative / evaluation) — a single prompt
+# asking the model to do all three at once loses to this chain.
+#
+# Run: ruby examples/04_summarize_and_translate.rb
+# =============================================================================
+require_relative "../lib/ruby_llm/contract"
+# =============================================================================
+# Step 1 — SummarizeArticle (English, unchanged from README)
+# =============================================================================
+class SummarizeArticle < RubyLLM::Contract::Step::Base # first pipeline step; registered below as :summarise
+  prompt <<~PROMPT
+    Summarize this article for a UI card. Return a short TL;DR,
+    3 to 5 key takeaways, and a tone label.
+    {input}
+  PROMPT
+  output_schema do
+    string :tldr, max_length: 200
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical]
+  end
+  validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 } # same 200-character limit as the schema's max_length
+end
+# =============================================================================
+# Step 2 — Translate the English summary into the target language
+# =============================================================================
+class TranslateSummary < RubyLLM::Contract::Step::Base
+  input_type Hash
+  prompt do
+    system "Translate a UI summary to the target language. Preserve tone label exactly."
+    rule   "Return JSON with translated tldr, translated takeaways, unchanged tone."
+    rule   "Keep brand names, product names, and URLs untranslated."
+    rule   "TL;DR must stay under 200 characters in the target language."
+    user   "Target language: fr\n\nSummary JSON:\n{tldr}\n{takeaways}\n{tone}"
+  end
+  output_schema do
+    string :tldr, max_length: 200
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical]
+  end
+  validate("tone preserved") { |o, input| o[:tone] == input[:tone] }
+  validate("takeaway count preserved") do |output, input|
+    output[:takeaways].size == input[:takeaways].size
+  end
+end
+# =============================================================================
+# Step 3 — Review the translation: no untranslated terms, verdicts per takeaway
+# =============================================================================
+class ReviewTranslation < RubyLLM::Contract::Step::Base
+  input_type Hash
+  prompt do
+    system "Review a French translation of a UI summary for quality."
+    rule   "Flag any English words that should have been translated (exclude proper nouns and URLs)."
+    rule   "Return JSON with overall_verdict (pass/warning/fail) and per-takeaway review."
+    user   "Translation:\n{tldr}\n{takeaways}"
+  end
+  output_schema do
+    string :overall_verdict, enum: %w[pass warning fail]
+    array  :reviews, min_items: 1 do
+      object do
+        integer :takeaway_index, minimum: 0
+        string  :verdict, enum: %w[pass warning fail]
+        string  :issue, description: "Empty if pass"
+      end
+    end
+  end
+  validate("fail verdicts include an issue description") do |o, _|
+    o[:reviews].reject { |r| r[:verdict] == "pass" }.all? { |r| !r[:issue].to_s.strip.empty? }
+  end
+end
+# =============================================================================
+# Pipeline: summarise → translate → review
+# =============================================================================
+class TranslatedSummaryPipeline < RubyLLM::Contract::Pipeline::Base # steps are declared in summarise → translate → review order
+  step SummarizeArticle,   as: :summarise
+  step TranslateSummary,   as: :translate # the demo reads this step's output via outputs_by_step[:translate]
+  step ReviewTranslation,  as: :review # the demo reads this step's output via outputs_by_step[:review]
+end
+# =============================================================================
+# Demo with the Test adapter — each step gets its own canned response
+# =============================================================================
+adapter = RubyLLM::Contract::Adapters::Test.new(responses: [ # one canned response per step
+  { tldr: "Ruby 3.4 ships frozen string literals, YJIT speedups, parser fixes.", # 1st: summarise output
+    takeaways: ["Frozen string literals default", "YJIT Rails speedups", "Parser fixes"],
+    tone: "analytical" },
+  { tldr: "Ruby 3.4 arrive avec les littéraux de chaînes figés, des gains YJIT, des corrections d'analyseur.", # 2nd: translate output
+    takeaways: ["Littéraux de chaînes figés par défaut", "YJIT accélère Rails", "Corrections de l'analyseur"],
+    tone: "analytical" }, # same tone as the summary, as "tone preserved" requires
+  { overall_verdict: "pass", # 3rd: review output
+    reviews: [
+      { takeaway_index: 0, verdict: "pass", issue: "" },
+      { takeaway_index: 1, verdict: "pass", issue: "" },
+      { takeaway_index: 2, verdict: "pass", issue: "" }
+    ] }
+])
+ARTICLE = "Ruby 3.4 ships with frozen string literals on by default, measurable YJIT speedups on Rails workloads, parser fixes, and faster keyword argument handling."
+result = TranslatedSummaryPipeline.run(ARTICLE, context: { adapter: adapter })
+puts "Pipeline: #{result.ok? ? "ok" : "failed"}"                      # => Pipeline: ok
+puts "Final TL;DR (FR):  #{result.outputs_by_step[:translate][:tldr]}" # => "Ruby 3.4 arrive avec ..."
+puts "Review verdict:    #{result.outputs_by_step[:review][:overall_verdict]}" # => pass
+puts "Total cost:        $#{result.trace.total_cost || '0.0 (Test adapter)'}"  # => real cost under Adapters::RubyLLM; the fallback text prints when total_cost is nil
+# Example console output (with Test adapter):
+#
+#   Pipeline: ok
+#   Final TL;DR (FR):  Ruby 3.4 arrive avec les littéraux de chaînes figés, des gains YJIT, ...
+#   Review verdict:    pass
+#   Total cost:        $0.0 (Test adapter)
+# =============================================================================
+# Evaluating the whole pipeline
+#
+# A pipeline can run against a dataset the same way a single step does.
+# The `expected:` hash matches the FINAL step's output — here the review
+# verdict — so a regression anywhere along the chain shows up in one place.
+# =============================================================================
+TranslatedSummaryPipeline.define_eval("smoke") do # registers the dataset that run_eval("smoke") looks up below
+  add_case "release post",
+           input: "Ruby 3.4 ships with frozen string literals, YJIT speedups, parser fixes.",
+           expected: { overall_verdict: "pass" } # compared against the review step's output
+end
+# One Test adapter response per step, in order (summarise → translate → review):
+eval_adapter = RubyLLM::Contract::Adapters::Test.new(responses: [
+  { tldr: "Ruby 3.4 ships frozen string literals, YJIT speedups, parser fixes.",
+    takeaways: %w[frozen-strings yjit parser-fixes], tone: "analytical" },
+  { tldr: "Ruby 3.4 arrive avec les littéraux de chaînes figés, des gains YJIT, ...",
+    takeaways: %w[lit-figes yjit-fr parser-fr], tone: "analytical" }, # three takeaways and the same tone, matching the summary
+  { overall_verdict: "pass",
+    reviews: [{ takeaway_index: 0, verdict: "pass", issue: "" }] } # a single review satisfies the schema's min_items: 1
+])
+report = TranslatedSummaryPipeline.run_eval("smoke", context: { adapter: eval_adapter })
+puts "\nEval score:      #{report.score}"           # => 1.0
+puts "Eval pass rate:  #{report.pass_rate}"         # => 1/1
+puts "Eval passed?:    #{report.passed?}"           # => true
+# Example console output (with Test adapter):
+#
+#   Eval score:      1.0
+#   Eval pass rate:  1/1
+#   Eval passed?:    true
+# =============================================================================
+# What this showcases
+#
+# - Pipeline::Base composes steps; data threads automatically from
+#   outputs_by_step[:summarise] into the translate step's inputs.
+# - Different LLM skills per step (analysis / creative / evaluation) —
+#   one prompt asking for all three at once loses accuracy.
+# - Fail-fast: if SummarizeArticle's "TL;DR fits the card" validate
+#   rejects, the translate and review steps never run — no downstream
+#   tokens wasted.
+# - A pipeline has its own `define_eval` + `run_eval` pair; expectations
+#   match the final step's output, catching end-to-end regressions in one
+#   dataset instead of per-step duplicates.
+# =============================================================================