ruby_llm-contract 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +55 -0
- data/CHANGELOG.md +76 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +176 -0
- data/LICENSE +21 -0
- data/README.md +154 -0
- data/Rakefile +8 -0
- data/examples/00_basics.rb +500 -0
- data/examples/01_classify_threads.rb +220 -0
- data/examples/02_generate_comment.rb +203 -0
- data/examples/03_target_audience.rb +201 -0
- data/examples/04_real_llm.rb +410 -0
- data/examples/05_output_schema.rb +258 -0
- data/examples/07_keyword_extraction.rb +239 -0
- data/examples/08_translation.rb +353 -0
- data/examples/09_eval_dataset.rb +287 -0
- data/examples/10_reddit_full_showcase.rb +363 -0
- data/examples/README.md +140 -0
- data/lib/ruby_llm/contract/adapters/base.rb +13 -0
- data/lib/ruby_llm/contract/adapters/response.rb +17 -0
- data/lib/ruby_llm/contract/adapters/ruby_llm.rb +94 -0
- data/lib/ruby_llm/contract/adapters/test.rb +44 -0
- data/lib/ruby_llm/contract/adapters.rb +6 -0
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +17 -0
- data/lib/ruby_llm/contract/concerns/eval_host.rb +109 -0
- data/lib/ruby_llm/contract/concerns/trace_equality.rb +15 -0
- data/lib/ruby_llm/contract/concerns/usage_aggregator.rb +43 -0
- data/lib/ruby_llm/contract/configuration.rb +21 -0
- data/lib/ruby_llm/contract/contract/definition.rb +39 -0
- data/lib/ruby_llm/contract/contract/invariant.rb +23 -0
- data/lib/ruby_llm/contract/contract/parser.rb +143 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +239 -0
- data/lib/ruby_llm/contract/contract/validator.rb +104 -0
- data/lib/ruby_llm/contract/contract.rb +7 -0
- data/lib/ruby_llm/contract/cost_calculator.rb +38 -0
- data/lib/ruby_llm/contract/dsl.rb +13 -0
- data/lib/ruby_llm/contract/errors.rb +19 -0
- data/lib/ruby_llm/contract/eval/case_result.rb +76 -0
- data/lib/ruby_llm/contract/eval/contract_detail_builder.rb +47 -0
- data/lib/ruby_llm/contract/eval/dataset.rb +53 -0
- data/lib/ruby_llm/contract/eval/eval_definition.rb +112 -0
- data/lib/ruby_llm/contract/eval/evaluation_result.rb +27 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +20 -0
- data/lib/ruby_llm/contract/eval/evaluator/json_includes.rb +58 -0
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +40 -0
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +27 -0
- data/lib/ruby_llm/contract/eval/model_comparison.rb +80 -0
- data/lib/ruby_llm/contract/eval/pipeline_result_adapter.rb +15 -0
- data/lib/ruby_llm/contract/eval/report.rb +115 -0
- data/lib/ruby_llm/contract/eval/runner.rb +162 -0
- data/lib/ruby_llm/contract/eval/trait_evaluator.rb +75 -0
- data/lib/ruby_llm/contract/eval.rb +16 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +62 -0
- data/lib/ruby_llm/contract/pipeline/result.rb +131 -0
- data/lib/ruby_llm/contract/pipeline/runner.rb +139 -0
- data/lib/ruby_llm/contract/pipeline/trace.rb +72 -0
- data/lib/ruby_llm/contract/pipeline.rb +6 -0
- data/lib/ruby_llm/contract/prompt/ast.rb +38 -0
- data/lib/ruby_llm/contract/prompt/builder.rb +47 -0
- data/lib/ruby_llm/contract/prompt/node.rb +25 -0
- data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +27 -0
- data/lib/ruby_llm/contract/prompt/nodes/rule_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/section_node.rb +26 -0
- data/lib/ruby_llm/contract/prompt/nodes/system_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/user_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes.rb +7 -0
- data/lib/ruby_llm/contract/prompt/renderer.rb +76 -0
- data/lib/ruby_llm/contract/railtie.rb +20 -0
- data/lib/ruby_llm/contract/rake_task.rb +78 -0
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +96 -0
- data/lib/ruby_llm/contract/rspec/satisfy_contract.rb +31 -0
- data/lib/ruby_llm/contract/rspec.rb +6 -0
- data/lib/ruby_llm/contract/step/base.rb +138 -0
- data/lib/ruby_llm/contract/step/dsl.rb +144 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +64 -0
- data/lib/ruby_llm/contract/step/result.rb +38 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +90 -0
- data/lib/ruby_llm/contract/step/retry_policy.rb +76 -0
- data/lib/ruby_llm/contract/step/runner.rb +126 -0
- data/lib/ruby_llm/contract/step/trace.rb +70 -0
- data/lib/ruby_llm/contract/step.rb +10 -0
- data/lib/ruby_llm/contract/token_estimator.rb +19 -0
- data/lib/ruby_llm/contract/types.rb +11 -0
- data/lib/ruby_llm/contract/version.rb +7 -0
- data/lib/ruby_llm/contract.rb +108 -0
- data/ruby_llm-contract.gemspec +33 -0
- metadata +172 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# EXAMPLE 9: Dataset-based prompt evaluation
|
|
5
|
+
#
|
|
6
|
+
# Define test cases with expected outputs, run a step against all of them,
|
|
7
|
+
# and get an aggregate quality score. Like unit tests for your prompts.
|
|
8
|
+
#
|
|
9
|
+
# Shows:
|
|
10
|
+
# - Dataset DSL with cases (input + expected)
|
|
11
|
+
# - 4 evaluator types: exact, json_includes, regex, custom proc
|
|
12
|
+
# - expected_traits for multi-property checks
|
|
13
|
+
# - Aggregate scoring (0.0–1.0)
|
|
14
|
+
# - eval_case convenience for inline testing
|
|
15
|
+
# - Eval detecting quality regression
|
|
16
|
+
# =============================================================================
|
|
17
|
+
|
|
18
|
+
require_relative "../lib/ruby_llm/contract"
|
|
19
|
+
|
|
20
|
+
# =============================================================================
|
|
21
|
+
# STEP TO EVALUATE
|
|
22
|
+
# =============================================================================
|
|
23
|
+
|
|
24
|
+
# Step under evaluation: classifies a free-text user message into one of
# four intent categories, with a model-reported confidence score.
class ClassifyIntent < RubyLLM::Contract::Step::Base
  # Raw message text is the step input.
  input_type String

  # Output contract: enum-restricted intent plus a bounded confidence value.
  output_schema do
    string :intent, enum: %w[sales support billing other]
    number :confidence, minimum: 0.0, maximum: 1.0
  end

  prompt do
    system "Classify the user's intent."
    user "{input}"
  end
end
|
|
37
|
+
|
|
38
|
+
# =============================================================================
|
|
39
|
+
# STEP 1: Define a dataset — your "golden set" of test cases
|
|
40
|
+
# =============================================================================
|
|
41
|
+
|
|
42
|
+
# STEP 1: build the "golden set" of cases the step will be scored against.
banner = "=" * 60
puts banner
puts "STEP 1: Define a dataset"
puts banner

dataset = RubyLLM::Contract::Eval::Dataset.define("intent_classification") do
  # Exact-match expectation on the structured output.
  add_case "billing inquiry",
           input: "I need help with my invoice",
           expected: { intent: "billing" }

  # Another exact-match case, different intent.
  add_case "sales inquiry",
           input: "I want to upgrade my plan",
           expected: { intent: "sales" }

  # Trait-based expectation (supports regexes, ranges, plain values).
  add_case "support with confidence",
           input: "My app is crashing",
           expected_traits: { intent: "support" }

  # Fully custom pass/fail logic via a proc evaluator.
  add_case "high confidence expected",
           input: "URGENT: billing error!!!",
           evaluator: ->(output) { output[:confidence] >= 0.8 }

  # No expectation at all — passes as long as the contract holds.
  add_case "contract smoke test",
           input: "random text here"
end

puts "Dataset: #{dataset.name}"
puts "Cases: #{dataset.cases.length}"
dataset.cases.each do |tc|
  puts " - #{tc.name}"
end
|
|
75
|
+
|
|
76
|
+
# =============================================================================
|
|
77
|
+
# STEP 2: Run the eval — good model (all pass)
|
|
78
|
+
# =============================================================================
|
|
79
|
+
|
|
80
|
+
puts "\n\n#{"=" * 60}"
puts "STEP 2: Run eval — good model (all cases pass)"
puts "=" * 60

# Canned responses simulating a well-behaved model: every dataset input
# maps to the classification the case expects.
good_responses = {
  "I need help with my invoice" => '{"intent": "billing", "confidence": 0.92}',
  "I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}',
  "My app is crashing" => '{"intent": "support", "confidence": 0.95}',
  "URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.97}',
  "random text here" => '{"intent": "other", "confidence": 0.6}'
}

# Minimal duck-typed adapter: finds the user message, answers with the
# matching canned JSON, falls back to a neutral classification otherwise.
good_adapter = Object.new
good_adapter.define_singleton_method(:call) do |messages:, **_opts|
  prompt_msg = messages.find { |m| m[:role] == :user }
  payload = good_responses[prompt_msg[:content]] || '{"intent": "other", "confidence": 0.5}'
  RubyLLM::Contract::Adapters::Response.new(content: payload, usage: { input_tokens: 0, output_tokens: 0 })
end

report = RubyLLM::Contract::Eval::Runner.run(
  step: ClassifyIntent,
  dataset: dataset,
  context: { adapter: good_adapter }
)

puts "\nScore: #{report.score.round(2)}"
puts "Pass rate: #{report.pass_rate}"
puts "All passed: #{report.passed?}"
puts
report.each do |case_result|
  mark = case_result.passed? ? "✓" : "✗"
  puts " #{mark} #{case_result.name.ljust(30)} score=#{case_result.score} #{case_result.details}"
end
|
|
115
|
+
|
|
116
|
+
# =============================================================================
|
|
117
|
+
# STEP 3: Run eval — bad model (some fail)
|
|
118
|
+
# =============================================================================
|
|
119
|
+
|
|
120
|
+
puts "\n\n#{"=" * 60}"
puts "STEP 3: Run eval — bad model (quality regression)"
puts "=" * 60

# Canned responses simulating a degraded model: two misclassifications
# plus one low-confidence answer, against the same dataset.
bad_responses = {
  "I need help with my invoice" => '{"intent": "support", "confidence": 0.7}', # WRONG: billing → support
  "I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}', # correct
  "My app is crashing" => '{"intent": "other", "confidence": 0.4}', # WRONG: support → other
  "URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.55}', # low confidence
  "random text here" => '{"intent": "other", "confidence": 0.6}' # correct
}

# Same duck-typed adapter shape as the good model, different lookup table.
bad_adapter = Object.new
bad_adapter.define_singleton_method(:call) do |messages:, **_opts|
  prompt_msg = messages.find { |m| m[:role] == :user }
  payload = bad_responses[prompt_msg[:content]] || '{"intent": "other", "confidence": 0.5}'
  RubyLLM::Contract::Adapters::Response.new(content: payload, usage: { input_tokens: 0, output_tokens: 0 })
end

bad_report = RubyLLM::Contract::Eval::Runner.run(
  step: ClassifyIntent,
  dataset: dataset,
  context: { adapter: bad_adapter }
)

puts "\nScore: #{bad_report.score.round(2)}"
puts "Pass rate: #{bad_report.pass_rate}"
puts "All passed: #{bad_report.passed?}"
puts
bad_report.each do |case_result|
  mark = case_result.passed? ? "✓" : "✗"
  puts " #{mark} #{case_result.name.ljust(30)} score=#{case_result.score} #{case_result.details}"
end

# Comparing scores across runs is the regression signal.
puts "\nRegression detected:"
puts " Score dropped: #{report.score.round(2)} → #{bad_report.score.round(2)} " \
     "(#{((report.score - bad_report.score) * 100).round(1)}% drop)"
|
|
158
|
+
|
|
159
|
+
# =============================================================================
|
|
160
|
+
# STEP 4: eval_case — quick inline check
|
|
161
|
+
# =============================================================================
|
|
162
|
+
|
|
163
|
+
puts "\n\n#{"=" * 60}"
puts "STEP 4: eval_case — inline single-case eval"
puts "=" * 60

# No dataset needed — check a single case inline against the good adapter.
exact_result = ClassifyIntent.eval_case(
  input: "I want to cancel my subscription",
  expected: { intent: "billing" },
  context: { adapter: good_adapter }
)

puts "Passed: #{exact_result[:passed]}"
puts "Score: #{exact_result[:score]}"
puts "Output: #{exact_result[:output]}"
puts "Details: #{exact_result[:details]}"

# Trait-based expectation against a fixed Test adapter response.
traits_result = ClassifyIntent.eval_case(
  input: "URGENT: server down!!!",
  expected_traits: { intent: "support" },
  context: {
    adapter: RubyLLM::Contract::Adapters::Test.new(
      response: '{"intent": "support", "confidence": 0.99}'
    )
  }
)

puts "\nTraits check:"
puts "Passed: #{traits_result[:passed]}"
puts "Details: #{traits_result[:details]}"

# Arbitrary pass/fail logic via a custom proc evaluator.
proc_result = ClassifyIntent.eval_case(
  input: "test",
  evaluator: ->(output) { output[:confidence] > 0.9 },
  context: {
    adapter: RubyLLM::Contract::Adapters::Test.new(
      response: '{"intent": "other", "confidence": 0.95}'
    )
  }
)

puts "\nCustom proc:"
puts "Passed: #{proc_result[:passed]} (confidence > 0.9)"
|
|
207
|
+
|
|
208
|
+
# =============================================================================
|
|
209
|
+
# STEP 5: Evaluating a pipeline
|
|
210
|
+
# =============================================================================
|
|
211
|
+
|
|
212
|
+
puts "\n\n#{"=" * 60}"
puts "STEP 5: Evaluate a pipeline end-to-end"
puts "=" * 60

# Second step of the mini-pipeline: maps a classified intent to an action.
class SuggestAction < RubyLLM::Contract::Step::Base
  input_type Hash

  output_schema do
    string :action
    string :priority, enum: %w[low medium high urgent]
  end

  prompt do
    system "Suggest an action based on the classified intent."
    user "Intent: {intent}, Confidence: {confidence}"
  end
end

# Two-step pipeline: classify first, then suggest an action from the result.
class SupportPipeline < RubyLLM::Contract::Pipeline::Base
  step ClassifyIntent, as: :classify
  step SuggestAction, as: :action
end

# Eval datasets work on pipelines exactly like they do on single steps.
pipeline_dataset = RubyLLM::Contract::Eval::Dataset.define("support_pipeline") do
  add_case "billing → action",
           input: "I need help with my invoice",
           expected: { priority: "medium" }

  add_case "urgent → action",
           input: "URGENT: server is down!",
           expected: { priority: "urgent" }
end

# Single canned response covering both steps' fields.
pipeline_adapter = RubyLLM::Contract::Adapters::Test.new(
  response: '{"intent": "billing", "confidence": 0.9, "action": "Review invoice", "priority": "medium"}'
)

pipeline_report = RubyLLM::Contract::Eval::Runner.run(
  step: SupportPipeline,
  dataset: pipeline_dataset,
  context: { adapter: pipeline_adapter }
)

puts "\nPipeline eval:"
puts "Score: #{pipeline_report.score.round(2)}"
puts "Pass rate: #{pipeline_report.pass_rate}"
pipeline_report.each do |case_result|
  mark = case_result.passed? ? "✓" : "✗"
  puts " #{mark} #{case_result.name.ljust(25)} #{case_result.details}"
end
|
|
262
|
+
|
|
263
|
+
# =============================================================================
|
|
264
|
+
# SUMMARY
|
|
265
|
+
#
|
|
266
|
+
# Dataset eval answers: "Is my prompt good?"
|
|
267
|
+
#
|
|
268
|
+
# Define cases:
|
|
269
|
+
# - expected: exact output match (or json_includes for partial)
|
|
270
|
+
# - expected_traits: multi-property checks (regex, values)
|
|
271
|
+
# - evaluator: custom proc for complex logic
|
|
272
|
+
# - no expected: just check contract passes
|
|
273
|
+
#
|
|
274
|
+
# Run eval:
|
|
275
|
+
# - report.score → 0.0-1.0 aggregate
|
|
276
|
+
# - report.pass_rate → "4/5"
|
|
277
|
+
# - report.each → per-case details
|
|
278
|
+
#
|
|
279
|
+
# Quick check:
|
|
280
|
+
# - MyStep.eval_case(input: ..., expected: ...) → single result
|
|
281
|
+
#
|
|
282
|
+
# Regression detection:
|
|
283
|
+
# - Compare report.score before/after prompt change
|
|
284
|
+
# - Drop from 1.0 to 0.6 → something broke
|
|
285
|
+
#
|
|
286
|
+
# Next: GH-8 adds Regression::Baseline to automate this comparison
|
|
287
|
+
# =============================================================================
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Reddit Promo Pipeline — 5-step campaign from URL to comment
|
|
5
|
+
#
|
|
6
|
+
# A real-world pipeline that takes a product URL and produces a natural
|
|
7
|
+
# Reddit comment ready to post. Each step has a contract that catches
|
|
8
|
+
# the kind of failures LLMs actually produce in production.
|
|
9
|
+
#
|
|
10
|
+
# ruby examples/10_reddit_full_showcase.rb
|
|
11
|
+
# =============================================================================
|
|
12
|
+
|
|
13
|
+
require_relative "../lib/ruby_llm/contract"
|
|
14
|
+
|
|
15
|
+
# ===========================================================================
|
|
16
|
+
# Step 1 — Analyze the product
|
|
17
|
+
#
|
|
18
|
+
# Takes a plain String URL. Returns audience profile.
|
|
19
|
+
# Contract catches: invalid locale ("USA" instead of "en"), vague audiences.
|
|
20
|
+
# ===========================================================================
|
|
21
|
+
|
|
22
|
+
# Pipeline step 1: turns a raw product URL (plain String input) into an
# audience profile. The validate blocks reject the failure modes described
# in the banner above: country names instead of locale codes, one-word
# descriptions, and generic audience groups.
class AnalyzeProduct < RubyLLM::Contract::Step::Base
  output_schema do
    string :product_description, description: "What the product does (1-2 sentences)"
    string :locale, description: "ISO 639-1 language code"
    string :audience_group_1
    string :audience_group_2
    string :audience_group_3
  end

  prompt <<~PROMPT
    You are a marketing analyst. Analyze the product and identify target audiences.
    locale must be a 2-letter ISO 639-1 code (en, pl, de), NOT a country name.
    Audience groups must be specific, not generic.

    {input}
  PROMPT

  max_input 3_000 # refuse before LLM call if prompt too large
  max_cost 0.01 # refuse before LLM call if estimated cost > $0.01

  # Exactly two lowercase letters — rejects "USA", "english", etc.
  validate("locale is valid ISO 639-1") { |o| o[:locale].to_s.match?(/\A[a-z]{2}\z/) }
  # At least five words, so one-liner fragments fail.
  validate("description is substantive") { |o| o[:product_description].to_s.split.size >= 5 }
  validate("audience groups are specific") do |o|
    [o[:audience_group_1], o[:audience_group_2], o[:audience_group_3]].all? { |g| g.to_s.size > 5 }
  end
end
|
|
48
|
+
|
|
49
|
+
# ===========================================================================
|
|
50
|
+
# Step 2 — Find subreddits and a sample thread
|
|
51
|
+
#
|
|
52
|
+
# Receives the audience profile, returns subreddits + a thread to work with.
|
|
53
|
+
# Contract catches: empty subreddit names, missing thread language.
|
|
54
|
+
# ===========================================================================
|
|
55
|
+
|
|
56
|
+
# Pipeline step 2: receives the audience profile (Hash) and returns three
# subreddits plus one representative thread. Also passes product_description
# and locale through so downstream steps don't lose them.
class IdentifySubreddits < RubyLLM::Contract::Step::Base
  input_type Hash

  output_schema do
    string :product_description
    string :locale
    string :subreddit_1
    string :subreddit_2
    string :subreddit_3
    string :thread_title, description: "A representative thread title"
    string :thread_selftext, description: "Thread body text"
    string :thread_subreddit
    string :thread_language, description: "ISO 639-1 code of the thread's language"
  end

  prompt <<~PROMPT
    You are a Reddit marketing researcher.
    Find subreddits where the target audience hangs out.
    Pick one representative thread that would be perfect for a product mention.
    Pass through product_description and locale from input.

    SUBREDDIT CRITERIA:
    - Active community (>10k members)
    - Allows product discussions
    - Not hostile to recommendations

    {input}
  PROMPT

  # All three subreddit names must be non-trivial (>= 2 chars).
  validate("has subreddits") do |o|
    [o[:subreddit_1], o[:subreddit_2], o[:subreddit_3]].all? { |s| s.to_s.size >= 2 }
  end
  validate("thread has content") { |o| o[:thread_title].to_s.size > 5 }
  # Same ISO 639-1 shape check as AnalyzeProduct's locale.
  validate("thread language is valid") { |o| o[:thread_language].to_s.match?(/\A[a-z]{2}\z/) }
end
|
|
91
|
+
|
|
92
|
+
# ===========================================================================
|
|
93
|
+
# Step 3 — Classify the thread
|
|
94
|
+
#
|
|
95
|
+
# PROMO / FILLER / SKIP with relevance score.
|
|
96
|
+
# Uses `validate` and a 2-arity invariant that cross-checks the output
|
|
97
|
+
# language against the input language.
|
|
98
|
+
# Contract catches: PROMO with score 2, SKIP with score 8, wrong language.
|
|
99
|
+
# ===========================================================================
|
|
100
|
+
|
|
101
|
+
# Pipeline step 3: classifies the thread as PROMO / FILLER / SKIP with a
# relevance score, and cross-checks language consistency against the input.
class ClassifyThread < RubyLLM::Contract::Step::Base
  input_type Hash

  # Block DSL because we use `example` (few-shot learning)
  prompt do
    system "You are a thread classifier for Reddit marketing."
    rule "Classify the thread as PROMO, FILLER, or SKIP based on product relevance."
    rule "Return JSON with: classification, relevance_score (1-10), reasoning, thread_title, thread_language."
    rule "PROMO: score >= 6. FILLER: 3-5. SKIP: 1-2."

    example input: '{"thread_title":"Best invoicing tool?","product_description":"invoicing SaaS"}',
            output: '{"classification":"PROMO","relevance_score":9,"reasoning":"Direct fit","thread_title":"Best invoicing tool?","thread_language":"en"}'

    user "{input}"
  end

  validate("valid classification") { |o| %w[PROMO FILLER SKIP].include?(o[:classification]) }
  validate("relevance score in range") { |o| o[:relevance_score].is_a?(Integer) && o[:relevance_score].between?(1, 10) }
  # The next two mirror the prompt's score bands: PROMO must score high,
  # SKIP must score low — catches contradictory classification/score pairs.
  validate("PROMO score >= 6") { |o| o[:classification] != "PROMO" || o[:relevance_score] >= 6 }
  validate("SKIP score <= 2") { |o| o[:classification] != "SKIP" || o[:relevance_score] <= 2 }

  # 2-arity validate: second block argument is the step input, letting the
  # invariant compare output against input.
  validate("thread language preserved from input") do |output, input|
    # Skip the check when the input carries no language to compare against.
    next true unless input.is_a?(Hash) && input[:thread_language]

    output[:thread_language] == input[:thread_language]
  end
end
|
|
128
|
+
|
|
129
|
+
# ===========================================================================
|
|
130
|
+
# Step 4 — Plan the comment
|
|
131
|
+
#
|
|
132
|
+
# Decides approach, tone, and key points before writing.
|
|
133
|
+
# Contract catches: missing strategy, invalid tone.
|
|
134
|
+
# ===========================================================================
|
|
135
|
+
|
|
136
|
+
# Pipeline step 4: decides approach, tone, and key points before writing.
# Contract rejects missing strategy fields and tones outside the allowed set.
class PlanComment < RubyLLM::Contract::Step::Base
  input_type Hash

  prompt <<~PROMPT
    You are a Reddit comment strategist.
    Plan a helpful, non-spammy comment for the classified thread.
    Return JSON with: approach, tone, key_points, link_strategy, thread_title.

    GUIDELINES:
    - Never use aggressive marketing language.
    - Be genuinely helpful first.
    - Mention product naturally.

    TONE OPTIONS:
    - casual — peer sharing experience
    - professional — industry expert
    - empathetic — I had the same problem

    {input}
  PROMPT

  validate("has approach") { |o| o[:approach].to_s.size > 5 }
  # Must be one of the three tones the prompt offers.
  validate("valid tone") { |o| %w[casual professional empathetic].include?(o[:tone]) }
  validate("has link strategy") { |o| o[:link_strategy].to_s.size > 3 }
end
|
|
161
|
+
|
|
162
|
+
# ===========================================================================
|
|
163
|
+
# Step 5 — Write the comment
|
|
164
|
+
#
|
|
165
|
+
# Retry policy: starts with gpt-4.1-nano (cheap), escalates to mini then
|
|
166
|
+
# full if the contract catches problems. In practice, nano often writes
|
|
167
|
+
# comments that are too short or forget the link.
|
|
168
|
+
# Contract catches: spam phrases, banned openings, missing links, too short.
|
|
169
|
+
# ===========================================================================
|
|
170
|
+
|
|
171
|
+
# Pipeline step 5: writes the actual comment. Uses a retry policy that
# escalates through models when the contract rejects the output.
class GenerateComment < RubyLLM::Contract::Step::Base
  input_type Hash

  # Block DSL here because we use `example` (few-shot) — needs user/assistant pairs.
  # Steps without examples use plain heredoc (see AnalyzeProduct, PlanComment above).
  prompt do
    system "You are a helpful Reddit commenter promoting a SaaS product."
    rule "Write the comment based on the plan."
    rule "Return JSON with: comment, word_count (integer)."
    rule "No markdown headers. No emojis. No bullet lists."
    rule "Include https://acme-invoice.com naturally, maximum once."

    section "ANTI-SPAM",
            "Never use: buy now, limited offer, click here, act fast, discount.\n" \
            "Never start with: Great question!, As a, I'm an AI, Hey there!"

    example input: '{"approach":"share experience","tone":"casual"}',
            output: '{"comment":"I switched to Acme Invoice last year and it cut my invoicing time ' \
                    "in half. The automatic reminders are a lifesaver. https://acme-invoice.com if " \
                    'you want to check it out.","word_count":30}'

    user "{input}"
  end

  validate("comment long enough") { |o| o[:comment].to_s.strip.size > 30 }
  # NOTE(review): regex only matches ## and deeper at line start; a single-#
  # h1 header would slip through — confirm that is intended.
  validate("no markdown headers") { |o| !o[:comment].to_s.match?(/^\#{2,}/) }
  validate("has word count") { |o| o[:word_count].is_a?(Integer) && o[:word_count].positive? }
  validate("contains product link") { |o| o[:comment].to_s.include?("acme-invoice.com") }
  # Case-insensitive substring scan for the spam phrases listed in ANTI-SPAM.
  validate("no spam phrases") do |o|
    spam = ["buy now", "limited offer", "click here", "act fast", "discount"]
    spam.none? { |s| o[:comment].to_s.downcase.include?(s) }
  end
  # Case-sensitive prefix check on the banned openings.
  validate("no banned openings") do |o|
    banned = ["Great question", "As a", "I'm an AI", "Hey there!", "Check this out"]
    banned.none? { |b| o[:comment].to_s.start_with?(b) }
  end

  max_output 300 # tokens — don't let the model ramble

  retry_policy models: %w[gpt-4.1-nano gpt-4.1-mini gpt-4.1]
end
|
|
212
|
+
|
|
213
|
+
# ===========================================================================
# Pipeline — wires the 5 steps together, with per-step model hints
# ===========================================================================

class RedditPromoPipeline < RubyLLM::Contract::Pipeline::Base
  # Cheap models handle the mechanical early steps; the final generation
  # step relies on its own retry_policy to escalate models on failure.
  step AnalyzeProduct, as: :analyze, model: "gpt-4.1-mini"
  step IdentifySubreddits, as: :subreddits, model: "gpt-4.1-mini"
  step ClassifyThread, as: :classify, model: "gpt-4.1-nano"
  step PlanComment, as: :plan, model: "gpt-4.1-nano"
  step GenerateComment, as: :comment # escalates via its retry_policy

  # Hard ceiling on total token spend across every step — halt if exceeded.
  token_budget 15_000
end
|
|
226
|
+
|
|
227
|
+
# ===========================================================================
# Eval — defined OUTSIDE the step class (like specs live outside models)
# In production: eval/generate_comment_eval.rb
# ===========================================================================

GenerateComment.define_eval("smoke") do
  # Canned input for the eval run: a plan-step payload plus thread context.
  default_input({
    approach: "Share personal experience with invoicing frustration, then mention Acme Invoice",
    tone: "casual",
    key_points: '["empathize","mention recurring invoices","highlight reminders"]',
    link_strategy: "Drop link naturally after mentioning the tool",
    thread_title: "What invoicing tool do you use?"
  })

  # Golden response the eval checks against the step's validators.
  sample_response({
    comment: "I was in the exact same boat — spreadsheets worked until I had more than " \
             "10 clients, then tracking who paid became a nightmare. I switched to Acme " \
             "Invoice about a year ago and it's been great. Recurring invoices are " \
             "set-and-forget, and the automatic payment reminders saved me so many awkward " \
             "follow-up emails. It's affordable too. https://acme-invoice.com if you want " \
             "to check it out.",
    word_count: 62
  })

  # No extra verify blocks: the step's own validate blocks already cover
  # length, markdown headers, word count, product link, spam phrases,
  # and banned openings.
end
|
|
255
|
+
|
|
256
|
+
# ===========================================================================
# Simulated LLM responses (what a real model would return)
# ===========================================================================

# One canned output per pipeline step, keyed by the step's `as:` name.
RESPONSES = {
  analyze: {
    product_description: "Simple invoicing and billing platform for freelancers and small businesses",
    locale: "en",
    audience_group_1: "freelance designers and developers",
    audience_group_2: "small business owners under 10 employees",
    audience_group_3: "accountants serving freelance clients"
  },

  subreddits: {
    product_description: "Simple invoicing and billing platform for freelancers",
    locale: "en",
    subreddit_1: "freelance",
    subreddit_2: "smallbusiness",
    subreddit_3: "Entrepreneur",
    thread_title: "What invoicing tool do you use for your freelance business?",
    thread_selftext: "I've been using spreadsheets but it's getting out of hand. " \
                     "Need something for recurring invoices and payment reminders.",
    thread_subreddit: "freelance",
    thread_language: "en"
  },

  classify: {
    classification: "PROMO",
    relevance_score: 9,
    reasoning: "Thread directly asks for invoicing tool — perfect fit",
    thread_title: "What invoicing tool do you use for your freelance business?",
    thread_language: "en"
  },

  plan: {
    approach: "Share personal experience with invoicing frustration, then mention Acme Invoice",
    tone: "casual",
    key_points: '["empathize with spreadsheet pain","mention recurring invoices","highlight payment reminders","note affordability"]',
    link_strategy: "Drop link naturally after mentioning the tool by name",
    thread_title: "What invoicing tool do you use for your freelance business?"
  },

  comment: {
    comment: "I was in the exact same boat — spreadsheets worked until I had more than " \
             "10 clients, then tracking who paid became a nightmare. I switched to Acme " \
             "Invoice about a year ago and it's been great. Recurring invoices are " \
             "set-and-forget, and the automatic payment reminders saved me so many awkward " \
             "follow-up emails. It's affordable too. https://acme-invoice.com if you want " \
             "to check it out.",
    word_count: 62
  }
}.freeze
|
|
306
|
+
|
|
307
|
+
# ===========================================================================
# Run — Pipeline.test with named responses (no adapter setup needed)
# ===========================================================================

result = RedditPromoPipeline.test(
  "https://acme-invoice.com — Simple invoicing for freelancers",
  responses: RESPONSES
)

# ===========================================================================
# Results
# ===========================================================================

puts result
# One summary line for the pipeline, then one line per step, e.g.:
#   Pipeline: ok  5 steps  0ms  0+0 tokens  $0.000000  trace=...
#   analyze     ok  gpt-4.1-mini  0ms  0+0 tokens  $0.000000
#   subreddits  ok  gpt-4.1-mini  0ms  0+0 tokens  $0.000000
#   classify    ok  gpt-4.1-nano  0ms  0+0 tokens  $0.000000
#   plan        ok  gpt-4.1-nano  0ms  0+0 tokens  $0.000000
#   comment     ok  gpt-4.1-nano  0ms  0+0 tokens  $0.000000
# (costs are $0 here because the Test adapter reports 0 tokens —
#  with a real LLM you'd see actual costs from the model registry)

puts

result.pretty_print
# Renders an ASCII table: one row per step with its status and a
# truncated view of its output fields, e.g.:
#   | Step       | Status | Output                                          |
#   | analyze    | ok     | product_description: Simple invoicing and bi... |
#   | subreddits | ok     | subreddit_1: freelance ...                      |
#   | classify   | ok     | classification: PROMO / relevance_score: 9 ...  |
#   | plan       | ok     | approach: Share personal experience with in...  |
#   | comment    | ok     | comment: I was in the exact same boat — spr...  |

puts

# ===========================================================================
# Quality check — zero setup, eval has its own sample_response
# ===========================================================================

puts GenerateComment.run_eval("smoke")
# smoke: 1/1 checks passed