RubyGems - ruby_llm-contract - Versions diffs - 0.7.1 → 0.8.0 - Mend

ruby_llm-contract 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +96 -0
data/Gemfile.lock +3 -3
data/README.md +64 -316
data/examples/00_basics.rb +110 -428
data/examples/01_fallback_showcase.rb +208 -0
data/examples/02_real_llm_minimal.rb +45 -0
data/examples/03_summarize_with_keywords.rb +128 -0
data/examples/04_summarize_and_translate.rb +196 -0
data/examples/05_eval_dataset.rb +144 -0
data/examples/06_retry_variants.rb +147 -0
data/examples/README.md +20 -128
data/lib/ruby_llm/contract/adapters/ruby_llm.rb +22 -1
data/lib/ruby_llm/contract/cost_calculator.rb +39 -0
data/lib/ruby_llm/contract/eval/model_comparison.rb +4 -4
data/lib/ruby_llm/contract/eval/retry_optimizer.rb +7 -3
data/lib/ruby_llm/contract/step/base.rb +18 -1
data/lib/ruby_llm/contract/step/dsl.rb +38 -0
data/lib/ruby_llm/contract/step/limit_checker.rb +2 -2
data/lib/ruby_llm/contract/token_estimator.rb +20 -3
data/lib/ruby_llm/contract/version.rb +1 -1
data/ruby_llm-contract.gemspec +6 -5
metadata +14 -16
data/examples/01_classify_threads.rb +0 -220
data/examples/02_generate_comment.rb +0 -203
data/examples/03_target_audience.rb +0 -201
data/examples/04_real_llm.rb +0 -410
data/examples/05_output_schema.rb +0 -258
data/examples/07_keyword_extraction.rb +0 -239
data/examples/08_translation.rb +0 -353
data/examples/09_eval_dataset.rb +0 -287
data/examples/10_reddit_full_showcase.rb +0 -363

data/examples/05_eval_dataset.rb ADDED Viewed

@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+# =============================================================================
+# EXAMPLE 5: Dataset-driven evals on SummarizeArticle
+#
+# The pattern that stops silent prompt regressions:
+#   1. Define an eval with a handful of real articles and expected outcomes.
+#   2. Run it on your current configuration — that is the baseline.
+#   3. Change a prompt, swap a model, upgrade a gem — re-run.
+#   4. A drop in score blocks the merge before it ships.
+#
+# Every piece of the workflow is shown in one file: define_eval, add_case
+# with expected traits, running the eval, comparing a "good" to a "bad"
+# model, and the inline eval_case helper for quick checks.
+#
+# Run: ruby examples/05_eval_dataset.rb
+#
+# Expected output:
+#
+#   Run 1 — good configuration (baseline)
+#     Score:      1.0
+#     Pass rate:  3/3
+#     Passed?:    true
+#
+#   Run 2 — a prompt tweak broke tone classification on complaints
+#     Score:      0.67
+#     Pass rate:  2/3
+#       ✓ ruby release         all expected keys present and matching
+#       ✗ outage complaint     tone: expected "negative", got "analytical"
+#       ✓ product launch       all expected keys present and matching
+#   Regression detected: 1.0 → 0.67 (33% drop)
+#
+#   Inline eval_case (quick one-off check)
+#     Passed:   true
+#     Score:    1.0
+#     Details:  all expected keys present and matching
+# =============================================================================
+require_relative "../lib/ruby_llm/contract"
+class SummarizeArticle < RubyLLM::Contract::Step::Base
+  prompt "Summarize: {input}"
+  output_schema do
+    string :tldr, max_length: 200
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical]
+  end
+  validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
+  define_eval "regression" do
+    add_case "ruby release",
+             input: "Ruby 3.4 ships with frozen string literals, YJIT speedups, parser fixes.",
+             expected: { tone: "analytical" }
+    add_case "outage complaint",
+             input: "The mesh hardware failed under load. Three customers threatened churn.",
+             expected: { tone: "negative" }
+    add_case "product launch",
+             input: "We are thrilled to announce our new billing feature ships this week.",
+             expected: { tone: "positive" }
+  end
+end
+# =============================================================================
+# Good run — every case lands on the expected tone
+# =============================================================================
+puts "=" * 60
+puts "Run 1 — good configuration (baseline)"
+puts "=" * 60
+good_adapter = RubyLLM::Contract::Adapters::Test.new(responses: [
+  { tldr: "Ruby 3.4 summary",   takeaways: %w[a b c], tone: "analytical" },
+  { tldr: "Outage complaint",    takeaways: %w[a b c], tone: "negative" },
+  { tldr: "Product launch news", takeaways: %w[a b c], tone: "positive" }
+])
+baseline = SummarizeArticle.run_eval("regression", context: { adapter: good_adapter })
+puts "Score:      #{baseline.score.round(2)}"     # => 1.0
+puts "Pass rate:  #{baseline.pass_rate}"          # => 3/3
+puts "Passed?:    #{baseline.passed?}"            # => true
+# =============================================================================
+# Bad run — simulates a prompt tweak that broke "outage" classification
+# =============================================================================
+puts
+puts "=" * 60
+puts "Run 2 — a prompt tweak broke tone classification on complaints"
+puts "=" * 60
+bad_adapter = RubyLLM::Contract::Adapters::Test.new(responses: [
+  { tldr: "Ruby 3.4 summary",   takeaways: %w[a b c], tone: "analytical" },
+  { tldr: "Outage complaint",    takeaways: %w[a b c], tone: "analytical" }, # expected negative!
+  { tldr: "Product launch news", takeaways: %w[a b c], tone: "positive" }
+])
+regression = SummarizeArticle.run_eval("regression", context: { adapter: bad_adapter })
+puts "Score:      #{regression.score.round(2)}"   # => 0.67
+puts "Pass rate:  #{regression.pass_rate}"        # => 2/3
+regression.each do |r|
+  icon = r.passed? ? "✓" : "✗"
+  puts "  #{icon} #{r.name.ljust(20)} #{r.details}"
+end
+puts
+puts "Regression detected: #{baseline.score.round(2)} → #{regression.score.round(2)} " \
+     "(#{((baseline.score - regression.score) * 100).round}% drop)"
+# =============================================================================
+# eval_case — inline single-case check without defining a full dataset
+# =============================================================================
+puts
+puts "=" * 60
+puts "Inline eval_case (quick one-off check)"
+puts "=" * 60
+one = SummarizeArticle.eval_case(
+  input: "Ruby 3.4 ships with frozen string literals.",
+  expected: { tone: "analytical" },
+  context: { adapter: RubyLLM::Contract::Adapters::Test.new(
+    response: { tldr: "Ruby 3.4 summary", takeaways: %w[a b c], tone: "analytical" }
+  ) }
+)
+puts "Passed:   #{one.passed?}"   # => true
+puts "Score:    #{one.score}"     # => 1.0
+puts "Details:  #{one.details}"
+# =============================================================================
+# What this showcases
+#
+# - define_eval keeps dataset + expectations next to the step definition.
+#   One class, one truth.
+# - run_eval returns a Report with score, pass_rate, per-case CaseResult.
+# - The same dataset detects a regression when a "good" adapter is swapped
+#   for a "bad" one — same signal you get from a prompt change in prod.
+# - eval_case is the lightweight alternative for one-off inline checks.
+# =============================================================================

data/examples/06_retry_variants.rb ADDED Viewed

@@ -0,0 +1,147 @@
+# frozen_string_literal: true
+# =============================================================================
+# EXAMPLE 6: retry_policy variants on SummarizeArticle
+#
+# Example 01 covered the most common pattern: fall back from a cheap model
+# to a stronger one (gpt-5-nano → gpt-5-mini → gpt-5). This file runs the three
+# other retry_policy shapes, each on the same SummarizeArticle step with
+# the Test adapter so no API keys are required.
+#
+# Run: ruby examples/06_retry_variants.rb
+#
+# Expected output (abridged):
+#
+#   A — attempts: 3 (same model, sampling-variance absorption)
+#       attempt 1  model=gpt-5-nano  status=validation_failed
+#       attempt 3  model=gpt-5-nano  status=ok
+#
+#   B — reasoning_effort escalation (low → medium → high)
+#       attempt 1  effort=low     status=validation_failed
+#       attempt 3  effort=high    status=ok
+#
+#   C — cross-provider fallback (Ollama → Anthropic → OpenAI)
+#       attempt 1  model=gemma3:4b          status=validation_failed
+#       attempt 3  model=gpt-5-nano         status=ok
+# =============================================================================
+require_relative "../lib/ruby_llm/contract"
+# =============================================================================
+# Base step — same SummarizeArticle from the README, used by every variant
+# =============================================================================
+class SummarizeArticle < RubyLLM::Contract::Step::Base
+  model "gpt-5-nano"
+  prompt "Summarize: {input}"
+  output_schema do
+    string :tldr, max_length: 200
+    array  :takeaways, of: :string, min_items: 3, max_items: 5
+    string :tone, enum: %w[neutral positive negative analytical]
+  end
+  validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
+end
+# Canned responses — first two fail the "TL;DR fits the card" validate
+# (oversized TL;DR), the third succeeds. Every variant lands on attempt 3,
+# so the trace shows the retry policy's shape clearly.
+RESPONSES = [
+  { tldr: "x" * 500, takeaways: %w[a b c], tone: "neutral" },
+  { tldr: "x" * 500, takeaways: %w[a b c], tone: "neutral" },
+  { tldr: "Ruby 3.4 ships with frozen string literals and YJIT speedups.",
+    takeaways: %w[frozen-strings yjit parser-fixes], tone: "analytical" }
+].freeze
+def print_trace(label, result)
+  puts "#{label} — status=#{result.status}, final model=#{result.trace[:model].inspect}"
+  result.trace[:attempts].each do |a|
+    cfg = a[:config] && a[:config][:reasoning_effort] ? "  effort=#{a[:config][:reasoning_effort].ljust(6)}" : ""
+    puts "    attempt #{a[:attempt]}  model=#{a[:model].ljust(20)}#{cfg}  status=#{a[:status]}"
+  end
+  puts
+end
+# =============================================================================
+# VARIANT A — attempts: 3 on the same model
+#
+# When to use: the model is correct on most samples, but sampling variance
+# (gpt-5 / o-series enforce temperature=1.0 server-side) flips it occasionally.
+# Re-sampling the same model absorbs the variance without paying for a
+# stronger tier.
+#
+# Replaces: the hand-rolled begin/rescue/retry loop with an attempts counter.
+# =============================================================================
+class SummarizeArticleSameModelRetry < SummarizeArticle
+  retry_policy attempts: 3
+end
+puts "=" * 70
+puts "A — attempts: 3 (same model, sampling-variance absorption)"
+puts "=" * 70
+adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
+print_trace("same-model retry", SummarizeArticleSameModelRetry.run("article", context: { adapter: adapter }))
+# =============================================================================
+# VARIANT B — reasoning_effort escalation on one model
+#
+# When to use: the model can get the right answer with more thinking budget,
+# but you do not want to pay the high-reasoning price on every call. Start
+# at low, let validate filter out the cheap misses, pay for medium or high
+# only on the cases that actually need it.
+# =============================================================================
+class SummarizeArticleReasoningEscalation < SummarizeArticle
+  retry_policy models: [
+    { model: "gpt-5-nano", reasoning_effort: "low" },
+    { model: "gpt-5-nano", reasoning_effort: "medium" },
+    { model: "gpt-5-nano", reasoning_effort: "high" }
+  ]
+end
+puts "=" * 70
+puts "B — reasoning_effort escalation (low → medium → high)"
+puts "=" * 70
+adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
+print_trace("reasoning escalation", SummarizeArticleReasoningEscalation.run("article", context: { adapter: adapter }))
+# =============================================================================
+# VARIANT C — cross-provider fallback (Ollama → Anthropic → OpenAI)
+#
+# When to use: you want to start on a local model (cheap, private, no quota)
+# and fall back to hosted providers only when the local one cannot satisfy
+# the contract. Each tier is a different provider — ruby_llm detects the
+# provider from the model name.
+#
+# To run against real backends: configure ruby_llm for all three providers
+# (ollama_api_base + anthropic_api_key + openai_api_key) and swap the Test
+# adapter for Adapters::RubyLLM. The retry_policy itself is unchanged.
+#
+# Order matters: local first (costs nothing); hosted last (most accurate).
+# =============================================================================
+class SummarizeArticleCrossProvider < SummarizeArticle
+  retry_policy models: %w[gemma3:4b claude-haiku-4-5 gpt-5-nano]
+end
+puts "=" * 70
+puts "C — cross-provider fallback (Ollama → Anthropic → OpenAI)"
+puts "=" * 70
+adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
+print_trace("cross-provider", SummarizeArticleCrossProvider.run("article", context: { adapter: adapter }))
+# =============================================================================
+# TAKEAWAYS
+#
+# 1. `attempts: 3` is the shortest path from a hand-rolled begin/rescue/retry
+#    loop to a contract-backed retry with a trace you can log.
+# 2. `reasoning_effort` escalation is cheaper than model escalation when the
+#    model is right but needs more thinking, not a stronger backbone.
+# 3. Cross-provider retry uses the same DSL — ruby_llm resolves the provider
+#    from the model name. Start cheapest (often a local Ollama model), end
+#    on the most accurate hosted provider.
+# 4. The per-attempt trace (model, config, status, cost) is identical across
+#    variants — your logging does not care which retry shape you picked.
+# =============================================================================

data/examples/README.md CHANGED Viewed

@@ -1,140 +1,32 @@
 # Examples
-## 00_basics.rb — From zero to ruby_llm-contract
+Seven runnable examples, every one using the `SummarizeArticle` step from the [README](../README.md) — a Rails app turning article text into a UI card with TL;DR, takeaways, and tone. Zero API keys (Test adapter is the default). Only `02_real_llm_minimal.rb` needs a provider key.
-Step-by-step tutorial covering every feature. Start here.
+Pedagogical order: hook → activation → evolution → composition → quality → advanced.
-| Step | Feature | What it shows |
-|------|---------|---------------|
-| 1 | Plain string prompt | Simplest case — `user "{input}"` and nothing else |
-| 2 | System + user | Separate instructions from data |
-| 3 | Rules + output_schema | Requirements as statements + declarative output structure |
-| 4 | Invariants | Custom business logic on top of schema |
-| 5 | Examples | Few-shot (example input/output pairs) |
-| 6 | Sections | Labeled context blocks (heredoc replacement, with before/after) |
-| 7 | Hash input | Multiple fields with auto-interpolation |
-| 8 | 2-arity invariants | Cross-validate output against input |
-| 9 | Context override | Per-run adapter and model switching |
-| 10 | StepResult | Full inspection: status, output, errors, trace |
-| 11 | Pipeline | Chain steps with fail-fast data threading |
+| # | File | Answers |
+|---|------|---------|
+| 00 | `00_basics.rb` | **"How do I start?"** — seven incremental layers: plain prompt → output_schema → validate → structured prompt → Hash input → cross-input validate → retry_policy → trace inspection, plus real-LLM swap pointer. |
+| 01 | `01_fallback_showcase.rb` | **"Show me the gem in 30 seconds."** — Part A: schema-only ships a flaky sample. Part B: full contract rejects it and retry_policy escalates to the next model. Per-attempt trace printed inline. |
+| 02 | `02_real_llm_minimal.rb` | **"How do I plug in a real LLM?"** — ~30 lines. `Adapters::RubyLLM.new` in context, same step. Also shows per-call provider switch (OpenAI → Anthropic → Ollama). |
+| 03 | `03_summarize_with_keywords.rb` | **"How does the contract evolve when the product grows?"** — marketing wants a "topic pills" row, so `SummarizeArticle` gains a keywords field with probability and cross-validation. Prompt, schema, and validates stay in lockstep. |
+| 04 | `04_summarize_and_translate.rb` | **"How do steps compose into a pipeline?"** — 3 steps threaded by `Pipeline::Base`: English summary → translate to French → quality review. Fail-fast: a rejected summary means translate and review never run. |
+| 05 | `05_eval_dataset.rb` | **"How do I stop silent prompt regressions?"** — define_eval with real cases, baseline vs regressed adapter, regression detection signal, inline eval_case. |
+| 06 | `06_retry_variants.rb` | **"What retry shapes exist beyond cross-model?"** — `attempts: 3` (variance absorption), `reasoning_effort` escalation (low→medium→high), cross-provider fallback (Ollama → Anthropic → OpenAI). |
-Every step has a corresponding test in `spec/integration/examples_00_basics_spec.rb`.
-## 01_classify_threads.rb — Thread classification
-Real-world before/after: classify Reddit threads as PROMO/FILLER/SKIP.
-Shows ID matching, enum validation, score consistency invariants.
-## 02_generate_comment.rb — Comment generation
-Real-world before/after: generate Reddit comments with persona.
-Shows sections, banned openings, link presence, length constraints, 2-arity invariants.
-## 03_target_audience.rb — Audience profiling
-Real-world before/after: generate target audience profiles.
-Shows cascade failure prevention, locale validation, structural invariants.
-## 04_real_llm.rb — Real LLM calls via ruby_llm
-Connect to real LLM providers (OpenAI, Anthropic, Google, etc.) using Adapters::RubyLLM.
-Shows configuration, model switching, temperature/max_tokens control, provider-agnostic steps.
-| Step | Feature | What it shows |
-|------|---------|---------------|
-| 1 | Configure ruby_llm | Set API keys for your provider |
-| 2 | Set RubyLLM adapter | Swap Test adapter for production |
-| 3 | Define a step | Identical to Test adapter — provider-agnostic |
-| 4 | Run with real LLM | Real call, real tokens, full contract enforcement |
-| 5 | Compare models | A/B test different models per call |
-| 6 | Generation params | Temperature, max_tokens forwarding |
-| 7 | Switch providers | Same step, different provider — just change model name |
-| 8 | Error handling | Contract enforcement with real LLM responses |
-| 9 | Full power | Every feature combined in AnalyzeTicket |
-| 10 | Pipeline | Chain steps with real LLM calls |
-**Requires:** `export OPENAI_API_KEY=sk-...` (or another provider key)
-## 05_output_schema.rb — Declarative output schema
-Replace manual invariants with a schema DSL (ruby_llm-schema).
-| Step | Feature | What it shows |
-|------|---------|---------------|
-| 1 | Before (invariants) | Manual enum, range, required checks |
-| 2 | After (schema) | Same constraints in declarative DSL |
-| 3 | Schema + invariants | Schema for structure, invariants for business logic |
-| 4 | Complex schema | Nested objects, arrays, constraints |
-| 5 | Provider-agnostic | Same schema works with Test and RubyLLM adapters |
-| 6 | Pipeline + schemas | Fully typed multi-step composition |
+Every example has an "Expected output" section in the file header — you can read what each one prints without running it.
 ## Running
 ```bash
 # Test adapter — no API keys needed:
 ruby examples/00_basics.rb
-ruby examples/01_classify_threads.rb
-ruby examples/02_generate_comment.rb
-ruby examples/03_target_audience.rb
-ruby examples/05_output_schema.rb
-# Real LLM — requires API key:
-ruby examples/04_real_llm.rb
+ruby examples/01_fallback_showcase.rb
+ruby examples/03_summarize_with_keywords.rb
+ruby examples/04_summarize_and_translate.rb
+ruby examples/05_eval_dataset.rb
+ruby examples/06_retry_variants.rb
+# Real LLM — requires a provider API key or a local Ollama server:
+ruby examples/02_real_llm_minimal.rb
 ```
-## 06_reddit_promo.rb — Real-world Reddit promo pipeline
-3-step pipeline from the reddit_promo_planner case study:
-| Step | Role | Invariants catch |
-|------|------|------------------|
-| 1 | TargetAudience | `locale: "USA"` instead of `"en"`, vague summary |
-| 2 | ClassifyThreads | PROMO with score 2, SKIP with score 8 |
-| 3 | GenerateComment | `{PRODUCT}` instead of URL, banned openings |
-Runs with test adapter by default. `REAL_LLM=1` for Ollama, `MODEL=gemma:latest` to pick model.
-## 07_keyword_extraction.rb — Keyword extraction with probability
-Extract up to 15 keywords from an article, each with relevance probability.
-| Feature | What it shows |
-|---------|---------------|
-| Array schema | `min_items: 1, max_items: 15` with nested objects |
-| Number range | `probability: 0.0–1.0` |
-| Sorting invariant | Schema can't express "sorted descending" |
-| Uniqueness invariant | Schema can't express "no duplicates" |
-| Cross-validation | Keywords must appear in source text (catches hallucination) |
-| Pipeline | Keywords → Related Topics |
-## 08_translation.rb — Translation pipeline with quality review
-3-step pipeline: extract segments → translate → review quality.
-| Step | LLM Skill | Invariants catch |
-|------|-----------|------------------|
-| Extract | Analysis | Duplicate keys, wrong target_lang |
-| Translate | Creative | Missing segments, too long, echoed back untranslated |
-| Review | Evaluation | Inconsistent counts, failed reviews without issues |
-## Running
-```bash
-# Test adapter — no API keys needed:
-ruby examples/00_basics.rb
-ruby examples/01_classify_threads.rb
-ruby examples/02_generate_comment.rb
-ruby examples/03_target_audience.rb
-ruby examples/05_output_schema.rb
-ruby examples/06_reddit_promo.rb
-ruby examples/07_keyword_extraction.rb
-ruby examples/08_translation.rb
-# Real LLM — requires Ollama or API key:
-ruby examples/04_real_llm.rb
-REAL_LLM=1 ruby examples/06_reddit_promo.rb
-REAL_LLM=1 MODEL=llama3.2:3b ruby examples/06_reddit_promo.rb
-```
-Examples 00–03, 05–06 use the test adapter by default — no API keys needed.
-Example 04 and 06 with `REAL_LLM=1` require Ollama or an API key.

data/lib/ruby_llm/contract/adapters/ruby_llm.rb CHANGED Viewed

@@ -52,12 +52,33 @@ module RubyLLM
           CHAT_OPTION_METHODS.each do |key, method_name|
             chat.public_send(method_name, options[key]) if options[key]
           end
+          # Resolve thinking config from BOTH sources, with `:reasoning_effort`
+          # taking precedence over `:thinking[:effort]`. This is the per-attempt
+          # override path used by `retry_policy { escalate({model:, reasoning_effort:}) }`
+          # — the attempt-specific effort must win over the class-level default.
+          # Forwarded provider-agnostically via `chat.with_thinking(**)` —
+          # available since RubyLLM 1.12 (gemspec enforces this minimum).
+          thinking_config = resolve_thinking_config(options)
+          chat.with_thinking(**thinking_config) if thinking_config
+          # `with_params` carries only raw passthroughs (currently `max_tokens`).
+          # `reasoning_effort` is no longer forwarded here — it goes through
+          # `with_thinking` above, which is the canonical RubyLLM API.
           params = {}
           params[:max_tokens] = options[:max_tokens] if options[:max_tokens]
-          params[:reasoning_effort] = options[:reasoning_effort] if options[:reasoning_effort]
           chat.with_params(**params) if params.any?
         end
+        # Returns merged `{ effort:, budget: }` or nil. `options[:reasoning_effort]`
+        # overrides any inherited `options[:thinking][:effort]`; budget is
+        # taken from `options[:thinking][:budget]` only.
+        def resolve_thinking_config(options)
+          base = options[:thinking].is_a?(Hash) ? options[:thinking].dup : {}
+          base[:effort] = options[:reasoning_effort] if options[:reasoning_effort]
+          base.empty? ? nil : base
+        end
         def build_response(response)
           content = response.content
           content = content.to_s unless content.is_a?(Hash) || content.is_a?(Array)

data/lib/ruby_llm/contract/cost_calculator.rb CHANGED Viewed

@@ -2,6 +2,29 @@
 module RubyLLM
   module Contract
+    # Pricing lookup for `max_cost` budget gating + retry usage aggregation.
+    #
+    # **What this module does (public surface):**
+    #
+    # 1. **Fine-tune / custom-model pricing registry** — `register_model`
+    #    fills the gap left by RubyLLM 1.14's models.json: there is no
+    #    upstream `RubyLLM::Models.register` API, so fine-tuned models
+    #    (e.g. `ft:gpt-4o-custom`) need their pricing supplied locally.
+    # 2. **Lookup with fallback chain** — `calculate(model_name:, usage:)`
+    #    checks the custom registry first, falls back to
+    #    `RubyLLM.models.find(model_name)`, returns `nil` on miss.
+    #
+    # **What this module is NOT:**
+    #
+    # - Not a "cost calculator" feature — the math itself
+    #   (`tokens × price_per_million / 1_000_000`) is trivial and lives
+    #   in the private class method `compute_cost`, for internal use only.
+    # - Not a substitute for RubyLLM's pricing data — for any model in
+    #   `RubyLLM.models`, this module simply queries it.
+    #
+    # This module exists for the registry and the per-call lookup. Retry usage
+    # aggregation across attempts builds on that lookup but lives in
+    # `Step::RetryExecutor`, which calls `calculate` per attempt and sums.
     module CostCalculator
       # Simple struct for custom-registered model pricing
       RegisteredModel = Struct.new(:input_price_per_million, :output_price_per_million, keyword_init: true)
@@ -9,6 +32,8 @@ module RubyLLM
       @custom_models = {}
       # Register pricing for custom or fine-tuned models not in the RubyLLM registry.
+      # This is the gem's primary value-add for cost computation; everything
+      # else falls back to RubyLLM's own model registry.
       #
       #   CostCalculator.register_model("ft:gpt-4o-custom",
       #     input_per_1m: 3.0, output_per_1m: 6.0)
@@ -33,6 +58,20 @@ module RubyLLM
         @custom_models.clear
       end
+      # Look up cost for a single model + usage hash.
+      # Returns nil if model is unknown (custom registry miss + RubyLLM miss),
+      # so callers can decide whether to refuse the call or proceed (see
+      # `on_unknown_pricing:` step option for the budget-gating policy).
+      #
+      #   CostCalculator.calculate(
+      #     model_name: "gpt-4o-mini",
+      #     usage: { input_tokens: 1_500, output_tokens: 800 }
+      #   )
+      #   # => 0.00069 (or nil if model not registered)
+      #
+      # Math is intentionally simple and private — this method is the
+      # primary public entry point. Aggregating across retry attempts is
+      # done in `Step::RetryExecutor`, not here.
       def self.calculate(model_name:, usage:)
         return nil unless model_name && usage.is_a?(Hash)

data/lib/ruby_llm/contract/eval/model_comparison.rb CHANGED Viewed

@@ -72,9 +72,9 @@ module RubyLLM
           end
           chain_width = [rows.map { |r| r[:chain].length }.max || 0, 20].max
-          lines = [format("  %-#{chain_width}s  %-11s  %-10s  %-14s  %-9s  %s",
-                          "Chain", "single-shot", "escalation", "effective cost", "latency", "score")]
-          lines << "  #{"-" * (chain_width + 60)}"
+          lines = [format("  %-#{chain_width}s  %-13s  %-10s  %-14s  %-9s  %s",
+                          "Chain", "first-attempt", "fallback %", "effective cost", "latency", "score")]
+          lines << "  #{"-" * (chain_width + 62)}"
           rows.each do |row|
             lines << format_production_row(row, chain_width)
@@ -95,7 +95,7 @@ module RubyLLM
         def format_production_row(row, chain_width)
           report = row[:report]
-          format("  %-#{chain_width}s  %-11s  %-10s  %-14s  %-9s  %6.2f",
+          format("  %-#{chain_width}s  %-13s  %-10s  %-14s  %-9s  %6.2f",
                  row[:chain],
                  format_money(report.single_shot_cost || report.total_cost),
                  format_escalation(row, report),

data/lib/ruby_llm/contract/eval/retry_optimizer.rb CHANGED Viewed

@@ -15,8 +15,12 @@ module RubyLLM
       class RetryOptimizer
         Result = Struct.new(:step_name, :eval_names, :candidate_labels, :score_matrix,
                             :constraining_eval, :chain, :chain_details, keyword_init: true) do
+          # Terminology alias — `hardest_eval` is the narrative name used in docs;
+          # `constraining_eval` is preserved as the original field name.
+          alias_method :hardest_eval, :constraining_eval
           def print_summary(io = $stdout)
-            io.puts "#{step_name} — retry chain optimization"
+            io.puts "#{step_name} — fallback list optimization"
             io.puts
             print_table(io)
             io.puts
@@ -59,7 +63,7 @@ module RubyLLM
             end
             io.puts
-            io.puts "  Constraining eval: #{constraining_eval}" if constraining_eval
+            io.puts "  Hardest eval: #{constraining_eval}" if constraining_eval
           end
           def print_chain(io)
@@ -68,7 +72,7 @@ module RubyLLM
               return
             end
-            io.puts "  Suggested chain:"
+            io.puts "  Suggested fallback list:"
             chain_details.each_with_index do |detail, i|
               suffix = i == chain_details.size - 1 ? "passes all #{eval_names.size} evals" : "covers #{detail[:passes]} eval(s)"
               io.puts "    #{detail[:label]} — #{suffix}"

data/lib/ruby_llm/contract/step/base.rb CHANGED Viewed

@@ -159,10 +159,27 @@ module RubyLLM
           def runtime_settings(context)
             policy = context.key?(:retry_policy_override) ? context[:retry_policy_override] : retry_policy
+            extra = context.slice(:provider, :assume_model_exists, :max_tokens, :reasoning_effort)
+            # Always pass the class-level `thinking` config to the adapter when
+            # set, so fields like `budget` survive a per-call `reasoning_effort`
+            # override. The adapter's `resolve_thinking_config` merges
+            # `reasoning_effort` over `thinking[:effort]` while keeping the
+            # rest of the hash intact.
+            #
+            # `reasoning_effort` is also seeded into extra_options for
+            # backward compat with eval_host / production_mode paths that
+            # read it from there — but only when the caller did not already
+            # provide one in context.
+            if respond_to?(:thinking) && thinking
+              extra[:thinking] = thinking
+              extra[:reasoning_effort] = thinking[:effort] if !extra.key?(:reasoning_effort) && thinking[:effort]
+            end
             {
               model: context[:model] || model || RubyLLM::Contract.configuration.default_model,
               temperature: context[:temperature],
-              extra_options: context.slice(:provider, :assume_model_exists, :max_tokens, :reasoning_effort),
+              extra_options: extra,
               policy: policy
             }
           end