ruby_llm-contract 0.7.0 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # EXAMPLE 5: Dataset-driven evals on SummarizeArticle
5
+ #
6
+ # The pattern that stops silent prompt regressions:
7
+ # 1. Define an eval with a handful of real articles and expected outcomes.
8
+ # 2. Run it on your current configuration — that is the baseline.
9
+ # 3. Change a prompt, swap a model, upgrade a gem — re-run.
10
+ # 4. A drop in score blocks the merge before it ships.
11
+ #
12
+ # Every piece of the workflow is shown in one file: define_eval, add_case
13
+ # with expected traits, running the eval, comparing a "good" to a "bad"
14
+ # model, and the inline eval_case helper for quick checks.
15
+ #
16
+ # Run: ruby examples/05_eval_dataset.rb
17
+ #
18
+ # Expected output:
19
+ #
20
+ # Run 1 — good configuration (baseline)
21
+ # Score: 1.0
22
+ # Pass rate: 3/3
23
+ # Passed?: true
24
+ #
25
+ # Run 2 — a prompt tweak broke tone classification on complaints
26
+ # Score: 0.67
27
+ # Pass rate: 2/3
28
+ # ✓ ruby release all expected keys present and matching
29
+ # ✗ outage complaint tone: expected "negative", got "analytical"
30
+ # ✓ product launch all expected keys present and matching
31
+ # Regression detected: 1.0 → 0.67 (33% drop)
32
+ #
33
+ # Inline eval_case (quick one-off check)
34
+ # Passed: true
35
+ # Score: 1.0
36
+ # Details: all expected keys present and matching
37
+ # =============================================================================
38
+
39
+ require_relative "../lib/ruby_llm/contract"
40
+
41
+ class SummarizeArticle < RubyLLM::Contract::Step::Base
42
+ prompt "Summarize: {input}"
43
+
44
+ output_schema do
45
+ string :tldr, max_length: 200
46
+ array :takeaways, of: :string, min_items: 3, max_items: 5
47
+ string :tone, enum: %w[neutral positive negative analytical]
48
+ end
49
+
50
+ validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
51
+
52
+ define_eval "regression" do
53
+ add_case "ruby release",
54
+ input: "Ruby 3.4 ships with frozen string literals, YJIT speedups, parser fixes.",
55
+ expected: { tone: "analytical" }
56
+
57
+ add_case "outage complaint",
58
+ input: "The mesh hardware failed under load. Three customers threatened churn.",
59
+ expected: { tone: "negative" }
60
+
61
+ add_case "product launch",
62
+ input: "We are thrilled to announce our new billing feature ships this week.",
63
+ expected: { tone: "positive" }
64
+ end
65
+ end
66
+
67
+ # =============================================================================
68
+ # Good run — every case lands on the expected tone
69
+ # =============================================================================
70
+
71
+ puts "=" * 60
72
+ puts "Run 1 — good configuration (baseline)"
73
+ puts "=" * 60
74
+
75
+ good_adapter = RubyLLM::Contract::Adapters::Test.new(responses: [
76
+ { tldr: "Ruby 3.4 summary", takeaways: %w[a b c], tone: "analytical" },
77
+ { tldr: "Outage complaint", takeaways: %w[a b c], tone: "negative" },
78
+ { tldr: "Product launch news", takeaways: %w[a b c], tone: "positive" }
79
+ ])
80
+
81
+ baseline = SummarizeArticle.run_eval("regression", context: { adapter: good_adapter })
82
+ puts "Score: #{baseline.score.round(2)}" # => 1.0
83
+ puts "Pass rate: #{baseline.pass_rate}" # => 3/3
84
+ puts "Passed?: #{baseline.passed?}" # => true
85
+
86
+ # =============================================================================
87
+ # Bad run — simulates a prompt tweak that broke "outage" classification
88
+ # =============================================================================
89
+
90
+ puts
91
+ puts "=" * 60
92
+ puts "Run 2 — a prompt tweak broke tone classification on complaints"
93
+ puts "=" * 60
94
+
95
+ bad_adapter = RubyLLM::Contract::Adapters::Test.new(responses: [
96
+ { tldr: "Ruby 3.4 summary", takeaways: %w[a b c], tone: "analytical" },
97
+ { tldr: "Outage complaint", takeaways: %w[a b c], tone: "analytical" }, # expected negative!
98
+ { tldr: "Product launch news", takeaways: %w[a b c], tone: "positive" }
99
+ ])
100
+
101
+ regression = SummarizeArticle.run_eval("regression", context: { adapter: bad_adapter })
102
+ puts "Score: #{regression.score.round(2)}" # => 0.67
103
+ puts "Pass rate: #{regression.pass_rate}" # => 2/3
104
+
105
+ regression.each do |r|
106
+ icon = r.passed? ? "✓" : "✗"
107
+ puts " #{icon} #{r.name.ljust(20)} #{r.details}"
108
+ end
109
+
110
+ puts
111
+ puts "Regression detected: #{baseline.score.round(2)} → #{regression.score.round(2)} " \
112
+ "(#{((baseline.score - regression.score) * 100).round}% drop)"
113
+
114
+ # =============================================================================
115
+ # eval_case — inline single-case check without defining a full dataset
116
+ # =============================================================================
117
+
118
+ puts
119
+ puts "=" * 60
120
+ puts "Inline eval_case (quick one-off check)"
121
+ puts "=" * 60
122
+
123
+ one = SummarizeArticle.eval_case(
124
+ input: "Ruby 3.4 ships with frozen string literals.",
125
+ expected: { tone: "analytical" },
126
+ context: { adapter: RubyLLM::Contract::Adapters::Test.new(
127
+ response: { tldr: "Ruby 3.4 summary", takeaways: %w[a b c], tone: "analytical" }
128
+ ) }
129
+ )
130
+
131
+ puts "Passed: #{one.passed?}" # => true
132
+ puts "Score: #{one.score}" # => 1.0
133
+ puts "Details: #{one.details}"
134
+
135
+ # =============================================================================
136
+ # What this showcases
137
+ #
138
+ # - define_eval keeps dataset + expectations next to the step definition.
139
+ # One class, one truth.
140
+ # - run_eval returns a Report with score, pass_rate, per-case CaseResult.
141
+ # - The same dataset detects a regression when a "good" adapter is swapped
142
+ # for a "bad" one — same signal you get from a prompt change in prod.
143
+ # - eval_case is the lightweight alternative for one-off inline checks.
144
+ # =============================================================================
@@ -0,0 +1,147 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # EXAMPLE 6: retry_policy variants on SummarizeArticle
5
+ #
6
+ # Example 01 covered the most common pattern: fall back from a cheap model
7
+ # to a stronger one (gpt-5-nano → mini → gpt-5). This file runs the three
8
+ # other retry_policy shapes, each on the same SummarizeArticle step with
9
+ # the Test adapter so no API keys are required.
10
+ #
11
+ # Run: ruby examples/06_retry_variants.rb
12
+ #
13
+ # Expected output (abridged):
14
+ #
15
+ # A — attempts: 3 (same model, sampling-variance absorption)
16
+ # attempt 1 model=gpt-5-nano status=validation_failed
17
+ # attempt 3 model=gpt-5-nano status=ok
18
+ #
19
+ # B — reasoning_effort low → medium → high (same model)
20
+ # attempt 1 effort=low status=validation_failed
21
+ # attempt 3 effort=high status=ok
22
+ #
23
+ # C — cross-provider Ollama → Anthropic → OpenAI
24
+ # attempt 1 model=gemma3:4b status=validation_failed
25
+ # attempt 3 model=gpt-5-nano status=ok
26
+ # =============================================================================
27
+
28
+ require_relative "../lib/ruby_llm/contract"
29
+
30
+ # =============================================================================
31
+ # Base step — same SummarizeArticle from the README, used by every variant
32
+ # =============================================================================
33
+
34
+ class SummarizeArticle < RubyLLM::Contract::Step::Base
35
+ model "gpt-5-nano"
36
+ prompt "Summarize: {input}"
37
+
38
+ output_schema do
39
+ string :tldr, max_length: 200
40
+ array :takeaways, of: :string, min_items: 3, max_items: 5
41
+ string :tone, enum: %w[neutral positive negative analytical]
42
+ end
43
+
44
+ validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
45
+ end
46
+
47
+ # Canned responses — first two fail the "TL;DR fits the card" validate
48
+ # (oversized TL;DR), the third succeeds. Every variant lands on attempt 3,
49
+ # so the trace shows the retry policy's shape clearly.
50
+ RESPONSES = [
51
+ { tldr: "x" * 500, takeaways: %w[a b c], tone: "neutral" },
52
+ { tldr: "x" * 500, takeaways: %w[a b c], tone: "neutral" },
53
+ { tldr: "Ruby 3.4 ships with frozen string literals and YJIT speedups.",
54
+ takeaways: %w[frozen-strings yjit parser-fixes], tone: "analytical" }
55
+ ].freeze
56
+
57
+ def print_trace(label, result)
58
+ puts "#{label} — status=#{result.status}, final model=#{result.trace[:model].inspect}"
59
+ result.trace[:attempts].each do |a|
60
+ cfg = a[:config] && a[:config][:reasoning_effort] ? " effort=#{a[:config][:reasoning_effort].ljust(6)}" : ""
61
+ puts " attempt #{a[:attempt]} model=#{a[:model].ljust(20)}#{cfg} status=#{a[:status]}"
62
+ end
63
+ puts
64
+ end
65
+
66
+ # =============================================================================
67
+ # VARIANT A — attempts: 3 on the same model
68
+ #
69
+ # When to use: the model is correct on most samples, but sampling variance
70
+ # (gpt-5 / o-series enforce temperature=1.0 server-side) flips it occasionally.
71
+ # Re-sampling the same model absorbs the variance without paying for a
72
+ # stronger tier.
73
+ #
74
+ # Replaces: the hand-rolled begin/rescue/retry loop with an attempts counter.
75
+ # =============================================================================
76
+
77
+ class SummarizeArticleSameModelRetry < SummarizeArticle
78
+ retry_policy attempts: 3
79
+ end
80
+
81
+ puts "=" * 70
82
+ puts "A — attempts: 3 (same model, sampling-variance absorption)"
83
+ puts "=" * 70
84
+ adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
85
+ print_trace("same-model retry", SummarizeArticleSameModelRetry.run("article", context: { adapter: adapter }))
86
+
87
+ # =============================================================================
88
+ # VARIANT B — reasoning_effort escalation on one model
89
+ #
90
+ # When to use: the model can get the right answer with more thinking budget,
91
+ # but you do not want to pay the high-reasoning price on every call. Start
92
+ # at low, let validate filter out the cheap misses, pay for medium or high
93
+ # only on the cases that actually need it.
94
+ # =============================================================================
95
+
96
+ class SummarizeArticleReasoningEscalation < SummarizeArticle
97
+ retry_policy models: [
98
+ { model: "gpt-5-nano", reasoning_effort: "low" },
99
+ { model: "gpt-5-nano", reasoning_effort: "medium" },
100
+ { model: "gpt-5-nano", reasoning_effort: "high" }
101
+ ]
102
+ end
103
+
104
+ puts "=" * 70
105
+ puts "B — reasoning_effort escalation (low → medium → high)"
106
+ puts "=" * 70
107
+ adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
108
+ print_trace("reasoning escalation", SummarizeArticleReasoningEscalation.run("article", context: { adapter: adapter }))
109
+
110
+ # =============================================================================
111
+ # VARIANT C — cross-provider fallback (Ollama → Anthropic → OpenAI)
112
+ #
113
+ # When to use: you want to start on a local model (cheap, private, no quota)
114
+ # and fall back to hosted providers only when the local one cannot satisfy
115
+ # the contract. Each tier is a different provider — ruby_llm detects the
116
+ # provider from the model name.
117
+ #
118
+ # To run against real backends: configure ruby_llm for all three providers
119
+ # (ollama_api_base + anthropic_api_key + openai_api_key) and swap the Test
120
+ # adapter for Adapters::RubyLLM. The retry_policy itself is unchanged.
121
+ #
122
+ # Order matters: local first (costs nothing); hosted last (most accurate).
123
+ # =============================================================================
124
+
125
+ class SummarizeArticleCrossProvider < SummarizeArticle
126
+ retry_policy models: %w[gemma3:4b claude-haiku-4-5 gpt-5-nano]
127
+ end
128
+
129
+ puts "=" * 70
130
+ puts "C — cross-provider fallback (Ollama → Anthropic → OpenAI)"
131
+ puts "=" * 70
132
+ adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
133
+ print_trace("cross-provider", SummarizeArticleCrossProvider.run("article", context: { adapter: adapter }))
134
+
135
+ # =============================================================================
136
+ # TAKEAWAYS
137
+ #
138
+ # 1. `attempts: 3` is the shortest path from a hand-rolled begin/rescue/retry
139
+ # loop to a contract-backed retry with a trace you can log.
140
+ # 2. `reasoning_effort` escalation is cheaper than model escalation when the
141
+ # model is right but needs more thinking, not a stronger backbone.
142
+ # 3. Cross-provider retry uses the same DSL — ruby_llm resolves the provider
143
+ # from the model name. Start cheapest (often a local Ollama model), end
144
+ # on the most accurate hosted provider.
145
+ # 4. The per-attempt trace (model, config, status, cost) is identical across
146
+ # variants — your logging does not care which retry shape you picked.
147
+ # =============================================================================
data/examples/README.md CHANGED
@@ -1,140 +1,32 @@
1
1
  # Examples
2
2
 
3
- ## 00_basics.rb — From zero to ruby_llm-contract
3
+ Seven runnable examples, every one using the `SummarizeArticle` step from the [README](../README.md) — a Rails app turning article text into a UI card with TL;DR, takeaways, and tone. No API keys are required by default (the Test adapter is the default); only `02_real_llm_minimal.rb` needs a provider key.
4
4
 
5
- Step-by-step tutorial covering every feature. Start here.
5
+ Pedagogical order: hook → activation → evolution → composition → quality → advanced.
6
6
 
7
- | Step | Feature | What it shows |
8
- |------|---------|---------------|
9
- | 1 | Plain string prompt | Simplest case — `user "{input}"` and nothing else |
10
- | 2 | System + user | Separate instructions from data |
11
- | 3 | Rules + output_schema | Requirements as statements + declarative output structure |
12
- | 4 | Invariants | Custom business logic on top of schema |
13
- | 5 | Examples | Few-shot (example input/output pairs) |
14
- | 6 | Sections | Labeled context blocks (heredoc replacement, with before/after) |
15
- | 7 | Hash input | Multiple fields with auto-interpolation |
16
- | 8 | 2-arity invariants | Cross-validate output against input |
17
- | 9 | Context override | Per-run adapter and model switching |
18
- | 10 | StepResult | Full inspection: status, output, errors, trace |
19
- | 11 | Pipeline | Chain steps with fail-fast data threading |
7
+ | # | File | Answers |
8
+ |---|------|---------|
9
+ | 00 | `00_basics.rb` | **"How do I start?"** — seven incremental layers: plain prompt → output_schema → validate → structured prompt → Hash input + cross-input validate → retry_policy → trace inspection, plus real-LLM swap pointer. |
10
+ | 01 | `01_fallback_showcase.rb` | **"Show me the gem in 30 seconds."** — Part A: schema-only ships a flaky sample. Part B: full contract rejects it and retry_policy escalates to the next model. Per-attempt trace printed inline. |
11
+ | 02 | `02_real_llm_minimal.rb` | **"How do I plug in a real LLM?"** — ~30 lines. `Adapters::RubyLLM.new` in context, same step. Also shows per-call provider switch (OpenAI → Anthropic → Ollama). |
12
+ | 03 | `03_summarize_with_keywords.rb` | **"How does the contract evolve when the product grows?"** — marketing wants a "topic pills" row, so `SummarizeArticle` gains a keywords field with probability and cross-validation. Prompt, schema, and validates stay in lockstep. |
13
+ | 04 | `04_summarize_and_translate.rb` | **"How do steps compose into a pipeline?"** — 3 steps threaded by `Pipeline::Base`: English summary → translate to French → quality review. Fail-fast: a rejected summary means translate and review never run. |
14
+ | 05 | `05_eval_dataset.rb` | **"How do I stop silent prompt regressions?"** — define_eval with real cases, baseline vs regressed adapter, regression detection signal, inline eval_case. |
15
+ | 06 | `06_retry_variants.rb` | **"What retry shapes exist beyond cross-model?"** — `attempts: 3` (variance absorption), `reasoning_effort` escalation (low→medium→high), cross-provider fallback (Ollama → Anthropic → OpenAI). |
20
16
 
21
- Every step has a corresponding test in `spec/integration/examples_00_basics_spec.rb`.
22
-
23
- ## 01_classify_threads.rb — Thread classification
24
-
25
- Real-world before/after: classify Reddit threads as PROMO/FILLER/SKIP.
26
- Shows ID matching, enum validation, score consistency invariants.
27
-
28
- ## 02_generate_comment.rb — Comment generation
29
-
30
- Real-world before/after: generate Reddit comments with persona.
31
- Shows sections, banned openings, link presence, length constraints, 2-arity invariants.
32
-
33
- ## 03_target_audience.rb — Audience profiling
34
-
35
- Real-world before/after: generate target audience profiles.
36
- Shows cascade failure prevention, locale validation, structural invariants.
37
-
38
- ## 04_real_llm.rb — Real LLM calls via ruby_llm
39
-
40
- Connect to real LLM providers (OpenAI, Anthropic, Google, etc.) using Adapters::RubyLLM.
41
- Shows configuration, model switching, temperature/max_tokens control, provider-agnostic steps.
42
-
43
- | Step | Feature | What it shows |
44
- |------|---------|---------------|
45
- | 1 | Configure ruby_llm | Set API keys for your provider |
46
- | 2 | Set RubyLLM adapter | Swap Test adapter for production |
47
- | 3 | Define a step | Identical to Test adapter — provider-agnostic |
48
- | 4 | Run with real LLM | Real call, real tokens, full contract enforcement |
49
- | 5 | Compare models | A/B test different models per call |
50
- | 6 | Generation params | Temperature, max_tokens forwarding |
51
- | 7 | Switch providers | Same step, different provider — just change model name |
52
- | 8 | Error handling | Contract enforcement with real LLM responses |
53
- | 9 | Full power | Every feature combined in AnalyzeTicket |
54
- | 10 | Pipeline | Chain steps with real LLM calls |
55
-
56
- **Requires:** `export OPENAI_API_KEY=sk-...` (or another provider key)
57
-
58
- ## 05_output_schema.rb — Declarative output schema
59
-
60
- Replace manual invariants with a schema DSL (ruby_llm-schema).
61
-
62
- | Step | Feature | What it shows |
63
- |------|---------|---------------|
64
- | 1 | Before (invariants) | Manual enum, range, required checks |
65
- | 2 | After (schema) | Same constraints in declarative DSL |
66
- | 3 | Schema + invariants | Schema for structure, invariants for business logic |
67
- | 4 | Complex schema | Nested objects, arrays, constraints |
68
- | 5 | Provider-agnostic | Same schema works with Test and RubyLLM adapters |
69
- | 6 | Pipeline + schemas | Fully typed multi-step composition |
17
+ Every example has an "Expected output" section in the file header — you can read what each one prints without running it.
70
18
 
71
19
  ## Running
72
20
 
73
21
  ```bash
74
22
  # Test adapter — no API keys needed:
75
23
  ruby examples/00_basics.rb
76
- ruby examples/01_classify_threads.rb
77
- ruby examples/02_generate_comment.rb
78
- ruby examples/03_target_audience.rb
79
- ruby examples/05_output_schema.rb
80
-
81
- # Real LLM — requires API key:
82
- ruby examples/04_real_llm.rb
24
+ ruby examples/01_fallback_showcase.rb
25
+ ruby examples/03_summarize_with_keywords.rb
26
+ ruby examples/04_summarize_and_translate.rb
27
+ ruby examples/05_eval_dataset.rb
28
+ ruby examples/06_retry_variants.rb
29
+
30
+ # Real LLM — requires a provider API key or a local Ollama server:
31
+ ruby examples/02_real_llm_minimal.rb
83
32
  ```
84
-
85
- ## 06_reddit_promo.rb — Real-world Reddit promo pipeline
86
-
87
- 3-step pipeline from the reddit_promo_planner case study:
88
-
89
- | Step | Role | Invariants catch |
90
- |------|------|------------------|
91
- | 1 | TargetAudience | `locale: "USA"` instead of `"en"`, vague summary |
92
- | 2 | ClassifyThreads | PROMO with score 2, SKIP with score 8 |
93
- | 3 | GenerateComment | `{PRODUCT}` instead of URL, banned openings |
94
-
95
- Runs with test adapter by default. `REAL_LLM=1` for Ollama, `MODEL=gemma:latest` to pick model.
96
-
97
- ## 07_keyword_extraction.rb — Keyword extraction with probability
98
-
99
- Extract up to 15 keywords from an article, each with relevance probability.
100
-
101
- | Feature | What it shows |
102
- |---------|---------------|
103
- | Array schema | `min_items: 1, max_items: 15` with nested objects |
104
- | Number range | `probability: 0.0–1.0` |
105
- | Sorting invariant | Schema can't express "sorted descending" |
106
- | Uniqueness invariant | Schema can't express "no duplicates" |
107
- | Cross-validation | Keywords must appear in source text (catches hallucination) |
108
- | Pipeline | Keywords → Related Topics |
109
-
110
- ## 08_translation.rb — Translation pipeline with quality review
111
-
112
- 3-step pipeline: extract segments → translate → review quality.
113
-
114
- | Step | LLM Skill | Invariants catch |
115
- |------|-----------|------------------|
116
- | Extract | Analysis | Duplicate keys, wrong target_lang |
117
- | Translate | Creative | Missing segments, too long, echoed back untranslated |
118
- | Review | Evaluation | Inconsistent counts, failed reviews without issues |
119
-
120
- ## Running
121
-
122
- ```bash
123
- # Test adapter — no API keys needed:
124
- ruby examples/00_basics.rb
125
- ruby examples/01_classify_threads.rb
126
- ruby examples/02_generate_comment.rb
127
- ruby examples/03_target_audience.rb
128
- ruby examples/05_output_schema.rb
129
- ruby examples/06_reddit_promo.rb
130
- ruby examples/07_keyword_extraction.rb
131
- ruby examples/08_translation.rb
132
-
133
- # Real LLM — requires Ollama or API key:
134
- ruby examples/04_real_llm.rb
135
- REAL_LLM=1 ruby examples/06_reddit_promo.rb
136
- REAL_LLM=1 MODEL=llama3.2:3b ruby examples/06_reddit_promo.rb
137
- ```
138
-
139
- Examples 00–03, 05–06 use the test adapter by default — no API keys needed.
140
- Example 04 and 06 with `REAL_LLM=1` require Ollama or an API key.
@@ -72,9 +72,9 @@ module RubyLLM
72
72
  end
73
73
 
74
74
  chain_width = [rows.map { |r| r[:chain].length }.max || 0, 20].max
75
- lines = [format(" %-#{chain_width}s %-11s %-10s %-14s %-9s %s",
76
- "Chain", "single-shot", "escalation", "effective cost", "latency", "score")]
77
- lines << " #{"-" * (chain_width + 60)}"
75
+ lines = [format(" %-#{chain_width}s %-13s %-10s %-14s %-9s %s",
76
+ "Chain", "first-attempt", "fallback %", "effective cost", "latency", "score")]
77
+ lines << " #{"-" * (chain_width + 62)}"
78
78
 
79
79
  rows.each do |row|
80
80
  lines << format_production_row(row, chain_width)
@@ -95,7 +95,7 @@ module RubyLLM
95
95
 
96
96
  def format_production_row(row, chain_width)
97
97
  report = row[:report]
98
- format(" %-#{chain_width}s %-11s %-10s %-14s %-9s %6.2f",
98
+ format(" %-#{chain_width}s %-13s %-10s %-14s %-9s %6.2f",
99
99
  row[:chain],
100
100
  format_money(report.single_shot_cost || report.total_cost),
101
101
  format_escalation(row, report),
@@ -15,8 +15,12 @@ module RubyLLM
15
15
  class RetryOptimizer
16
16
  Result = Struct.new(:step_name, :eval_names, :candidate_labels, :score_matrix,
17
17
  :constraining_eval, :chain, :chain_details, keyword_init: true) do
18
+ # Terminology alias — `hardest_eval` is the narrative name used in docs;
19
+ # `constraining_eval` is preserved as the original field name.
20
+ alias_method :hardest_eval, :constraining_eval
21
+
18
22
  def print_summary(io = $stdout)
19
- io.puts "#{step_name} — retry chain optimization"
23
+ io.puts "#{step_name} — fallback list optimization"
20
24
  io.puts
21
25
  print_table(io)
22
26
  io.puts
@@ -59,7 +63,7 @@ module RubyLLM
59
63
  end
60
64
 
61
65
  io.puts
62
- io.puts " Constraining eval: #{constraining_eval}" if constraining_eval
66
+ io.puts " Hardest eval: #{constraining_eval}" if constraining_eval
63
67
  end
64
68
 
65
69
  def print_chain(io)
@@ -68,7 +72,7 @@ module RubyLLM
68
72
  return
69
73
  end
70
74
 
71
- io.puts " Suggested chain:"
75
+ io.puts " Suggested fallback list:"
72
76
  chain_details.each_with_index do |detail, i|
73
77
  suffix = i == chain_details.size - 1 ? "passes all #{eval_names.size} evals" : "covers #{detail[:passes]} eval(s)"
74
78
  io.puts " #{detail[:label]} — #{suffix}"
@@ -186,20 +186,36 @@ module RubyLLM
186
186
  "{ |c| c.default_adapter = ... } or pass context: { adapter: ... }"
187
187
  end
188
188
 
189
+ # ADR-0021 deliverable 2: narrow ArgumentError rescue to DSL-setup phase only.
190
+ #
191
+ # DSL misconfiguration (e.g. `prompt has not been set`, missing required
192
+ # attributes) surfaces as ArgumentError when constructing Runner. We catch
193
+ # that and return :input_error — these are contract-definition issues the
194
+ # caller can handle as "bad input to the step definition".
195
+ #
196
+ # Runner#call itself does NOT get a blanket rescue: input-type validation
197
+ # failures return :input_error from within InputValidator; adapter/runtime
198
+ # programmer bugs (NoMethodError, adapter-code ArgumentError) must propagate
199
+ # instead of being silently masked as :input_error.
189
200
  def run_once(input, adapter:, model:, context_temperature: nil, extra_options: {})
190
201
  effective_temp = context_temperature || temperature
191
- Runner.new(
192
- input_type: input_type, output_type: output_type,
193
- prompt_block: prompt, contract_definition: effective_contract,
194
- adapter: adapter, model: model, output_schema: output_schema,
195
- max_output: max_output, max_input: max_input, max_cost: max_cost,
196
- on_unknown_pricing: on_unknown_pricing,
197
- temperature: effective_temp, extra_options: extra_options,
198
- observers: class_observers
199
- ).call(input)
200
- rescue ArgumentError => e
201
- Result.new(status: :input_error, raw_output: nil, parsed_output: nil,
202
- validation_errors: [e.message])
202
+ runner =
203
+ begin
204
+ Runner.new(
205
+ input_type: input_type, output_type: output_type,
206
+ prompt_block: prompt, contract_definition: effective_contract,
207
+ adapter: adapter, model: model, output_schema: output_schema,
208
+ max_output: max_output, max_input: max_input, max_cost: max_cost,
209
+ on_unknown_pricing: on_unknown_pricing,
210
+ temperature: effective_temp, extra_options: extra_options,
211
+ observers: class_observers
212
+ )
213
+ rescue ArgumentError => e
214
+ return Result.new(status: :input_error, raw_output: nil, parsed_output: nil,
215
+ validation_errors: [e.message])
216
+ end
217
+
218
+ runner.call(input)
203
219
  end
204
220
 
205
221
  def log_result(result)
@@ -2,6 +2,6 @@
2
2
 
3
3
  module RubyLLM
4
4
  module Contract
5
- VERSION = "0.7.0"
5
+ VERSION = "0.7.3"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_llm-contract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justyna
@@ -69,15 +69,12 @@ files:
69
69
  - README.md
70
70
  - Rakefile
71
71
  - examples/00_basics.rb
72
- - examples/01_classify_threads.rb
73
- - examples/02_generate_comment.rb
74
- - examples/03_target_audience.rb
75
- - examples/04_real_llm.rb
76
- - examples/05_output_schema.rb
77
- - examples/07_keyword_extraction.rb
78
- - examples/08_translation.rb
79
- - examples/09_eval_dataset.rb
80
- - examples/10_reddit_full_showcase.rb
72
+ - examples/01_fallback_showcase.rb
73
+ - examples/02_real_llm_minimal.rb
74
+ - examples/03_summarize_with_keywords.rb
75
+ - examples/04_summarize_and_translate.rb
76
+ - examples/05_eval_dataset.rb
77
+ - examples/06_retry_variants.rb
81
78
  - examples/README.md
82
79
  - lib/ruby_llm/contract.rb
83
80
  - lib/ruby_llm/contract/adapters.rb