ruby_llm-contract 0.7.0 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +66 -0
- data/Gemfile.lock +2 -2
- data/README.md +51 -252
- data/examples/00_basics.rb +110 -428
- data/examples/01_fallback_showcase.rb +208 -0
- data/examples/02_real_llm_minimal.rb +45 -0
- data/examples/03_summarize_with_keywords.rb +128 -0
- data/examples/04_summarize_and_translate.rb +196 -0
- data/examples/05_eval_dataset.rb +144 -0
- data/examples/06_retry_variants.rb +147 -0
- data/examples/README.md +20 -128
- data/lib/ruby_llm/contract/eval/model_comparison.rb +4 -4
- data/lib/ruby_llm/contract/eval/retry_optimizer.rb +7 -3
- data/lib/ruby_llm/contract/step/base.rb +28 -12
- data/lib/ruby_llm/contract/version.rb +1 -1
- metadata +7 -10
- data/examples/01_classify_threads.rb +0 -220
- data/examples/02_generate_comment.rb +0 -203
- data/examples/03_target_audience.rb +0 -201
- data/examples/04_real_llm.rb +0 -410
- data/examples/05_output_schema.rb +0 -258
- data/examples/07_keyword_extraction.rb +0 -239
- data/examples/08_translation.rb +0 -353
- data/examples/09_eval_dataset.rb +0 -287
- data/examples/10_reddit_full_showcase.rb +0 -363
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# EXAMPLE 5: Dataset-driven evals on SummarizeArticle
|
|
5
|
+
#
|
|
6
|
+
# The pattern that stops silent prompt regressions:
|
|
7
|
+
# 1. Define an eval with a handful of real articles and expected outcomes.
|
|
8
|
+
# 2. Run it on your current configuration — that is the baseline.
|
|
9
|
+
# 3. Change a prompt, swap a model, upgrade a gem — re-run.
|
|
10
|
+
# 4. A drop in score blocks the merge before it ships.
|
|
11
|
+
#
|
|
12
|
+
# Every piece of the workflow is shown in one file: define_eval, add_case
|
|
13
|
+
# with expected traits, running the eval, comparing a "good" to a "bad"
|
|
14
|
+
# adapter, and the inline eval_case helper for quick checks.
|
|
15
|
+
#
|
|
16
|
+
# Run: ruby examples/05_eval_dataset.rb
|
|
17
|
+
#
|
|
18
|
+
# Expected output:
|
|
19
|
+
#
|
|
20
|
+
# Run 1 — good configuration (baseline)
|
|
21
|
+
# Score: 1.0
|
|
22
|
+
# Pass rate: 3/3
|
|
23
|
+
# Passed?: true
|
|
24
|
+
#
|
|
25
|
+
# Run 2 — a prompt tweak broke tone classification on complaints
|
|
26
|
+
# Score: 0.67
|
|
27
|
+
# Pass rate: 2/3
|
|
28
|
+
# ✓ ruby release all expected keys present and matching
|
|
29
|
+
# ✗ outage complaint tone: expected "negative", got "analytical"
|
|
30
|
+
# ✓ product launch all expected keys present and matching
|
|
31
|
+
# Regression detected: 1.0 → 0.67 (33% drop)
|
|
32
|
+
#
|
|
33
|
+
# Inline eval_case (quick one-off check)
|
|
34
|
+
# Passed: true
|
|
35
|
+
# Score: 1.0
|
|
36
|
+
# Details: all expected keys present and matching
|
|
37
|
+
# =============================================================================
|
|
38
|
+
|
|
39
|
+
require_relative "../lib/ruby_llm/contract"
|
|
40
|
+
|
|
41
|
+
class SummarizeArticle < RubyLLM::Contract::Step::Base
|
|
42
|
+
prompt "Summarize: {input}"
|
|
43
|
+
|
|
44
|
+
output_schema do
|
|
45
|
+
string :tldr, max_length: 200
|
|
46
|
+
array :takeaways, of: :string, min_items: 3, max_items: 5
|
|
47
|
+
string :tone, enum: %w[neutral positive negative analytical]
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
|
|
51
|
+
|
|
52
|
+
define_eval "regression" do
|
|
53
|
+
add_case "ruby release",
|
|
54
|
+
input: "Ruby 3.4 ships with frozen string literals, YJIT speedups, parser fixes.",
|
|
55
|
+
expected: { tone: "analytical" }
|
|
56
|
+
|
|
57
|
+
add_case "outage complaint",
|
|
58
|
+
input: "The mesh hardware failed under load. Three customers threatened churn.",
|
|
59
|
+
expected: { tone: "negative" }
|
|
60
|
+
|
|
61
|
+
add_case "product launch",
|
|
62
|
+
input: "We are thrilled to announce our new billing feature ships this week.",
|
|
63
|
+
expected: { tone: "positive" }
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
# =============================================================================
|
|
68
|
+
# Good run — every case lands on the expected tone
|
|
69
|
+
# =============================================================================
|
|
70
|
+
|
|
71
|
+
puts "=" * 60
|
|
72
|
+
puts "Run 1 — good configuration (baseline)"
|
|
73
|
+
puts "=" * 60
|
|
74
|
+
|
|
75
|
+
good_adapter = RubyLLM::Contract::Adapters::Test.new(responses: [
|
|
76
|
+
{ tldr: "Ruby 3.4 summary", takeaways: %w[a b c], tone: "analytical" },
|
|
77
|
+
{ tldr: "Outage complaint", takeaways: %w[a b c], tone: "negative" },
|
|
78
|
+
{ tldr: "Product launch news", takeaways: %w[a b c], tone: "positive" }
|
|
79
|
+
])
|
|
80
|
+
|
|
81
|
+
baseline = SummarizeArticle.run_eval("regression", context: { adapter: good_adapter })
|
|
82
|
+
puts "Score: #{baseline.score.round(2)}" # => 1.0
|
|
83
|
+
puts "Pass rate: #{baseline.pass_rate}" # => 3/3
|
|
84
|
+
puts "Passed?: #{baseline.passed?}" # => true
|
|
85
|
+
|
|
86
|
+
# =============================================================================
|
|
87
|
+
# Bad run — simulates a prompt tweak that broke "outage" classification
|
|
88
|
+
# =============================================================================
|
|
89
|
+
|
|
90
|
+
puts
|
|
91
|
+
puts "=" * 60
|
|
92
|
+
puts "Run 2 — a prompt tweak broke tone classification on complaints"
|
|
93
|
+
puts "=" * 60
|
|
94
|
+
|
|
95
|
+
bad_adapter = RubyLLM::Contract::Adapters::Test.new(responses: [
|
|
96
|
+
{ tldr: "Ruby 3.4 summary", takeaways: %w[a b c], tone: "analytical" },
|
|
97
|
+
{ tldr: "Outage complaint", takeaways: %w[a b c], tone: "analytical" }, # expected negative!
|
|
98
|
+
{ tldr: "Product launch news", takeaways: %w[a b c], tone: "positive" }
|
|
99
|
+
])
|
|
100
|
+
|
|
101
|
+
regression = SummarizeArticle.run_eval("regression", context: { adapter: bad_adapter })
|
|
102
|
+
puts "Score: #{regression.score.round(2)}" # => 0.67
|
|
103
|
+
puts "Pass rate: #{regression.pass_rate}" # => 2/3
|
|
104
|
+
|
|
105
|
+
regression.each do |r|
|
|
106
|
+
icon = r.passed? ? "✓" : "✗"
|
|
107
|
+
puts " #{icon} #{r.name.ljust(20)} #{r.details}"
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
puts
|
|
111
|
+
puts "Regression detected: #{baseline.score.round(2)} → #{regression.score.round(2)} " \
|
|
112
|
+
"(#{((baseline.score - regression.score) * 100).round}% drop)"
|
|
113
|
+
|
|
114
|
+
# =============================================================================
|
|
115
|
+
# eval_case — inline single-case check without defining a full dataset
|
|
116
|
+
# =============================================================================
|
|
117
|
+
|
|
118
|
+
puts
|
|
119
|
+
puts "=" * 60
|
|
120
|
+
puts "Inline eval_case (quick one-off check)"
|
|
121
|
+
puts "=" * 60
|
|
122
|
+
|
|
123
|
+
one = SummarizeArticle.eval_case(
|
|
124
|
+
input: "Ruby 3.4 ships with frozen string literals.",
|
|
125
|
+
expected: { tone: "analytical" },
|
|
126
|
+
context: { adapter: RubyLLM::Contract::Adapters::Test.new(
|
|
127
|
+
response: { tldr: "Ruby 3.4 summary", takeaways: %w[a b c], tone: "analytical" }
|
|
128
|
+
) }
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
puts "Passed: #{one.passed?}" # => true
|
|
132
|
+
puts "Score: #{one.score}" # => 1.0
|
|
133
|
+
puts "Details: #{one.details}"
|
|
134
|
+
|
|
135
|
+
# =============================================================================
|
|
136
|
+
# What this showcases
|
|
137
|
+
#
|
|
138
|
+
# - define_eval keeps dataset + expectations next to the step definition.
|
|
139
|
+
# One class, one truth.
|
|
140
|
+
# - run_eval returns a Report with score, pass_rate, per-case CaseResult.
|
|
141
|
+
# - The same dataset detects a regression when a "good" adapter is swapped
|
|
142
|
+
# for a "bad" one — same signal you get from a prompt change in prod.
|
|
143
|
+
# - eval_case is the lightweight alternative for one-off inline checks.
|
|
144
|
+
# =============================================================================
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# EXAMPLE 6: retry_policy variants on SummarizeArticle
|
|
5
|
+
#
|
|
6
|
+
# Example 01 covered the most common pattern: fall back from a cheap model
|
|
7
|
+
# to a stronger one (gpt-5-nano → mini → gpt-5). This file runs the three
|
|
8
|
+
# other retry_policy shapes, each on the same SummarizeArticle step with
|
|
9
|
+
# the Test adapter so no API keys are required.
|
|
10
|
+
#
|
|
11
|
+
# Run: ruby examples/06_retry_variants.rb
|
|
12
|
+
#
|
|
13
|
+
# Expected output (abridged):
|
|
14
|
+
#
|
|
15
|
+
# A — attempts: 3 (same model, sampling-variance absorption)
|
|
16
|
+
# attempt 1 model=gpt-5-nano status=validation_failed
|
|
17
|
+
# attempt 3 model=gpt-5-nano status=ok
|
|
18
|
+
#
|
|
19
|
+
# B — reasoning_effort low → medium → high (same model)
|
|
20
|
+
# attempt 1 effort=low status=validation_failed
|
|
21
|
+
# attempt 3 effort=high status=ok
|
|
22
|
+
#
|
|
23
|
+
# C — cross-provider Ollama → Anthropic → OpenAI
|
|
24
|
+
# attempt 1 model=gemma3:4b status=validation_failed
|
|
25
|
+
# attempt 3 model=gpt-5-nano status=ok
|
|
26
|
+
# =============================================================================
|
|
27
|
+
|
|
28
|
+
require_relative "../lib/ruby_llm/contract"
|
|
29
|
+
|
|
30
|
+
# =============================================================================
|
|
31
|
+
# Base step — same SummarizeArticle from the README, used by every variant
|
|
32
|
+
# =============================================================================
|
|
33
|
+
|
|
34
|
+
class SummarizeArticle < RubyLLM::Contract::Step::Base
|
|
35
|
+
model "gpt-5-nano"
|
|
36
|
+
prompt "Summarize: {input}"
|
|
37
|
+
|
|
38
|
+
output_schema do
|
|
39
|
+
string :tldr, max_length: 200
|
|
40
|
+
array :takeaways, of: :string, min_items: 3, max_items: 5
|
|
41
|
+
string :tone, enum: %w[neutral positive negative analytical]
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
validate("TL;DR fits the card") { |o, _| o[:tldr].length <= 200 }
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Canned responses — first two fail the "TL;DR fits the card" validate
|
|
48
|
+
# (oversized TL;DR), the third succeeds. Every variant lands on attempt 3,
|
|
49
|
+
# so the trace shows the retry policy's shape clearly.
|
|
50
|
+
RESPONSES = [
|
|
51
|
+
{ tldr: "x" * 500, takeaways: %w[a b c], tone: "neutral" },
|
|
52
|
+
{ tldr: "x" * 500, takeaways: %w[a b c], tone: "neutral" },
|
|
53
|
+
{ tldr: "Ruby 3.4 ships with frozen string literals and YJIT speedups.",
|
|
54
|
+
takeaways: %w[frozen-strings yjit parser-fixes], tone: "analytical" }
|
|
55
|
+
].freeze
|
|
56
|
+
|
|
57
|
+
# Prints a one-line summary of a step result, then one line per retry
# attempt recorded in the result's trace, then a blank separator line.
#
# @param label [String] heading printed in front of the summary
# @param result [#status, #trace] step result; +trace+ is read with
#   +[:model]+ and +[:attempts]+, and each attempt with +[:attempt]+,
#   +[:model]+, +[:config]+ and +[:status]+
# @return [nil]
def print_trace(label, result)
  puts "#{label} — status=#{result.status}, final model=#{result.trace[:model].inspect}"

  # Array() lets a trace without :attempts print the summary line only
  # instead of raising NoMethodError on nil.
  Array(result.trace[:attempts]).each do |attempt|
    effort = attempt[:config] && attempt[:config][:reasoning_effort]
    # to_s: the effort may be a Symbol and the model may be nil; neither
    # responds to #ljust. String inputs render exactly as before.
    effort_column = effort ? " effort=#{effort.to_s.ljust(6)}" : ""
    puts " attempt #{attempt[:attempt]} model=#{attempt[:model].to_s.ljust(20)}#{effort_column} status=#{attempt[:status]}"
  end

  puts
end
|
|
65
|
+
|
|
66
|
+
# =============================================================================
|
|
67
|
+
# VARIANT A — attempts: 3 on the same model
|
|
68
|
+
#
|
|
69
|
+
# When to use: the model is correct on most samples, but sampling variance
|
|
70
|
+
# (gpt-5 / o-series enforce temperature=1.0 server-side) flips it occasionally.
|
|
71
|
+
# Re-sampling the same model absorbs the variance without paying for a
|
|
72
|
+
# stronger tier.
|
|
73
|
+
#
|
|
74
|
+
# Replaces: the hand-rolled begin/rescue/retry loop with an attempts counter.
|
|
75
|
+
# =============================================================================
|
|
76
|
+
|
|
77
|
+
class SummarizeArticleSameModelRetry < SummarizeArticle
|
|
78
|
+
retry_policy attempts: 3
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
puts "=" * 70
|
|
82
|
+
puts "A — attempts: 3 (same model, sampling-variance absorption)"
|
|
83
|
+
puts "=" * 70
|
|
84
|
+
adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
|
|
85
|
+
print_trace("same-model retry", SummarizeArticleSameModelRetry.run("article", context: { adapter: adapter }))
|
|
86
|
+
|
|
87
|
+
# =============================================================================
|
|
88
|
+
# VARIANT B — reasoning_effort escalation on one model
|
|
89
|
+
#
|
|
90
|
+
# When to use: the model can get the right answer with more thinking budget,
|
|
91
|
+
# but you do not want to pay the high-reasoning price on every call. Start
|
|
92
|
+
# at low, let validate filter out the cheap misses, pay for medium or high
|
|
93
|
+
# only on the cases that actually need it.
|
|
94
|
+
# =============================================================================
|
|
95
|
+
|
|
96
|
+
class SummarizeArticleReasoningEscalation < SummarizeArticle
|
|
97
|
+
retry_policy models: [
|
|
98
|
+
{ model: "gpt-5-nano", reasoning_effort: "low" },
|
|
99
|
+
{ model: "gpt-5-nano", reasoning_effort: "medium" },
|
|
100
|
+
{ model: "gpt-5-nano", reasoning_effort: "high" }
|
|
101
|
+
]
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
puts "=" * 70
|
|
105
|
+
puts "B — reasoning_effort escalation (low → medium → high)"
|
|
106
|
+
puts "=" * 70
|
|
107
|
+
adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
|
|
108
|
+
print_trace("reasoning escalation", SummarizeArticleReasoningEscalation.run("article", context: { adapter: adapter }))
|
|
109
|
+
|
|
110
|
+
# =============================================================================
|
|
111
|
+
# VARIANT C — cross-provider fallback (Ollama → Anthropic → OpenAI)
|
|
112
|
+
#
|
|
113
|
+
# When to use: you want to start on a local model (cheap, private, no quota)
|
|
114
|
+
# and fall back to hosted providers only when the local one cannot satisfy
|
|
115
|
+
# the contract. Each tier is a different provider — ruby_llm detects the
|
|
116
|
+
# provider from the model name.
|
|
117
|
+
#
|
|
118
|
+
# To run against real backends: configure ruby_llm for all three providers
|
|
119
|
+
# (ollama_api_base + anthropic_api_key + openai_api_key) and swap the Test
|
|
120
|
+
# adapter for Adapters::RubyLLM. The retry_policy itself is unchanged.
|
|
121
|
+
#
|
|
122
|
+
# Order matters: local first (costs nothing); hosted last (most accurate).
|
|
123
|
+
# =============================================================================
|
|
124
|
+
|
|
125
|
+
class SummarizeArticleCrossProvider < SummarizeArticle
|
|
126
|
+
retry_policy models: %w[gemma3:4b claude-haiku-4-5 gpt-5-nano]
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
puts "=" * 70
|
|
130
|
+
puts "C — cross-provider fallback (Ollama → Anthropic → OpenAI)"
|
|
131
|
+
puts "=" * 70
|
|
132
|
+
adapter = RubyLLM::Contract::Adapters::Test.new(responses: RESPONSES)
|
|
133
|
+
print_trace("cross-provider", SummarizeArticleCrossProvider.run("article", context: { adapter: adapter }))
|
|
134
|
+
|
|
135
|
+
# =============================================================================
|
|
136
|
+
# TAKEAWAYS
|
|
137
|
+
#
|
|
138
|
+
# 1. `attempts: 3` is the shortest path from a hand-rolled begin/rescue/retry
|
|
139
|
+
# loop to a contract-backed retry with a trace you can log.
|
|
140
|
+
# 2. `reasoning_effort` escalation is cheaper than model escalation when the
|
|
141
|
+
# model is right but needs more thinking, not a stronger backbone.
|
|
142
|
+
# 3. Cross-provider retry uses the same DSL — ruby_llm resolves the provider
|
|
143
|
+
# from the model name. Start cheapest (often a local Ollama model), end
|
|
144
|
+
# on the most accurate hosted provider.
|
|
145
|
+
# 4. The per-attempt trace (model, config, status, cost) is identical across
|
|
146
|
+
# variants — your logging does not care which retry shape you picked.
|
|
147
|
+
# =============================================================================
|
data/examples/README.md
CHANGED
|
@@ -1,140 +1,32 @@
|
|
|
1
1
|
# Examples
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Seven runnable examples, every one using the `SummarizeArticle` step from the [README](../README.md) — a Rails app turning article text into a UI card with TL;DR, takeaways, and tone. Six of them need no API keys (the Test adapter is the default); only `02_real_llm_minimal.rb` needs a provider key or a local Ollama server.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Pedagogical order: hook → activation → evolution → composition → quality → advanced.
|
|
6
6
|
|
|
7
|
-
|
|
|
8
|
-
|
|
9
|
-
|
|
|
10
|
-
|
|
|
11
|
-
|
|
|
12
|
-
|
|
|
13
|
-
|
|
|
14
|
-
|
|
|
15
|
-
|
|
|
16
|
-
| 8 | 2-arity invariants | Cross-validate output against input |
|
|
17
|
-
| 9 | Context override | Per-run adapter and model switching |
|
|
18
|
-
| 10 | StepResult | Full inspection: status, output, errors, trace |
|
|
19
|
-
| 11 | Pipeline | Chain steps with fail-fast data threading |
|
|
7
|
+
| # | File | Answers |
|
|
8
|
+
|---|------|---------|
|
|
9
|
+
| 00 | `00_basics.rb` | **"How do I start?"** — seven incremental layers: plain prompt → output_schema → validate → structured prompt → Hash input → cross-input validate → retry_policy → trace inspection, plus real-LLM swap pointer. |
|
|
10
|
+
| 01 | `01_fallback_showcase.rb` | **"Show me the gem in 30 seconds."** — Part A: schema-only ships a flaky sample. Part B: full contract rejects it and retry_policy escalates to the next model. Per-attempt trace printed inline. |
|
|
11
|
+
| 02 | `02_real_llm_minimal.rb` | **"How do I plug in a real LLM?"** — ~30 lines. `Adapters::RubyLLM.new` in context, same step. Also shows per-call provider switch (OpenAI → Anthropic → Ollama). |
|
|
12
|
+
| 03 | `03_summarize_with_keywords.rb` | **"How does the contract evolve when the product grows?"** — marketing wants a "topic pills" row, so `SummarizeArticle` gains a keywords field with probability and cross-validation. Prompt, schema, and validates stay in lockstep. |
|
|
13
|
+
| 04 | `04_summarize_and_translate.rb` | **"How do steps compose into a pipeline?"** — 3 steps threaded by `Pipeline::Base`: English summary → translate to French → quality review. Fail-fast: a rejected summary means translate and review never run. |
|
|
14
|
+
| 05 | `05_eval_dataset.rb` | **"How do I stop silent prompt regressions?"** — define_eval with real cases, baseline vs regressed adapter, regression detection signal, inline eval_case. |
|
|
15
|
+
| 06 | `06_retry_variants.rb` | **"What retry shapes exist beyond cross-model?"** — `attempts: 3` (variance absorption), `reasoning_effort` escalation (low→medium→high), cross-provider fallback (Ollama → Anthropic → OpenAI). |
|
|
20
16
|
|
|
21
|
-
Every
|
|
22
|
-
|
|
23
|
-
## 01_classify_threads.rb — Thread classification
|
|
24
|
-
|
|
25
|
-
Real-world before/after: classify Reddit threads as PROMO/FILLER/SKIP.
|
|
26
|
-
Shows ID matching, enum validation, score consistency invariants.
|
|
27
|
-
|
|
28
|
-
## 02_generate_comment.rb — Comment generation
|
|
29
|
-
|
|
30
|
-
Real-world before/after: generate Reddit comments with persona.
|
|
31
|
-
Shows sections, banned openings, link presence, length constraints, 2-arity invariants.
|
|
32
|
-
|
|
33
|
-
## 03_target_audience.rb — Audience profiling
|
|
34
|
-
|
|
35
|
-
Real-world before/after: generate target audience profiles.
|
|
36
|
-
Shows cascade failure prevention, locale validation, structural invariants.
|
|
37
|
-
|
|
38
|
-
## 04_real_llm.rb — Real LLM calls via ruby_llm
|
|
39
|
-
|
|
40
|
-
Connect to real LLM providers (OpenAI, Anthropic, Google, etc.) using Adapters::RubyLLM.
|
|
41
|
-
Shows configuration, model switching, temperature/max_tokens control, provider-agnostic steps.
|
|
42
|
-
|
|
43
|
-
| Step | Feature | What it shows |
|
|
44
|
-
|------|---------|---------------|
|
|
45
|
-
| 1 | Configure ruby_llm | Set API keys for your provider |
|
|
46
|
-
| 2 | Set RubyLLM adapter | Swap Test adapter for production |
|
|
47
|
-
| 3 | Define a step | Identical to Test adapter — provider-agnostic |
|
|
48
|
-
| 4 | Run with real LLM | Real call, real tokens, full contract enforcement |
|
|
49
|
-
| 5 | Compare models | A/B test different models per call |
|
|
50
|
-
| 6 | Generation params | Temperature, max_tokens forwarding |
|
|
51
|
-
| 7 | Switch providers | Same step, different provider — just change model name |
|
|
52
|
-
| 8 | Error handling | Contract enforcement with real LLM responses |
|
|
53
|
-
| 9 | Full power | Every feature combined in AnalyzeTicket |
|
|
54
|
-
| 10 | Pipeline | Chain steps with real LLM calls |
|
|
55
|
-
|
|
56
|
-
**Requires:** `export OPENAI_API_KEY=sk-...` (or another provider key)
|
|
57
|
-
|
|
58
|
-
## 05_output_schema.rb — Declarative output schema
|
|
59
|
-
|
|
60
|
-
Replace manual invariants with a schema DSL (ruby_llm-schema).
|
|
61
|
-
|
|
62
|
-
| Step | Feature | What it shows |
|
|
63
|
-
|------|---------|---------------|
|
|
64
|
-
| 1 | Before (invariants) | Manual enum, range, required checks |
|
|
65
|
-
| 2 | After (schema) | Same constraints in declarative DSL |
|
|
66
|
-
| 3 | Schema + invariants | Schema for structure, invariants for business logic |
|
|
67
|
-
| 4 | Complex schema | Nested objects, arrays, constraints |
|
|
68
|
-
| 5 | Provider-agnostic | Same schema works with Test and RubyLLM adapters |
|
|
69
|
-
| 6 | Pipeline + schemas | Fully typed multi-step composition |
|
|
17
|
+
Every example has an "Expected output" section in the file header — you can read what each one prints without running it.
|
|
70
18
|
|
|
71
19
|
## Running
|
|
72
20
|
|
|
73
21
|
```bash
|
|
74
22
|
# Test adapter — no API keys needed:
|
|
75
23
|
ruby examples/00_basics.rb
|
|
76
|
-
ruby examples/
|
|
77
|
-
ruby examples/
|
|
78
|
-
ruby examples/
|
|
79
|
-
ruby examples/
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
24
|
+
ruby examples/01_fallback_showcase.rb
|
|
25
|
+
ruby examples/03_summarize_with_keywords.rb
|
|
26
|
+
ruby examples/04_summarize_and_translate.rb
|
|
27
|
+
ruby examples/05_eval_dataset.rb
|
|
28
|
+
ruby examples/06_retry_variants.rb
|
|
29
|
+
|
|
30
|
+
# Real LLM — requires a provider API key or a local Ollama server:
|
|
31
|
+
ruby examples/02_real_llm_minimal.rb
|
|
83
32
|
```
|
|
84
|
-
|
|
85
|
-
## 06_reddit_promo.rb — Real-world Reddit promo pipeline
|
|
86
|
-
|
|
87
|
-
3-step pipeline from the reddit_promo_planner case study:
|
|
88
|
-
|
|
89
|
-
| Step | Role | Invariants catch |
|
|
90
|
-
|------|------|------------------|
|
|
91
|
-
| 1 | TargetAudience | `locale: "USA"` instead of `"en"`, vague summary |
|
|
92
|
-
| 2 | ClassifyThreads | PROMO with score 2, SKIP with score 8 |
|
|
93
|
-
| 3 | GenerateComment | `{PRODUCT}` instead of URL, banned openings |
|
|
94
|
-
|
|
95
|
-
Runs with test adapter by default. `REAL_LLM=1` for Ollama, `MODEL=gemma:latest` to pick model.
|
|
96
|
-
|
|
97
|
-
## 07_keyword_extraction.rb — Keyword extraction with probability
|
|
98
|
-
|
|
99
|
-
Extract up to 15 keywords from an article, each with relevance probability.
|
|
100
|
-
|
|
101
|
-
| Feature | What it shows |
|
|
102
|
-
|---------|---------------|
|
|
103
|
-
| Array schema | `min_items: 1, max_items: 15` with nested objects |
|
|
104
|
-
| Number range | `probability: 0.0–1.0` |
|
|
105
|
-
| Sorting invariant | Schema can't express "sorted descending" |
|
|
106
|
-
| Uniqueness invariant | Schema can't express "no duplicates" |
|
|
107
|
-
| Cross-validation | Keywords must appear in source text (catches hallucination) |
|
|
108
|
-
| Pipeline | Keywords → Related Topics |
|
|
109
|
-
|
|
110
|
-
## 08_translation.rb — Translation pipeline with quality review
|
|
111
|
-
|
|
112
|
-
3-step pipeline: extract segments → translate → review quality.
|
|
113
|
-
|
|
114
|
-
| Step | LLM Skill | Invariants catch |
|
|
115
|
-
|------|-----------|------------------|
|
|
116
|
-
| Extract | Analysis | Duplicate keys, wrong target_lang |
|
|
117
|
-
| Translate | Creative | Missing segments, too long, echoed back untranslated |
|
|
118
|
-
| Review | Evaluation | Inconsistent counts, failed reviews without issues |
|
|
119
|
-
|
|
120
|
-
## Running
|
|
121
|
-
|
|
122
|
-
```bash
|
|
123
|
-
# Test adapter — no API keys needed:
|
|
124
|
-
ruby examples/00_basics.rb
|
|
125
|
-
ruby examples/01_classify_threads.rb
|
|
126
|
-
ruby examples/02_generate_comment.rb
|
|
127
|
-
ruby examples/03_target_audience.rb
|
|
128
|
-
ruby examples/05_output_schema.rb
|
|
129
|
-
ruby examples/06_reddit_promo.rb
|
|
130
|
-
ruby examples/07_keyword_extraction.rb
|
|
131
|
-
ruby examples/08_translation.rb
|
|
132
|
-
|
|
133
|
-
# Real LLM — requires Ollama or API key:
|
|
134
|
-
ruby examples/04_real_llm.rb
|
|
135
|
-
REAL_LLM=1 ruby examples/06_reddit_promo.rb
|
|
136
|
-
REAL_LLM=1 MODEL=llama3.2:3b ruby examples/06_reddit_promo.rb
|
|
137
|
-
```
|
|
138
|
-
|
|
139
|
-
Examples 00–03, 05–06 use the test adapter by default — no API keys needed.
|
|
140
|
-
Example 04 and 06 with `REAL_LLM=1` require Ollama or an API key.
|
|
@@ -72,9 +72,9 @@ module RubyLLM
|
|
|
72
72
|
end
|
|
73
73
|
|
|
74
74
|
chain_width = [rows.map { |r| r[:chain].length }.max || 0, 20].max
|
|
75
|
-
lines = [format(" %-#{chain_width}s %-
|
|
76
|
-
"Chain", "
|
|
77
|
-
lines << " #{"-" * (chain_width +
|
|
75
|
+
lines = [format(" %-#{chain_width}s %-13s %-10s %-14s %-9s %s",
|
|
76
|
+
"Chain", "first-attempt", "fallback %", "effective cost", "latency", "score")]
|
|
77
|
+
lines << " #{"-" * (chain_width + 62)}"
|
|
78
78
|
|
|
79
79
|
rows.each do |row|
|
|
80
80
|
lines << format_production_row(row, chain_width)
|
|
@@ -95,7 +95,7 @@ module RubyLLM
|
|
|
95
95
|
|
|
96
96
|
def format_production_row(row, chain_width)
|
|
97
97
|
report = row[:report]
|
|
98
|
-
format(" %-#{chain_width}s %-
|
|
98
|
+
format(" %-#{chain_width}s %-13s %-10s %-14s %-9s %6.2f",
|
|
99
99
|
row[:chain],
|
|
100
100
|
format_money(report.single_shot_cost || report.total_cost),
|
|
101
101
|
format_escalation(row, report),
|
|
@@ -15,8 +15,12 @@ module RubyLLM
|
|
|
15
15
|
class RetryOptimizer
|
|
16
16
|
Result = Struct.new(:step_name, :eval_names, :candidate_labels, :score_matrix,
|
|
17
17
|
:constraining_eval, :chain, :chain_details, keyword_init: true) do
|
|
18
|
+
# Terminology alias — `hardest_eval` is the narrative name used in docs;
|
|
19
|
+
# `constraining_eval` is preserved as the original field name.
|
|
20
|
+
alias_method :hardest_eval, :constraining_eval
|
|
21
|
+
|
|
18
22
|
def print_summary(io = $stdout)
|
|
19
|
-
io.puts "#{step_name} —
|
|
23
|
+
io.puts "#{step_name} — fallback list optimization"
|
|
20
24
|
io.puts
|
|
21
25
|
print_table(io)
|
|
22
26
|
io.puts
|
|
@@ -59,7 +63,7 @@ module RubyLLM
|
|
|
59
63
|
end
|
|
60
64
|
|
|
61
65
|
io.puts
|
|
62
|
-
io.puts "
|
|
66
|
+
io.puts " Hardest eval: #{constraining_eval}" if constraining_eval
|
|
63
67
|
end
|
|
64
68
|
|
|
65
69
|
def print_chain(io)
|
|
@@ -68,7 +72,7 @@ module RubyLLM
|
|
|
68
72
|
return
|
|
69
73
|
end
|
|
70
74
|
|
|
71
|
-
io.puts " Suggested
|
|
75
|
+
io.puts " Suggested fallback list:"
|
|
72
76
|
chain_details.each_with_index do |detail, i|
|
|
73
77
|
suffix = i == chain_details.size - 1 ? "passes all #{eval_names.size} evals" : "covers #{detail[:passes]} eval(s)"
|
|
74
78
|
io.puts " #{detail[:label]} — #{suffix}"
|
|
@@ -186,20 +186,36 @@ module RubyLLM
|
|
|
186
186
|
"{ |c| c.default_adapter = ... } or pass context: { adapter: ... }"
|
|
187
187
|
end
|
|
188
188
|
|
|
189
|
+
# ADR-0021 deliverable 2: narrow ArgumentError rescue to DSL-setup phase only.
|
|
190
|
+
#
|
|
191
|
+
# DSL misconfiguration (e.g. `prompt has not been set`, missing required
|
|
192
|
+
# attributes) surfaces as ArgumentError when constructing Runner. We catch
|
|
193
|
+
# that and return :input_error — these are contract-definition issues the
|
|
194
|
+
# caller can handle as "bad input to the step definition".
|
|
195
|
+
#
|
|
196
|
+
# Runner#call itself does NOT get a blanket rescue: input-type validation
|
|
197
|
+
# failures return :input_error from within InputValidator; adapter/runtime
|
|
198
|
+
# programmer bugs (NoMethodError, adapter-code ArgumentError) must propagate
|
|
199
|
+
# instead of being silently masked as :input_error.
|
|
189
200
|
def run_once(input, adapter:, model:, context_temperature: nil, extra_options: {})
|
|
190
201
|
effective_temp = context_temperature || temperature
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
202
|
+
runner =
|
|
203
|
+
begin
|
|
204
|
+
Runner.new(
|
|
205
|
+
input_type: input_type, output_type: output_type,
|
|
206
|
+
prompt_block: prompt, contract_definition: effective_contract,
|
|
207
|
+
adapter: adapter, model: model, output_schema: output_schema,
|
|
208
|
+
max_output: max_output, max_input: max_input, max_cost: max_cost,
|
|
209
|
+
on_unknown_pricing: on_unknown_pricing,
|
|
210
|
+
temperature: effective_temp, extra_options: extra_options,
|
|
211
|
+
observers: class_observers
|
|
212
|
+
)
|
|
213
|
+
rescue ArgumentError => e
|
|
214
|
+
return Result.new(status: :input_error, raw_output: nil, parsed_output: nil,
|
|
215
|
+
validation_errors: [e.message])
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
runner.call(input)
|
|
203
219
|
end
|
|
204
220
|
|
|
205
221
|
def log_result(result)
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-contract
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.7.
|
|
4
|
+
version: 0.7.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Justyna
|
|
@@ -69,15 +69,12 @@ files:
|
|
|
69
69
|
- README.md
|
|
70
70
|
- Rakefile
|
|
71
71
|
- examples/00_basics.rb
|
|
72
|
-
- examples/
|
|
73
|
-
- examples/
|
|
74
|
-
- examples/
|
|
75
|
-
- examples/
|
|
76
|
-
- examples/
|
|
77
|
-
- examples/
|
|
78
|
-
- examples/08_translation.rb
|
|
79
|
-
- examples/09_eval_dataset.rb
|
|
80
|
-
- examples/10_reddit_full_showcase.rb
|
|
72
|
+
- examples/01_fallback_showcase.rb
|
|
73
|
+
- examples/02_real_llm_minimal.rb
|
|
74
|
+
- examples/03_summarize_with_keywords.rb
|
|
75
|
+
- examples/04_summarize_and_translate.rb
|
|
76
|
+
- examples/05_eval_dataset.rb
|
|
77
|
+
- examples/06_retry_variants.rb
|
|
81
78
|
- examples/README.md
|
|
82
79
|
- lib/ruby_llm/contract.rb
|
|
83
80
|
- lib/ruby_llm/contract/adapters.rb
|