ruby_llm-contract 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +55 -0
  4. data/CHANGELOG.md +76 -0
  5. data/Gemfile +11 -0
  6. data/Gemfile.lock +176 -0
  7. data/LICENSE +21 -0
  8. data/README.md +154 -0
  9. data/Rakefile +8 -0
  10. data/examples/00_basics.rb +500 -0
  11. data/examples/01_classify_threads.rb +220 -0
  12. data/examples/02_generate_comment.rb +203 -0
  13. data/examples/03_target_audience.rb +201 -0
  14. data/examples/04_real_llm.rb +410 -0
  15. data/examples/05_output_schema.rb +258 -0
  16. data/examples/07_keyword_extraction.rb +239 -0
  17. data/examples/08_translation.rb +353 -0
  18. data/examples/09_eval_dataset.rb +287 -0
  19. data/examples/10_reddit_full_showcase.rb +363 -0
  20. data/examples/README.md +140 -0
  21. data/lib/ruby_llm/contract/adapters/base.rb +13 -0
  22. data/lib/ruby_llm/contract/adapters/response.rb +17 -0
  23. data/lib/ruby_llm/contract/adapters/ruby_llm.rb +94 -0
  24. data/lib/ruby_llm/contract/adapters/test.rb +44 -0
  25. data/lib/ruby_llm/contract/adapters.rb +6 -0
  26. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +17 -0
  27. data/lib/ruby_llm/contract/concerns/eval_host.rb +109 -0
  28. data/lib/ruby_llm/contract/concerns/trace_equality.rb +15 -0
  29. data/lib/ruby_llm/contract/concerns/usage_aggregator.rb +43 -0
  30. data/lib/ruby_llm/contract/configuration.rb +21 -0
  31. data/lib/ruby_llm/contract/contract/definition.rb +39 -0
  32. data/lib/ruby_llm/contract/contract/invariant.rb +23 -0
  33. data/lib/ruby_llm/contract/contract/parser.rb +143 -0
  34. data/lib/ruby_llm/contract/contract/schema_validator.rb +239 -0
  35. data/lib/ruby_llm/contract/contract/validator.rb +104 -0
  36. data/lib/ruby_llm/contract/contract.rb +7 -0
  37. data/lib/ruby_llm/contract/cost_calculator.rb +38 -0
  38. data/lib/ruby_llm/contract/dsl.rb +13 -0
  39. data/lib/ruby_llm/contract/errors.rb +19 -0
  40. data/lib/ruby_llm/contract/eval/case_result.rb +76 -0
  41. data/lib/ruby_llm/contract/eval/contract_detail_builder.rb +47 -0
  42. data/lib/ruby_llm/contract/eval/dataset.rb +53 -0
  43. data/lib/ruby_llm/contract/eval/eval_definition.rb +112 -0
  44. data/lib/ruby_llm/contract/eval/evaluation_result.rb +27 -0
  45. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +20 -0
  46. data/lib/ruby_llm/contract/eval/evaluator/json_includes.rb +58 -0
  47. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +40 -0
  48. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +27 -0
  49. data/lib/ruby_llm/contract/eval/model_comparison.rb +80 -0
  50. data/lib/ruby_llm/contract/eval/pipeline_result_adapter.rb +15 -0
  51. data/lib/ruby_llm/contract/eval/report.rb +115 -0
  52. data/lib/ruby_llm/contract/eval/runner.rb +162 -0
  53. data/lib/ruby_llm/contract/eval/trait_evaluator.rb +75 -0
  54. data/lib/ruby_llm/contract/eval.rb +16 -0
  55. data/lib/ruby_llm/contract/pipeline/base.rb +62 -0
  56. data/lib/ruby_llm/contract/pipeline/result.rb +131 -0
  57. data/lib/ruby_llm/contract/pipeline/runner.rb +139 -0
  58. data/lib/ruby_llm/contract/pipeline/trace.rb +72 -0
  59. data/lib/ruby_llm/contract/pipeline.rb +6 -0
  60. data/lib/ruby_llm/contract/prompt/ast.rb +38 -0
  61. data/lib/ruby_llm/contract/prompt/builder.rb +47 -0
  62. data/lib/ruby_llm/contract/prompt/node.rb +25 -0
  63. data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +27 -0
  64. data/lib/ruby_llm/contract/prompt/nodes/rule_node.rb +15 -0
  65. data/lib/ruby_llm/contract/prompt/nodes/section_node.rb +26 -0
  66. data/lib/ruby_llm/contract/prompt/nodes/system_node.rb +15 -0
  67. data/lib/ruby_llm/contract/prompt/nodes/user_node.rb +15 -0
  68. data/lib/ruby_llm/contract/prompt/nodes.rb +7 -0
  69. data/lib/ruby_llm/contract/prompt/renderer.rb +76 -0
  70. data/lib/ruby_llm/contract/railtie.rb +20 -0
  71. data/lib/ruby_llm/contract/rake_task.rb +78 -0
  72. data/lib/ruby_llm/contract/rspec/pass_eval.rb +96 -0
  73. data/lib/ruby_llm/contract/rspec/satisfy_contract.rb +31 -0
  74. data/lib/ruby_llm/contract/rspec.rb +6 -0
  75. data/lib/ruby_llm/contract/step/base.rb +138 -0
  76. data/lib/ruby_llm/contract/step/dsl.rb +144 -0
  77. data/lib/ruby_llm/contract/step/limit_checker.rb +64 -0
  78. data/lib/ruby_llm/contract/step/result.rb +38 -0
  79. data/lib/ruby_llm/contract/step/retry_executor.rb +90 -0
  80. data/lib/ruby_llm/contract/step/retry_policy.rb +76 -0
  81. data/lib/ruby_llm/contract/step/runner.rb +126 -0
  82. data/lib/ruby_llm/contract/step/trace.rb +70 -0
  83. data/lib/ruby_llm/contract/step.rb +10 -0
  84. data/lib/ruby_llm/contract/token_estimator.rb +19 -0
  85. data/lib/ruby_llm/contract/types.rb +11 -0
  86. data/lib/ruby_llm/contract/version.rb +7 -0
  87. data/lib/ruby_llm/contract.rb +108 -0
  88. data/ruby_llm-contract.gemspec +33 -0
  89. metadata +172 -0
@@ -0,0 +1,287 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # EXAMPLE 9: Dataset-based prompt evaluation
5
+ #
6
+ # Define test cases with expected outputs, run a step against all of them,
7
+ # and get an aggregate quality score. Like unit tests for your prompts.
8
+ #
9
+ # Shows:
10
+ # - Dataset DSL with cases (input + expected)
11
+ # - 4 evaluator types: exact, json_includes, regex, custom proc
12
+ # - expected_traits for multi-property checks
13
+ # - Aggregate scoring (0.0–1.0)
14
+ # - eval_case convenience for inline testing
15
+ # - Eval detecting quality regression
16
+ # =============================================================================
17
+
18
+ require_relative "../lib/ruby_llm/contract"
19
+
20
+ # =============================================================================
21
+ # STEP TO EVALUATE
22
+ # =============================================================================
23
+
24
+ class ClassifyIntent < RubyLLM::Contract::Step::Base
25
+ input_type String
26
+
27
+ output_schema do
28
+ string :intent, enum: %w[sales support billing other]
29
+ number :confidence, minimum: 0.0, maximum: 1.0
30
+ end
31
+
32
+ prompt do
33
+ system "Classify the user's intent."
34
+ user "{input}"
35
+ end
36
+ end
37
+
38
+ # =============================================================================
39
+ # STEP 1: Define a dataset — your "golden set" of test cases
40
+ # =============================================================================
41
+
42
+ puts "=" * 60
43
+ puts "STEP 1: Define a dataset"
44
+ puts "=" * 60
45
+
46
+ dataset = RubyLLM::Contract::Eval::Dataset.define("intent_classification") do
47
+ # Case with exact expected output
48
+ add_case "billing inquiry",
49
+ input: "I need help with my invoice",
50
+ expected: { intent: "billing" }
51
+
52
+ # Another case with an exact expected field
53
+ add_case "sales inquiry",
54
+ input: "I want to upgrade my plan",
55
+ expected: { intent: "sales" }
56
+
57
+ # Case with expected_traits (a plain value check here; trait checks cover multiple properties)
58
+ add_case "support with confidence",
59
+ input: "My app is crashing",
60
+ expected_traits: { intent: "support" }
61
+
62
+ # Case with custom evaluator (proc)
63
+ add_case "high confidence expected",
64
+ input: "URGENT: billing error!!!",
65
+ evaluator: ->(output) { output[:confidence] >= 0.8 }
66
+
67
+ # Case with no expected — just checks contract passes
68
+ add_case "contract smoke test",
69
+ input: "random text here"
70
+ end
71
+
72
+ puts "Dataset: #{dataset.name}"
73
+ puts "Cases: #{dataset.cases.length}"
74
+ dataset.cases.each { |c| puts " - #{c.name}" }
75
+
76
+ # =============================================================================
77
+ # STEP 2: Run the eval — good model (all pass)
78
+ # =============================================================================
79
+
80
+ puts "\n\n#{"=" * 60}"
81
+ puts "STEP 2: Run eval — good model (all cases pass)"
82
+ puts "=" * 60
83
+
84
+ # Simulate a good model that returns correct intents
85
+ good_responses = {
86
+ "I need help with my invoice" => '{"intent": "billing", "confidence": 0.92}',
87
+ "I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}',
88
+ "My app is crashing" => '{"intent": "support", "confidence": 0.95}',
89
+ "URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.97}',
90
+ "random text here" => '{"intent": "other", "confidence": 0.6}'
91
+ }
92
+
93
+ # Custom adapter that returns different responses per input
94
+ good_adapter = Object.new
95
+ good_adapter.define_singleton_method(:call) do |messages:, **_opts|
96
+ user_msg = messages.find { |m| m[:role] == :user }
97
+ response = good_responses[user_msg[:content]] || '{"intent": "other", "confidence": 0.5}'
98
+ RubyLLM::Contract::Adapters::Response.new(content: response, usage: { input_tokens: 0, output_tokens: 0 })
99
+ end
100
+
101
+ report = RubyLLM::Contract::Eval::Runner.run(
102
+ step: ClassifyIntent,
103
+ dataset: dataset,
104
+ context: { adapter: good_adapter }
105
+ )
106
+
107
+ puts "\nScore: #{report.score.round(2)}"
108
+ puts "Pass rate: #{report.pass_rate}"
109
+ puts "All passed: #{report.passed?}"
110
+ puts
111
+ report.each do |r|
112
+ icon = r.passed? ? "✓" : "✗"
113
+ puts " #{icon} #{r.name.ljust(30)} score=#{r.score} #{r.details}"
114
+ end
115
+
116
+ # =============================================================================
117
+ # STEP 3: Run eval — bad model (some fail)
118
+ # =============================================================================
119
+
120
+ puts "\n\n#{"=" * 60}"
121
+ puts "STEP 3: Run eval — bad model (quality regression)"
122
+ puts "=" * 60
123
+
124
+ # Simulate a worse model that misclassifies some intents
125
+ bad_responses = {
126
+ "I need help with my invoice" => '{"intent": "support", "confidence": 0.7}', # WRONG: billing → support
127
+ "I want to upgrade my plan" => '{"intent": "sales", "confidence": 0.88}', # correct
128
+ "My app is crashing" => '{"intent": "other", "confidence": 0.4}', # WRONG: support → other
129
+ "URGENT: billing error!!!" => '{"intent": "billing", "confidence": 0.55}', # low confidence
130
+ "random text here" => '{"intent": "other", "confidence": 0.6}' # correct
131
+ }
132
+
133
+ bad_adapter = Object.new
134
+ bad_adapter.define_singleton_method(:call) do |messages:, **_opts|
135
+ user_msg = messages.find { |m| m[:role] == :user }
136
+ response = bad_responses[user_msg[:content]] || '{"intent": "other", "confidence": 0.5}'
137
+ RubyLLM::Contract::Adapters::Response.new(content: response, usage: { input_tokens: 0, output_tokens: 0 })
138
+ end
139
+
140
+ bad_report = RubyLLM::Contract::Eval::Runner.run(
141
+ step: ClassifyIntent,
142
+ dataset: dataset,
143
+ context: { adapter: bad_adapter }
144
+ )
145
+
146
+ puts "\nScore: #{bad_report.score.round(2)}"
147
+ puts "Pass rate: #{bad_report.pass_rate}"
148
+ puts "All passed: #{bad_report.passed?}"
149
+ puts
150
+ bad_report.each do |r|
151
+ icon = r.passed? ? "✓" : "✗"
152
+ puts " #{icon} #{r.name.ljust(30)} score=#{r.score} #{r.details}"
153
+ end
154
+
155
+ puts "\nRegression detected:"
156
+ puts " Score dropped: #{report.score.round(2)} → #{bad_report.score.round(2)} " \
157
+ "(#{((report.score - bad_report.score) * 100).round(1)}% drop)"
158
+
159
+ # =============================================================================
160
+ # STEP 4: eval_case — quick inline check
161
+ # =============================================================================
162
+
163
+ puts "\n\n#{"=" * 60}"
164
+ puts "STEP 4: eval_case — inline single-case eval"
165
+ puts "=" * 60
166
+
167
+ # No dataset needed — just check one case. Note: this input has no entry in
+ # good_responses, so the adapter falls back to intent "other"; the expected
+ # intent "billing" will not match and the check below reports passed: false.
168
+ result = ClassifyIntent.eval_case(
169
+ input: "I want to cancel my subscription",
170
+ expected: { intent: "billing" },
171
+ context: { adapter: good_adapter }
172
+ )
173
+
174
+ puts "Passed: #{result[:passed]}"
175
+ puts "Score: #{result[:score]}"
176
+ puts "Output: #{result[:output]}"
177
+ puts "Details: #{result[:details]}"
178
+
179
+ # With expected_traits
180
+ result2 = ClassifyIntent.eval_case(
181
+ input: "URGENT: server down!!!",
182
+ expected_traits: { intent: "support" },
183
+ context: {
184
+ adapter: RubyLLM::Contract::Adapters::Test.new(
185
+ response: '{"intent": "support", "confidence": 0.99}'
186
+ )
187
+ }
188
+ )
189
+
190
+ puts "\nTraits check:"
191
+ puts "Passed: #{result2[:passed]}"
192
+ puts "Details: #{result2[:details]}"
193
+
194
+ # With custom proc evaluator
195
+ result3 = ClassifyIntent.eval_case(
196
+ input: "test",
197
+ evaluator: ->(output) { output[:confidence] > 0.9 },
198
+ context: {
199
+ adapter: RubyLLM::Contract::Adapters::Test.new(
200
+ response: '{"intent": "other", "confidence": 0.95}'
201
+ )
202
+ }
203
+ )
204
+
205
+ puts "\nCustom proc:"
206
+ puts "Passed: #{result3[:passed]} (confidence > 0.9)"
207
+
208
+ # =============================================================================
209
+ # STEP 5: Evaluating a pipeline
210
+ # =============================================================================
211
+
212
+ puts "\n\n#{"=" * 60}"
213
+ puts "STEP 5: Evaluate a pipeline end-to-end"
214
+ puts "=" * 60
215
+
216
+ class SuggestAction < RubyLLM::Contract::Step::Base
217
+ input_type Hash
218
+
219
+ output_schema do
220
+ string :action
221
+ string :priority, enum: %w[low medium high urgent]
222
+ end
223
+
224
+ prompt do
225
+ system "Suggest an action based on the classified intent."
226
+ user "Intent: {intent}, Confidence: {confidence}"
227
+ end
228
+ end
229
+
230
+ class SupportPipeline < RubyLLM::Contract::Pipeline::Base
231
+ step ClassifyIntent, as: :classify
232
+ step SuggestAction, as: :action
233
+ end
234
+
235
+ pipeline_dataset = RubyLLM::Contract::Eval::Dataset.define("support_pipeline") do
236
+ add_case "billing → action",
237
+ input: "I need help with my invoice",
238
+ expected: { priority: "medium" }
239
+
240
+ add_case "urgent → action",
241
+ input: "URGENT: server is down!",
242
+ expected: { priority: "urgent" }
243
+ end
244
+
245
+ pipeline_adapter = RubyLLM::Contract::Adapters::Test.new(
246
+ response: '{"intent": "billing", "confidence": 0.9, "action": "Review invoice", "priority": "medium"}'
247
+ )
248
+
249
+ pipeline_report = RubyLLM::Contract::Eval::Runner.run(
250
+ step: SupportPipeline,
251
+ dataset: pipeline_dataset,
252
+ context: { adapter: pipeline_adapter }
253
+ )
254
+
255
+ puts "\nPipeline eval:"
256
+ puts "Score: #{pipeline_report.score.round(2)}"
257
+ puts "Pass rate: #{pipeline_report.pass_rate}"
258
+ pipeline_report.each do |r|
259
+ icon = r.passed? ? "✓" : "✗"
260
+ puts " #{icon} #{r.name.ljust(25)} #{r.details}"
261
+ end
262
+
263
+ # =============================================================================
264
+ # SUMMARY
265
+ #
266
+ # Dataset eval answers: "Is my prompt good?"
267
+ #
268
+ # Define cases:
269
+ # - expected: exact output match (or json_includes for partial)
270
+ # - expected_traits: multi-property checks (regex, values)
271
+ # - evaluator: custom proc for complex logic
272
+ # - no expected: just check contract passes
273
+ #
274
+ # Run eval:
275
+ # - report.score → 0.0-1.0 aggregate
276
+ # - report.pass_rate → "4/5"
277
+ # - report.each → per-case details
278
+ #
279
+ # Quick check:
280
+ # - MyStep.eval_case(input: ..., expected: ...) → single result
281
+ #
282
+ # Regression detection:
283
+ # - Compare report.score before/after prompt change
284
+ # - Drop from 1.0 to 0.6 → something broke
285
+ #
286
+ # Next: GH-8 adds Regression::Baseline to automate this comparison
287
+ # =============================================================================
@@ -0,0 +1,363 @@
1
+ # frozen_string_literal: true
2
+
3
+ # =============================================================================
4
+ # Reddit Promo Pipeline — 5-step campaign from URL to comment
5
+ #
6
+ # A real-world pipeline that takes a product URL and produces a natural
7
+ # Reddit comment ready to post. Each step has a contract that catches
8
+ # the kind of failures LLMs actually produce in production.
9
+ #
10
+ # ruby examples/10_reddit_full_showcase.rb
11
+ # =============================================================================
12
+
13
+ require_relative "../lib/ruby_llm/contract"
14
+
15
+ # ===========================================================================
16
+ # Step 1 — Analyze the product
17
+ #
18
+ # Takes a plain String URL. Returns audience profile.
19
+ # Contract catches: invalid locale ("USA" instead of "en"), vague audiences.
20
+ # ===========================================================================
21
+
22
+ class AnalyzeProduct < RubyLLM::Contract::Step::Base
23
+ output_schema do
24
+ string :product_description, description: "What the product does (1-2 sentences)"
25
+ string :locale, description: "ISO 639-1 language code"
26
+ string :audience_group_1
27
+ string :audience_group_2
28
+ string :audience_group_3
29
+ end
30
+
31
+ prompt <<~PROMPT
32
+ You are a marketing analyst. Analyze the product and identify target audiences.
33
+ locale must be a 2-letter ISO 639-1 code (en, pl, de), NOT a country name.
34
+ Audience groups must be specific, not generic.
35
+
36
+ {input}
37
+ PROMPT
38
+
39
+ max_input 3_000 # refuse before LLM call if prompt too large
40
+ max_cost 0.01 # refuse before LLM call if estimated cost > $0.01
41
+
42
+ validate("locale is valid ISO 639-1") { |o| o[:locale].to_s.match?(/\A[a-z]{2}\z/) }
43
+ validate("description is substantive") { |o| o[:product_description].to_s.split.size >= 5 }
44
+ validate("audience groups are specific") do |o|
45
+ [o[:audience_group_1], o[:audience_group_2], o[:audience_group_3]].all? { |g| g.to_s.size > 5 }
46
+ end
47
+ end
48
+
49
+ # ===========================================================================
50
+ # Step 2 — Find subreddits and a sample thread
51
+ #
52
+ # Receives the audience profile, returns subreddits + a thread to work with.
53
+ # Contract catches: empty subreddit names, missing thread language.
54
+ # ===========================================================================
55
+
56
+ class IdentifySubreddits < RubyLLM::Contract::Step::Base
57
+ input_type Hash
58
+
59
+ output_schema do
60
+ string :product_description
61
+ string :locale
62
+ string :subreddit_1
63
+ string :subreddit_2
64
+ string :subreddit_3
65
+ string :thread_title, description: "A representative thread title"
66
+ string :thread_selftext, description: "Thread body text"
67
+ string :thread_subreddit
68
+ string :thread_language, description: "ISO 639-1 code of the thread's language"
69
+ end
70
+
71
+ prompt <<~PROMPT
72
+ You are a Reddit marketing researcher.
73
+ Find subreddits where the target audience hangs out.
74
+ Pick one representative thread that would be perfect for a product mention.
75
+ Pass through product_description and locale from input.
76
+
77
+ SUBREDDIT CRITERIA:
78
+ - Active community (>10k members)
79
+ - Allows product discussions
80
+ - Not hostile to recommendations
81
+
82
+ {input}
83
+ PROMPT
84
+
85
+ validate("has subreddits") do |o|
86
+ [o[:subreddit_1], o[:subreddit_2], o[:subreddit_3]].all? { |s| s.to_s.size >= 2 }
87
+ end
88
+ validate("thread has content") { |o| o[:thread_title].to_s.size > 5 }
89
+ validate("thread language is valid") { |o| o[:thread_language].to_s.match?(/\A[a-z]{2}\z/) }
90
+ end
91
+
92
+ # ===========================================================================
93
+ # Step 3 — Classify the thread
94
+ #
95
+ # PROMO / FILLER / SKIP with relevance score.
96
+ # Uses `validate` and a 2-arity invariant that cross-checks the output
97
+ # language against the input language.
98
+ # Contract catches: PROMO with score 2, SKIP with score 8, wrong language.
99
+ # ===========================================================================
100
+
101
+ class ClassifyThread < RubyLLM::Contract::Step::Base
102
+ input_type Hash
103
+
104
+ # Block DSL because we use `example` (few-shot learning)
105
+ prompt do
106
+ system "You are a thread classifier for Reddit marketing."
107
+ rule "Classify the thread as PROMO, FILLER, or SKIP based on product relevance."
108
+ rule "Return JSON with: classification, relevance_score (1-10), reasoning, thread_title, thread_language."
109
+ rule "PROMO: score >= 6. FILLER: 3-5. SKIP: 1-2."
110
+
111
+ example input: '{"thread_title":"Best invoicing tool?","product_description":"invoicing SaaS"}',
112
+ output: '{"classification":"PROMO","relevance_score":9,"reasoning":"Direct fit","thread_title":"Best invoicing tool?","thread_language":"en"}'
113
+
114
+ user "{input}"
115
+ end
116
+
117
+ validate("valid classification") { |o| %w[PROMO FILLER SKIP].include?(o[:classification]) }
118
+ validate("relevance score in range") { |o| o[:relevance_score].is_a?(Integer) && o[:relevance_score].between?(1, 10) }
119
+ validate("PROMO score >= 6") { |o| o[:classification] != "PROMO" || o[:relevance_score] >= 6 }
120
+ validate("SKIP score <= 2") { |o| o[:classification] != "SKIP" || o[:relevance_score] <= 2 }
121
+
122
+ validate("thread language preserved from input") do |output, input|
123
+ next true unless input.is_a?(Hash) && input[:thread_language]
124
+
125
+ output[:thread_language] == input[:thread_language]
126
+ end
127
+ end
128
+
129
+ # ===========================================================================
130
+ # Step 4 — Plan the comment
131
+ #
132
+ # Decides approach, tone, and key points before writing.
133
+ # Contract catches: missing strategy, invalid tone.
134
+ # ===========================================================================
135
+
136
+ class PlanComment < RubyLLM::Contract::Step::Base
137
+ input_type Hash
138
+
139
+ prompt <<~PROMPT
140
+ You are a Reddit comment strategist.
141
+ Plan a helpful, non-spammy comment for the classified thread.
142
+ Return JSON with: approach, tone, key_points, link_strategy, thread_title.
143
+
144
+ GUIDELINES:
145
+ - Never use aggressive marketing language.
146
+ - Be genuinely helpful first.
147
+ - Mention product naturally.
148
+
149
+ TONE OPTIONS:
150
+ - casual — peer sharing experience
151
+ - professional — industry expert
152
+ - empathetic — I had the same problem
153
+
154
+ {input}
155
+ PROMPT
156
+
157
+ validate("has approach") { |o| o[:approach].to_s.size > 5 }
158
+ validate("valid tone") { |o| %w[casual professional empathetic].include?(o[:tone]) }
159
+ validate("has link strategy") { |o| o[:link_strategy].to_s.size > 3 }
160
+ end
161
+
162
+ # ===========================================================================
163
+ # Step 5 — Write the comment
164
+ #
165
+ # Retry policy: starts with gpt-4.1-nano (cheap), escalates to mini then
166
+ # full if the contract catches problems. In practice, nano often writes
167
+ # comments that are too short or forget the link.
168
+ # Contract catches: spam phrases, banned openings, missing links, too short.
169
+ # ===========================================================================
170
+
171
+ class GenerateComment < RubyLLM::Contract::Step::Base
172
+ input_type Hash
173
+
174
+ # Block DSL here because we use `example` (few-shot) — needs user/assistant pairs.
175
+ # Steps without examples use plain heredoc (see AnalyzeProduct, PlanComment above).
176
+ prompt do
177
+ system "You are a helpful Reddit commenter promoting a SaaS product."
178
+ rule "Write the comment based on the plan."
179
+ rule "Return JSON with: comment, word_count (integer)."
180
+ rule "No markdown headers. No emojis. No bullet lists."
181
+ rule "Include https://acme-invoice.com naturally, maximum once."
182
+
183
+ section "ANTI-SPAM",
184
+ "Never use: buy now, limited offer, click here, act fast, discount.\n" \
185
+ "Never start with: Great question!, As a, I'm an AI, Hey there!"
186
+
187
+ example input: '{"approach":"share experience","tone":"casual"}',
188
+ output: '{"comment":"I switched to Acme Invoice last year and it cut my invoicing time ' \
189
+ "in half. The automatic reminders are a lifesaver. https://acme-invoice.com if " \
190
+ 'you want to check it out.","word_count":30}'
191
+
192
+ user "{input}"
193
+ end
194
+
195
+ validate("comment long enough") { |o| o[:comment].to_s.strip.size > 30 }
196
+ validate("no markdown headers") { |o| !o[:comment].to_s.match?(/^\#{2,}/) }
197
+ validate("has word count") { |o| o[:word_count].is_a?(Integer) && o[:word_count].positive? }
198
+ validate("contains product link") { |o| o[:comment].to_s.include?("acme-invoice.com") }
199
+ validate("no spam phrases") do |o|
200
+ spam = ["buy now", "limited offer", "click here", "act fast", "discount"]
201
+ spam.none? { |s| o[:comment].to_s.downcase.include?(s) }
202
+ end
203
+ validate("no banned openings") do |o|
204
+ banned = ["Great question", "As a", "I'm an AI", "Hey there!", "Check this out"]
205
+ banned.none? { |b| o[:comment].to_s.start_with?(b) }
206
+ end
207
+
208
+ max_output 300 # tokens — don't let the model ramble
209
+
210
+ retry_policy models: %w[gpt-4.1-nano gpt-4.1-mini gpt-4.1]
211
+ end
212
+
213
+ # ===========================================================================
214
+ # Pipeline — wires the 5 steps together, with per-step model hints
215
+ # ===========================================================================
216
+
217
+ class RedditPromoPipeline < RubyLLM::Contract::Pipeline::Base
218
+ step AnalyzeProduct, as: :analyze, model: "gpt-4.1-mini"
219
+ step IdentifySubreddits, as: :subreddits, model: "gpt-4.1-mini"
220
+ step ClassifyThread, as: :classify, model: "gpt-4.1-nano"
221
+ step PlanComment, as: :plan, model: "gpt-4.1-nano"
222
+ step GenerateComment, as: :comment # uses retry_policy escalation
223
+
224
+ token_budget 15_000 # max tokens across all steps — halt if exceeded
225
+ end
226
+
227
+ # ===========================================================================
228
+ # Eval — defined OUTSIDE the step class (like specs live outside models)
229
+ # In production: eval/generate_comment_eval.rb
230
+ # ===========================================================================
231
+
232
+ GenerateComment.define_eval("smoke") do
233
+ default_input({
234
+ approach: "Share personal experience with invoicing frustration, then mention Acme Invoice",
235
+ tone: "casual",
236
+ key_points: '["empathize","mention recurring invoices","highlight reminders"]',
237
+ link_strategy: "Drop link naturally after mentioning the tool",
238
+ thread_title: "What invoicing tool do you use?"
239
+ })
240
+
241
+ sample_response({
242
+ comment: "I was in the exact same boat — spreadsheets worked until I had more than " \
243
+ "10 clients, then tracking who paid became a nightmare. I switched to Acme " \
244
+ "Invoice about a year ago and it's been great. Recurring invoices are " \
245
+ "set-and-forget, and the automatic payment reminders saved me so many awkward " \
246
+ "follow-up emails. It's affordable too. https://acme-invoice.com if you want " \
247
+ "to check it out.",
248
+ word_count: 62
249
+ })
250
+
251
+ # Zero verify needed — step's validate blocks already check:
252
+ # comment long enough, no markdown headers, has word count,
253
+ # contains product link, no spam phrases, no banned openings.
254
+ end
255
+
256
+ # ===========================================================================
257
+ # Simulated LLM responses (what a real model would return)
258
+ # ===========================================================================
259
+
260
+ RESPONSES = {
261
+ analyze: {
262
+ product_description: "Simple invoicing and billing platform for freelancers and small businesses",
263
+ locale: "en",
264
+ audience_group_1: "freelance designers and developers",
265
+ audience_group_2: "small business owners under 10 employees",
266
+ audience_group_3: "accountants serving freelance clients"
267
+ },
268
+
269
+ subreddits: {
270
+ product_description: "Simple invoicing and billing platform for freelancers",
271
+ locale: "en",
272
+ subreddit_1: "freelance", subreddit_2: "smallbusiness", subreddit_3: "Entrepreneur",
273
+ thread_title: "What invoicing tool do you use for your freelance business?",
274
+ thread_selftext: "I've been using spreadsheets but it's getting out of hand. " \
275
+ "Need something for recurring invoices and payment reminders.",
276
+ thread_subreddit: "freelance",
277
+ thread_language: "en"
278
+ },
279
+
280
+ classify: {
281
+ classification: "PROMO", relevance_score: 9,
282
+ reasoning: "Thread directly asks for invoicing tool — perfect fit",
283
+ thread_title: "What invoicing tool do you use for your freelance business?",
284
+ thread_language: "en"
285
+ },
286
+
287
+ plan: {
288
+ approach: "Share personal experience with invoicing frustration, then mention Acme Invoice",
289
+ tone: "casual",
290
+ key_points: '["empathize with spreadsheet pain","mention recurring invoices",' \
291
+ '"highlight payment reminders","note affordability"]',
292
+ link_strategy: "Drop link naturally after mentioning the tool by name",
293
+ thread_title: "What invoicing tool do you use for your freelance business?"
294
+ },
295
+
296
+ comment: {
297
+ comment: "I was in the exact same boat — spreadsheets worked until I had more than " \
298
+ "10 clients, then tracking who paid became a nightmare. I switched to Acme " \
299
+ "Invoice about a year ago and it's been great. Recurring invoices are " \
300
+ "set-and-forget, and the automatic payment reminders saved me so many awkward " \
301
+ "follow-up emails. It's affordable too. https://acme-invoice.com if you want " \
302
+ "to check it out.",
303
+ word_count: 62
304
+ }
305
+ }.freeze
306
+
307
+ # ===========================================================================
308
+ # Run — Pipeline.test with named responses (no adapter setup needed)
309
+ # ===========================================================================
310
+
311
+ result = RedditPromoPipeline.test(
312
+ "https://acme-invoice.com — Simple invoicing for freelancers",
313
+ responses: RESPONSES
314
+ )
315
+
316
+ # ===========================================================================
317
+ # Results
318
+ # ===========================================================================
319
+
320
+ puts result
321
+ # Pipeline: ok 5 steps 0ms 0+0 tokens $0.000000 trace=...
322
+ # analyze ok gpt-4.1-mini 0ms 0+0 tokens $0.000000
323
+ # subreddits ok gpt-4.1-mini 0ms 0+0 tokens $0.000000
324
+ # classify ok gpt-4.1-nano 0ms 0+0 tokens $0.000000
325
+ # plan ok gpt-4.1-nano 0ms 0+0 tokens $0.000000
326
+ # comment ok gpt-4.1-nano 0ms 0+0 tokens $0.000000
327
+ # (costs are $0 here because Test adapter reports 0 tokens —
328
+ # with a real LLM you'd see actual costs from model registry)
329
+
330
+ puts
331
+
332
+ result.pretty_print
333
+ # +----------------------------------------------------------------------------------+
334
+ # | Pipeline: ok 5 steps 0ms ... |
335
+ # +----------------+------------+----------------------------------------------------+
336
+ # | Step | Status | Output |
337
+ # +----------------+------------+----------------------------------------------------+
338
+ # | analyze | ok | product_description: Simple invoicing and billi... |
339
+ # | | | locale: en |
340
+ # | | | audience_group_1: freelance designers and devel... |
341
+ # +----------------+------------+----------------------------------------------------+
342
+ # | subreddits | ok | subreddit_1: freelance |
343
+ # | | | thread_title: What invoicing tool do you use fo... |
344
+ # +----------------+------------+----------------------------------------------------+
345
+ # | classify | ok | classification: PROMO |
346
+ # | | | relevance_score: 9 |
347
+ # | | | reasoning: Thread directly asks for invoicing t... |
348
+ # +----------------+------------+----------------------------------------------------+
349
+ # | plan | ok | approach: Share personal experience with invoic... |
350
+ # | | | tone: casual |
351
+ # +----------------+------------+----------------------------------------------------+
352
+ # | comment | ok | comment: I was in the exact same boat — spreads... |
353
+ # | | | word_count: 62 |
354
+ # +----------------------------------------------------------------------------------+
355
+
356
+ puts
357
+
358
+ # ===========================================================================
359
+ # Quality check — zero setup, eval has its own sample_response
360
+ # ===========================================================================
361
+
362
+ puts GenerateComment.run_eval("smoke")
363
+ # smoke: 1/1 checks passed