ruby_llm-contract 0.8.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,8 @@ require "ruby_llm/contract"
5
5
  module RubyLLM
6
6
  module Contract
7
7
  module MinitestHelpers
8
+ include Concerns::StubHelpers
9
+
8
10
  # Snapshot adapter before each test so teardown can restore it.
9
11
  def setup
10
12
  super if defined?(super)
@@ -47,114 +49,10 @@ module RubyLLM
47
49
  report
48
50
  end
49
51
 
50
- # Stub a specific step to return a canned response without API calls.
51
- # Routes per-step other steps are not affected.
52
- #
53
- # stub_step(ClassifyTicket, response: { priority: "high" })
54
- #
55
- # Supports an optional block form — the override is removed after the
56
- # block returns (even if it raises):
57
- #
58
- # stub_step(ClassifyTicket, response: data) do
59
- # result = ClassifyTicket.run("test")
60
- # end
61
- # # ClassifyTicket.run no longer stubbed
62
- #
63
- def stub_step(step_class, response: nil, responses: nil, &block)
64
- adapter = if responses
65
- Adapters::Test.new(responses: responses)
66
- else
67
- Adapters::Test.new(response: response)
68
- end
69
-
70
- overrides = RubyLLM::Contract.step_adapter_overrides
71
- previous = overrides[step_class]
72
- overrides[step_class] = adapter
73
-
74
- if block
75
- begin
76
- yield
77
- ensure
78
- if previous
79
- overrides[step_class] = previous
80
- else
81
- overrides.delete(step_class)
82
- end
83
- end
84
- end
85
- end
86
-
87
- # Stub multiple steps at once with different responses.
88
- # Takes a hash of step_class => options. Requires a block.
89
- #
90
- # stub_steps(
91
- # ClassifyTicket => { response: { priority: "high" } },
92
- # RouteToTeam => { response: { team: "billing" } }
93
- # ) do
94
- # result = TicketPipeline.run("test")
95
- # end
96
- #
97
- def stub_steps(stubs, &block)
98
- raise ArgumentError, "stub_steps requires a block" unless block
99
-
100
- overrides = RubyLLM::Contract.step_adapter_overrides
101
- previous = {}
102
-
103
- stubs.each do |step_class, opts|
104
- opts = opts.transform_keys(&:to_sym)
105
- adapter = if opts[:responses]
106
- Adapters::Test.new(responses: opts[:responses])
107
- else
108
- Adapters::Test.new(response: opts[:response])
109
- end
110
- previous[step_class] = overrides[step_class]
111
- overrides[step_class] = adapter
112
- end
113
-
114
- begin
115
- yield
116
- ensure
117
- stubs.each_key do |step_class|
118
- if previous[step_class]
119
- overrides[step_class] = previous[step_class]
120
- else
121
- overrides.delete(step_class)
122
- end
123
- end
124
- end
125
- end
126
-
127
- # Set a global test adapter for ALL steps.
128
- #
129
- # stub_all_steps(response: { default: true })
130
- #
131
- # Supports an optional block form — the previous adapter is restored
132
- # after the block returns (even if it raises):
133
- #
134
- # stub_all_steps(response: { default: true }) do
135
- # # all steps use test adapter
136
- # end
137
- # # original adapter restored
138
- #
139
- def stub_all_steps(response: nil, responses: nil, &block)
140
- adapter = if responses
141
- Adapters::Test.new(responses: responses)
142
- else
143
- Adapters::Test.new(response: response)
144
- end
145
-
146
- if block
147
- previous = RubyLLM::Contract.configuration.default_adapter
148
- begin
149
- RubyLLM::Contract.configuration.default_adapter = adapter
150
- yield
151
- ensure
152
- RubyLLM::Contract.configuration.default_adapter = previous
153
- end
154
- else
155
- RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
156
- end
157
- end
52
+ # `stub_step`, `stub_steps`, `stub_all_steps` provided by
53
+ # `Concerns::StubHelpers` (included above). Shared implementation
54
+ # used by both Minitest and RSpec hosts; documentation and method
55
+ # signatures live in `concerns/stub_helpers.rb`.
158
56
  end
159
57
  end
160
58
  end
@@ -108,7 +108,7 @@ module RubyLLM
108
108
  trace = step_result.trace
109
109
  status = step_status(step_result)
110
110
  trace_str = trace.respond_to?(:to_s) ? trace.to_s : ""
111
- " #{step_record[:alias].to_s.ljust(14)} #{status.ljust(10)} #{trace_str}"
111
+ " #{step_record[:alias].to_s.ljust(COL1)} #{status.ljust(COL2)} #{trace_str}"
112
112
  end
113
113
 
114
114
  def step_status(step_result)
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ class RakeTask < ::Rake::TaskLib
6
+ # Encapsulates the pass/fail gate that runs after `RakeTask#define_task`
7
+ # has collected eval reports. Extracted from the prior `define_task`
8
+ # god-method so each gating dimension (cost, score, regression) is
9
+ # testable in isolation.
10
+ #
11
+ # Returns a `Verdict` value object with:
12
+ # - `passed?` — overall gate verdict
13
+ # - `abort_reason` — String for `abort` when `passed? == false`, nil otherwise
14
+ # - `passed_reports` — [[host, report], ...] of reports that individually passed
15
+ # (used to decide which baselines to save)
16
+ # - `suite_cost` — total cost across all reports
17
+ #
18
+ # Gate ordering (preserved from pre-refactor behaviour):
19
+ # 1. cost gate runs FIRST — if `maximum_cost` set and exceeded, the
20
+ # suite aborts before any score check; passed_reports is empty.
21
+ # 2. score gate runs per-report; a report passes if
22
+ # `report_meets_score?` AND `!check_regression`.
23
+ # 3. overall passed = ALL reports passed AND cost gate not tripped.
24
+ class SuiteGate
25
+ Verdict = Data.define(:passed, :abort_reason, :passed_reports, :suite_cost) do
26
+ def passed?
27
+ passed
28
+ end
29
+ end
30
+
31
+ def self.evaluate(host_reports:, minimum_score:, maximum_cost:, fail_on_regression:)
32
+ new(host_reports: host_reports,
33
+ minimum_score: minimum_score,
34
+ maximum_cost: maximum_cost,
35
+ fail_on_regression: fail_on_regression).verdict
36
+ end
37
+
38
+ attr_reader :verdict
39
+
40
+ def initialize(host_reports:, minimum_score:, maximum_cost:, fail_on_regression:)
41
+ @host_reports = host_reports
42
+ @minimum_score = minimum_score
43
+ @maximum_cost = maximum_cost
44
+ @fail_on_regression = fail_on_regression
45
+ @verdict = build_verdict
46
+ end
47
+
48
+ private
49
+
50
+ def build_verdict
51
+ suite_cost = compute_suite_cost
52
+
53
+ if cost_exceeded?(suite_cost)
54
+ return Verdict.new(
55
+ passed: false,
56
+ abort_reason: cost_abort_message(suite_cost),
57
+ passed_reports: [],
58
+ suite_cost: suite_cost
59
+ )
60
+ end
61
+
62
+ passed_reports, all_passed = score_each_report
63
+ Verdict.new(
64
+ passed: all_passed,
65
+ abort_reason: all_passed ? nil : "Eval suite FAILED",
66
+ passed_reports: passed_reports,
67
+ suite_cost: suite_cost
68
+ )
69
+ end
70
+
71
+ def compute_suite_cost
72
+ @host_reports.sum { |_host, report| report.total_cost }
73
+ end
74
+
75
+ def cost_exceeded?(suite_cost)
76
+ @maximum_cost && suite_cost > @maximum_cost
77
+ end
78
+
79
+ def cost_abort_message(suite_cost)
80
+ "total cost $#{format("%.4f", suite_cost)} exceeds budget $#{format("%.4f", @maximum_cost)}"
81
+ end
82
+
83
+ def score_each_report
84
+ passed_reports = []
85
+ all_passed = true
86
+ @host_reports.each do |host, report|
87
+ report_ok = report_meets_score?(report) && !check_regression(report)
88
+ all_passed = false unless report_ok
89
+ passed_reports << [host, report] if report_ok
90
+ end
91
+ [passed_reports, all_passed]
92
+ end
93
+
94
+ def report_meets_score?(report)
95
+ if @minimum_score
96
+ report.score >= @minimum_score
97
+ else
98
+ report.passed?
99
+ end
100
+ end
101
+
102
+ def check_regression(report)
103
+ return false unless @fail_on_regression && report.baseline_exists?
104
+
105
+ diff = report.compare_with_baseline
106
+ if diff.regressed?
107
+ puts "\n REGRESSIONS DETECTED:"
108
+ puts " #{diff}"
109
+ true
110
+ else
111
+ false
112
+ end
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "rake"
4
4
  require "rake/tasklib"
5
+ require_relative "rake_task/suite_gate"
5
6
 
6
7
  module RubyLLM
7
8
  module Contract
@@ -33,71 +34,49 @@ module RubyLLM
33
34
  RubyLLM::Contract.load_evals!(*@eval_dirs)
34
35
 
35
36
  context = @context.respond_to?(:call) ? @context.call : @context
36
- results = RubyLLM::Contract.run_all_evals(context: context)
37
-
38
- if results.empty?
39
- if @fail_on_empty
40
- abort "No evals defined. Define evals with define_eval or set fail_on_empty = false."
41
- else
42
- puts "No evals defined."
43
- next
44
- end
45
- end
46
-
47
- gate_passed = true
48
- suite_cost = 0.0
49
-
50
- passed_reports = []
51
- all_reports = []
52
-
53
- results.each do |host, reports|
54
- puts "\n#{host.name || host.to_s}"
55
- reports.each_value do |report|
56
- report.print_summary
57
- suite_cost += report.total_cost
58
- all_reports << [host, report]
59
- report_ok = report_meets_score?(report) && !check_regression(report)
60
- gate_passed = false unless report_ok
61
- passed_reports << report if report_ok
62
- end
63
- end
37
+ host_reports = collect_host_reports(context)
38
+ next unless host_reports # empty path already handled
64
39
 
65
40
  # Save history BEFORE gating — failures are valuable trend data (ADR-0016 F3)
66
- save_all_history!(all_reports, context) if @track_history
41
+ save_all_history!(host_reports, context) if @track_history
67
42
 
68
- if @maximum_cost && suite_cost > @maximum_cost
69
- abort "\nEval suite FAILED: total cost $#{format("%.4f", suite_cost)} " \
70
- "exceeds budget $#{format("%.4f", @maximum_cost)}"
71
- end
43
+ verdict = SuiteGate.evaluate(
44
+ host_reports: host_reports,
45
+ minimum_score: @minimum_score,
46
+ maximum_cost: @maximum_cost,
47
+ fail_on_regression: @fail_on_regression
48
+ )
72
49
 
73
- abort "\nEval suite FAILED" unless gate_passed
50
+ abort "\nEval suite FAILED: #{verdict.abort_reason}" unless verdict.passed?
74
51
 
75
52
  # Save baselines only after ALL gates pass
76
- passed_reports.each { |r| save_baseline!(r) } if @save_baseline
53
+ verdict.passed_reports.each { |_host, r| save_baseline!(r) } if @save_baseline
77
54
 
78
55
  puts "\nAll evals passed."
79
56
  end
80
57
  end
81
58
 
82
- def report_meets_score?(report)
83
- if @minimum_score
84
- report.score >= @minimum_score
85
- else
86
- report.passed?
87
- end
88
- end
59
+ def collect_host_reports(context)
60
+ results = RubyLLM::Contract.run_all_evals(context: context)
89
61
 
90
- def check_regression(report)
91
- return false unless @fail_on_regression && report.baseline_exists?
62
+ if results.empty?
63
+ if @fail_on_empty
64
+ abort "No evals defined. Define evals with define_eval or set fail_on_empty = false."
65
+ else
66
+ puts "No evals defined."
67
+ return nil
68
+ end
69
+ end
92
70
 
93
- diff = report.compare_with_baseline
94
- if diff.regressed?
95
- puts "\n REGRESSIONS DETECTED:"
96
- puts " #{diff}"
97
- true
98
- else
99
- false
71
+ host_reports = []
72
+ results.each do |host, reports|
73
+ puts "\n#{host.name || host.to_s}"
74
+ reports.each_value do |report|
75
+ report.print_summary
76
+ host_reports << [host, report]
77
+ end
100
78
  end
79
+ host_reports
101
80
  end
102
81
 
103
82
  def save_baseline!(report)
@@ -3,130 +3,16 @@
3
3
  module RubyLLM
4
4
  module Contract
5
5
  module RSpec
6
+ # `stub_step`, `stub_steps`, `stub_all_steps` — provided by
7
+ # `Concerns::StubHelpers`. Shared implementation used by both RSpec
8
+ # and Minitest hosts; documentation and method signatures live in
9
+ # `concerns/stub_helpers.rb`.
10
+ #
11
+ # Cleanup between examples is handled by the `around(:each)` hook
12
+ # in `lib/ruby_llm/contract/rspec.rb`, which snapshots and restores
13
+ # `step_adapter_overrides` plus `configuration.default_adapter`.
6
14
  module Helpers
7
- # Stub a step to return a canned response without API calls.
8
- #
9
- # stub_step(ClassifyTicket, response: { priority: "high" })
10
- # result = ClassifyTicket.run("test")
11
- # result.parsed_output # => {priority: "high"}
12
- #
13
- # Only affects the specified step — other steps are not affected.
14
- #
15
- # With a block, the stub is scoped — cleaned up after the block:
16
- #
17
- # stub_step(ClassifyTicket, response: data) do
18
- # # only stubbed inside this block
19
- # end
20
- # # ClassifyTicket no longer stubbed
21
- #
22
- # Without a block, the stub lives until the RSpec example ends.
23
- #
24
- def stub_step(step_class, response: nil, responses: nil, &block)
25
- adapter = build_test_adapter(response: response, responses: responses)
26
-
27
- if block
28
- # Block form: use thread-local overrides with save/restore for real scoping
29
- overrides = RubyLLM::Contract.step_adapter_overrides
30
- previous = overrides[step_class]
31
- overrides[step_class] = adapter
32
- begin
33
- yield
34
- ensure
35
- if previous
36
- overrides[step_class] = previous
37
- else
38
- overrides.delete(step_class)
39
- end
40
- end
41
- else
42
- # Non-block: use RSpec allow (auto-cleaned after example)
43
- allow(step_class).to receive(:run).and_wrap_original do |original, input, **kwargs|
44
- context = kwargs[:context] || {}
45
- unless context.key?(:adapter) || context.key?("adapter")
46
- context = context.merge(adapter: adapter)
47
- end
48
- original.call(input, context: context)
49
- end
50
- end
51
- end
52
-
53
- # Stub multiple steps at once with different responses.
54
- # Takes a hash of step_class => options. Requires a block.
55
- #
56
- # stub_steps(
57
- # ClassifyTicket => { response: { priority: "high" } },
58
- # RouteToTeam => { response: { team: "billing" } }
59
- # ) do
60
- # result = TicketPipeline.run("test")
61
- # end
62
- #
63
- def stub_steps(stubs, &block)
64
- raise ArgumentError, "stub_steps requires a block" unless block
65
-
66
- overrides = RubyLLM::Contract.step_adapter_overrides
67
- previous = {}
68
-
69
- stubs.each do |step_class, opts|
70
- opts = opts.transform_keys(&:to_sym)
71
- adapter = build_test_adapter(**opts)
72
- previous[step_class] = overrides[step_class]
73
- overrides[step_class] = adapter
74
- end
75
-
76
- begin
77
- yield
78
- ensure
79
- stubs.each_key do |step_class|
80
- if previous[step_class]
81
- overrides[step_class] = previous[step_class]
82
- else
83
- overrides.delete(step_class)
84
- end
85
- end
86
- end
87
- end
88
-
89
- # Set a global test adapter for ALL steps.
90
- #
91
- # stub_all_steps(response: { default: true })
92
- #
93
- # Supports an optional block form — the previous adapter is restored
94
- # after the block returns (even if it raises):
95
- #
96
- # stub_all_steps(response: { default: true }) do
97
- # # all steps use test adapter
98
- # end
99
- # # original adapter restored
100
- #
101
- def stub_all_steps(response: nil, responses: nil, &block)
102
- adapter = build_test_adapter(response: response, responses: responses)
103
-
104
- if block
105
- previous = RubyLLM::Contract.configuration.default_adapter
106
- begin
107
- RubyLLM::Contract.configuration.default_adapter = adapter
108
- yield
109
- ensure
110
- RubyLLM::Contract.configuration.default_adapter = previous
111
- end
112
- else
113
- RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
114
- end
115
- end
116
-
117
- private
118
-
119
- def build_test_adapter(response: nil, responses: nil)
120
- if responses
121
- Adapters::Test.new(responses: responses.map { |r| normalize_test_response(r) })
122
- else
123
- Adapters::Test.new(response: normalize_test_response(response))
124
- end
125
- end
126
-
127
- def normalize_test_response(value)
128
- value
129
- end
15
+ include Concerns::StubHelpers
130
16
  end
131
17
  end
132
18
  end
@@ -21,19 +21,30 @@ module RubyLLM
21
21
  context: context).results.first
22
22
  end
23
23
 
24
- def estimate_cost(input:, model: nil)
24
+ def estimate_cost(input:, model: nil, attachment: nil)
25
25
  model_name = estimated_model_name(model)
26
- model_info = CostCalculator.send(:find_model, model_name)
26
+ model_info = CostCalculator.find_model(model_name)
27
27
  return nil unless model_info
28
28
 
29
- input_tokens = TokenEstimator.estimate(build_messages(input))
29
+ text_tokens = TokenEstimator.estimate(build_messages(input))
30
+ attachment_tokens, attachment_error = resolve_attachment_tokens(attachment)
31
+ return nil if attachment_error
32
+
33
+ input_tokens = text_tokens + attachment_tokens
34
+ # NOTE: attachment tokens add to input only, not output. Vision-
35
+ # heavy outputs (long image descriptions) may exceed
36
+ # `output_tokens_estimate` — this method is a floor for budget
37
+ # planning, not a precise predictor. See multimodal_input.md.
30
38
  output_tokens = max_output || DEFAULT_OUTPUT_TOKENS
31
39
 
32
40
  {
33
41
  model: model_name,
34
42
  input_tokens: input_tokens,
35
43
  output_tokens_estimate: output_tokens,
36
- estimated_cost: estimated_cost_for(model_info, input_tokens, output_tokens)
44
+ estimated_cost: CostCalculator.calculate(
45
+ model_name: model_name,
46
+ usage: { input_tokens: input_tokens, output_tokens: output_tokens }
47
+ )
37
48
  }
38
49
  end
39
50
 
@@ -49,7 +60,9 @@ module RubyLLM
49
60
  end
50
61
  end
51
62
 
52
- def recommend(eval_name, candidates:, min_score: 0.95, min_first_try_pass_rate: 0.8, context: {})
63
+ def recommend(eval_name, candidates:, context: {},
64
+ min_score: Eval::DEFAULT_MIN_SCORE,
65
+ min_first_try_pass_rate: Eval::DEFAULT_MIN_FIRST_TRY_PASS_RATE)
53
66
  comparison = compare_models(eval_name, candidates: candidates, context: context)
54
67
  Eval::Recommender.new(
55
68
  comparison: comparison,
@@ -59,7 +72,9 @@ module RubyLLM
59
72
  ).recommend
60
73
  end
61
74
 
62
- def optimize_retry_policy(candidates:, context: {}, min_score: 0.95, runs: 1, production_mode: nil)
75
+ def optimize_retry_policy(candidates:, context: {},
76
+ min_score: Eval::DEFAULT_MIN_SCORE,
77
+ runs: 1, production_mode: nil)
63
78
  Eval::RetryOptimizer.new(
64
79
  step: self,
65
80
  candidates: candidates,
@@ -71,7 +86,7 @@ module RubyLLM
71
86
  end
72
87
 
73
88
  KNOWN_CONTEXT_KEYS = %i[adapter model temperature max_tokens provider assume_model_exists
74
- reasoning_effort retry_policy_override].freeze
89
+ reasoning_effort retry_policy_override attachment].freeze
75
90
 
76
91
  include Concerns::ContextHelpers
77
92
 
@@ -104,12 +119,23 @@ module RubyLLM
104
119
  model || (self.model if respond_to?(:model)) || RubyLLM::Contract.configuration.default_model
105
120
  end
106
121
 
107
- def estimated_cost_for(model_info, input_tokens, output_tokens)
108
- CostCalculator.send(
109
- :compute_cost,
110
- model_info,
111
- { input_tokens: input_tokens, output_tokens: output_tokens }
112
- )
122
+ # Returns [tokens, error?] where error is true when fail-closed should
123
+ # short-circuit the caller. Mirrors limit_checker.rb fail-closed policy
124
+ # so estimate_cost and runtime check_limits agree on the same input.
125
+ def resolve_attachment_tokens(attachment)
126
+ return [0, false] if attachment.nil?
127
+
128
+ estimate = attachment_token_estimate if respond_to?(:attachment_token_estimate)
129
+ return [estimate, false] unless estimate.nil?
130
+
131
+ mode = respond_to?(:on_unknown_attachment_size) ? on_unknown_attachment_size : :refuse
132
+ if mode == :warn
133
+ warn "[ruby_llm-contract] attachment present but attachment_token_estimate not " \
134
+ "declared on #{name || self} — estimate_cost proceeds without attachment cost"
135
+ return [0, false]
136
+ end
137
+
138
+ [0, true]
113
139
  end
114
140
 
115
141
  def estimate_eval_cost_for_model(cases, model_name)
@@ -159,7 +185,7 @@ module RubyLLM
159
185
 
160
186
  def runtime_settings(context)
161
187
  policy = context.key?(:retry_policy_override) ? context[:retry_policy_override] : retry_policy
162
- extra = context.slice(:provider, :assume_model_exists, :max_tokens, :reasoning_effort)
188
+ extra = context.slice(:provider, :assume_model_exists, :max_tokens, :reasoning_effort, :attachment)
163
189
 
164
190
  # Always pass the class-level `thinking` config to the adapter when
165
191
  # set, so fields like `budget` survive a per-call `reasoning_effort`
@@ -215,18 +241,9 @@ module RubyLLM
215
241
  # programmer bugs (NoMethodError, adapter-code ArgumentError) must propagate
216
242
  # instead of being silently masked as :input_error.
217
243
  def run_once(input, adapter:, model:, context_temperature: nil, extra_options: {})
218
- effective_temp = context_temperature || temperature
219
244
  runner =
220
245
  begin
221
- Runner.new(
222
- input_type: input_type, output_type: output_type,
223
- prompt_block: prompt, contract_definition: effective_contract,
224
- adapter: adapter, model: model, output_schema: output_schema,
225
- max_output: max_output, max_input: max_input, max_cost: max_cost,
226
- on_unknown_pricing: on_unknown_pricing,
227
- temperature: effective_temp, extra_options: extra_options,
228
- observers: class_observers
229
- )
246
+ Runner.new(config: build_runner_config(adapter, model, context_temperature, extra_options))
230
247
  rescue ArgumentError => e
231
248
  return Result.new(status: :input_error, raw_output: nil, parsed_output: nil,
232
249
  validation_errors: [e.message])
@@ -235,6 +252,21 @@ module RubyLLM
235
252
  runner.call(input)
236
253
  end
237
254
 
255
+ def build_runner_config(adapter, model, context_temperature, extra_options)
256
+ RunnerConfig.build(
257
+ input_type: input_type, output_type: output_type,
258
+ prompt_block: prompt, contract_definition: effective_contract,
259
+ adapter: adapter, model: model, output_schema: output_schema,
260
+ max_output: max_output, max_input: max_input, max_cost: max_cost,
261
+ on_unknown_pricing: on_unknown_pricing,
262
+ attachment_token_estimate: attachment_token_estimate,
263
+ on_unknown_attachment_size: on_unknown_attachment_size,
264
+ temperature: context_temperature || temperature,
265
+ extra_options: extra_options,
266
+ observers: class_observers
267
+ )
268
+ end
269
+
238
270
  def log_result(result)
239
271
  logger = RubyLLM::Contract.configuration.logger
240
272
  return unless logger