ruby_llm-contract 0.8.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +64 -0
- data/Gemfile.lock +2 -2
- data/README.md +96 -37
- data/lib/ruby_llm/contract/adapters/ruby_llm.rb +9 -1
- data/lib/ruby_llm/contract/concerns/eval_host.rb +6 -9
- data/lib/ruby_llm/contract/concerns/stub_helpers.rb +97 -0
- data/lib/ruby_llm/contract/contract/definition.rb +2 -0
- data/lib/ruby_llm/contract/cost_calculator.rb +11 -2
- data/lib/ruby_llm/contract/eval/recommender.rb +3 -1
- data/lib/ruby_llm/contract/eval/retry_optimizer.rb +16 -13
- data/lib/ruby_llm/contract/eval.rb +13 -0
- data/lib/ruby_llm/contract/minitest.rb +6 -108
- data/lib/ruby_llm/contract/pipeline/result.rb +1 -1
- data/lib/ruby_llm/contract/rake_task/suite_gate.rb +117 -0
- data/lib/ruby_llm/contract/rake_task.rb +30 -51
- data/lib/ruby_llm/contract/rspec/helpers.rb +9 -123
- data/lib/ruby_llm/contract/step/base.rb +56 -24
- data/lib/ruby_llm/contract/step/dsl.rb +91 -63
- data/lib/ruby_llm/contract/step/limit_checker.rb +34 -1
- data/lib/ruby_llm/contract/step/retry_executor.rb +6 -13
- data/lib/ruby_llm/contract/step/runner.rb +22 -20
- data/lib/ruby_llm/contract/step/runner_config.rb +26 -0
- data/lib/ruby_llm/contract/version.rb +1 -1
- data/lib/ruby_llm/contract.rb +1 -0
- data/ruby_llm-contract.gemspec +5 -1
- metadata +3 -4
- data/.rspec +0 -3
- data/.rubycritic.yml +0 -8
- data/.simplecov +0 -22
|
@@ -5,6 +5,8 @@ require "ruby_llm/contract"
|
|
|
5
5
|
module RubyLLM
|
|
6
6
|
module Contract
|
|
7
7
|
module MinitestHelpers
|
|
8
|
+
include Concerns::StubHelpers
|
|
9
|
+
|
|
8
10
|
# Snapshot adapter before each test so teardown can restore it.
|
|
9
11
|
def setup
|
|
10
12
|
super if defined?(super)
|
|
@@ -47,114 +49,10 @@ module RubyLLM
|
|
|
47
49
|
report
|
|
48
50
|
end
|
|
49
51
|
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
# Supports an optional block form — the override is removed after the
|
|
56
|
-
# block returns (even if it raises):
|
|
57
|
-
#
|
|
58
|
-
# stub_step(ClassifyTicket, response: data) do
|
|
59
|
-
# result = ClassifyTicket.run("test")
|
|
60
|
-
# end
|
|
61
|
-
# # ClassifyTicket.run no longer stubbed
|
|
62
|
-
#
|
|
63
|
-
def stub_step(step_class, response: nil, responses: nil, &block)
|
|
64
|
-
adapter = if responses
|
|
65
|
-
Adapters::Test.new(responses: responses)
|
|
66
|
-
else
|
|
67
|
-
Adapters::Test.new(response: response)
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
overrides = RubyLLM::Contract.step_adapter_overrides
|
|
71
|
-
previous = overrides[step_class]
|
|
72
|
-
overrides[step_class] = adapter
|
|
73
|
-
|
|
74
|
-
if block
|
|
75
|
-
begin
|
|
76
|
-
yield
|
|
77
|
-
ensure
|
|
78
|
-
if previous
|
|
79
|
-
overrides[step_class] = previous
|
|
80
|
-
else
|
|
81
|
-
overrides.delete(step_class)
|
|
82
|
-
end
|
|
83
|
-
end
|
|
84
|
-
end
|
|
85
|
-
end
|
|
86
|
-
|
|
87
|
-
# Stub multiple steps at once with different responses.
|
|
88
|
-
# Takes a hash of step_class => options. Requires a block.
|
|
89
|
-
#
|
|
90
|
-
# stub_steps(
|
|
91
|
-
# ClassifyTicket => { response: { priority: "high" } },
|
|
92
|
-
# RouteToTeam => { response: { team: "billing" } }
|
|
93
|
-
# ) do
|
|
94
|
-
# result = TicketPipeline.run("test")
|
|
95
|
-
# end
|
|
96
|
-
#
|
|
97
|
-
def stub_steps(stubs, &block)
|
|
98
|
-
raise ArgumentError, "stub_steps requires a block" unless block
|
|
99
|
-
|
|
100
|
-
overrides = RubyLLM::Contract.step_adapter_overrides
|
|
101
|
-
previous = {}
|
|
102
|
-
|
|
103
|
-
stubs.each do |step_class, opts|
|
|
104
|
-
opts = opts.transform_keys(&:to_sym)
|
|
105
|
-
adapter = if opts[:responses]
|
|
106
|
-
Adapters::Test.new(responses: opts[:responses])
|
|
107
|
-
else
|
|
108
|
-
Adapters::Test.new(response: opts[:response])
|
|
109
|
-
end
|
|
110
|
-
previous[step_class] = overrides[step_class]
|
|
111
|
-
overrides[step_class] = adapter
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
begin
|
|
115
|
-
yield
|
|
116
|
-
ensure
|
|
117
|
-
stubs.each_key do |step_class|
|
|
118
|
-
if previous[step_class]
|
|
119
|
-
overrides[step_class] = previous[step_class]
|
|
120
|
-
else
|
|
121
|
-
overrides.delete(step_class)
|
|
122
|
-
end
|
|
123
|
-
end
|
|
124
|
-
end
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
# Set a global test adapter for ALL steps.
|
|
128
|
-
#
|
|
129
|
-
# stub_all_steps(response: { default: true })
|
|
130
|
-
#
|
|
131
|
-
# Supports an optional block form — the previous adapter is restored
|
|
132
|
-
# after the block returns (even if it raises):
|
|
133
|
-
#
|
|
134
|
-
# stub_all_steps(response: { default: true }) do
|
|
135
|
-
# # all steps use test adapter
|
|
136
|
-
# end
|
|
137
|
-
# # original adapter restored
|
|
138
|
-
#
|
|
139
|
-
def stub_all_steps(response: nil, responses: nil, &block)
|
|
140
|
-
adapter = if responses
|
|
141
|
-
Adapters::Test.new(responses: responses)
|
|
142
|
-
else
|
|
143
|
-
Adapters::Test.new(response: response)
|
|
144
|
-
end
|
|
145
|
-
|
|
146
|
-
if block
|
|
147
|
-
previous = RubyLLM::Contract.configuration.default_adapter
|
|
148
|
-
begin
|
|
149
|
-
RubyLLM::Contract.configuration.default_adapter = adapter
|
|
150
|
-
yield
|
|
151
|
-
ensure
|
|
152
|
-
RubyLLM::Contract.configuration.default_adapter = previous
|
|
153
|
-
end
|
|
154
|
-
else
|
|
155
|
-
RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
|
|
156
|
-
end
|
|
157
|
-
end
|
|
52
|
+
# `stub_step`, `stub_steps`, `stub_all_steps` — provided by
|
|
53
|
+
# `Concerns::StubHelpers` (included above). Shared implementation
|
|
54
|
+
# used by both Minitest and RSpec hosts; documentation and method
|
|
55
|
+
# signatures live in `concerns/stub_helpers.rb`.
|
|
158
56
|
end
|
|
159
57
|
end
|
|
160
58
|
end
|
|
@@ -108,7 +108,7 @@ module RubyLLM
|
|
|
108
108
|
trace = step_result.trace
|
|
109
109
|
status = step_status(step_result)
|
|
110
110
|
trace_str = trace.respond_to?(:to_s) ? trace.to_s : ""
|
|
111
|
-
" #{step_record[:alias].to_s.ljust(
|
|
111
|
+
" #{step_record[:alias].to_s.ljust(COL1)} #{status.ljust(COL2)} #{trace_str}"
|
|
112
112
|
end
|
|
113
113
|
|
|
114
114
|
def step_status(step_result)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
class RakeTask < ::Rake::TaskLib
|
|
6
|
+
# Encapsulates the pass/fail gate that runs after `RakeTask#define_task`
|
|
7
|
+
# has collected eval reports. Extracted from the prior `define_task`
|
|
8
|
+
# god-method so each gating dimension (cost, score, regression) is
|
|
9
|
+
# testable in isolation.
|
|
10
|
+
#
|
|
11
|
+
# Returns a `Verdict` value object with:
|
|
12
|
+
# - `passed?` — overall gate verdict
|
|
13
|
+
# - `abort_reason` — String for `abort` when `passed? == false`, nil otherwise
|
|
14
|
+
# - `passed_reports` — [[host, report], ...] of reports that individually passed
|
|
15
|
+
# (used to decide which baselines to save)
|
|
16
|
+
# - `suite_cost` — total cost across all reports
|
|
17
|
+
#
|
|
18
|
+
# Gate ordering (preserved from pre-refactor behaviour):
|
|
19
|
+
# 1. cost gate runs FIRST — if `maximum_cost` set and exceeded, the
|
|
20
|
+
# suite aborts before any score check; passed_reports is empty.
|
|
21
|
+
# 2. score gate runs per-report; a report passes if
|
|
22
|
+
# `report_meets_score?` AND `!check_regression`.
|
|
23
|
+
# 3. overall passed = ALL reports passed AND cost gate not tripped.
|
|
24
|
+
class SuiteGate
|
|
25
|
+
Verdict = Data.define(:passed, :abort_reason, :passed_reports, :suite_cost) do
|
|
26
|
+
def passed?
|
|
27
|
+
passed
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def self.evaluate(host_reports:, minimum_score:, maximum_cost:, fail_on_regression:)
|
|
32
|
+
new(host_reports: host_reports,
|
|
33
|
+
minimum_score: minimum_score,
|
|
34
|
+
maximum_cost: maximum_cost,
|
|
35
|
+
fail_on_regression: fail_on_regression).verdict
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
attr_reader :verdict
|
|
39
|
+
|
|
40
|
+
def initialize(host_reports:, minimum_score:, maximum_cost:, fail_on_regression:)
|
|
41
|
+
@host_reports = host_reports
|
|
42
|
+
@minimum_score = minimum_score
|
|
43
|
+
@maximum_cost = maximum_cost
|
|
44
|
+
@fail_on_regression = fail_on_regression
|
|
45
|
+
@verdict = build_verdict
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
def build_verdict
|
|
51
|
+
suite_cost = compute_suite_cost
|
|
52
|
+
|
|
53
|
+
if cost_exceeded?(suite_cost)
|
|
54
|
+
return Verdict.new(
|
|
55
|
+
passed: false,
|
|
56
|
+
abort_reason: cost_abort_message(suite_cost),
|
|
57
|
+
passed_reports: [],
|
|
58
|
+
suite_cost: suite_cost
|
|
59
|
+
)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
passed_reports, all_passed = score_each_report
|
|
63
|
+
Verdict.new(
|
|
64
|
+
passed: all_passed,
|
|
65
|
+
abort_reason: all_passed ? nil : "Eval suite FAILED",
|
|
66
|
+
passed_reports: passed_reports,
|
|
67
|
+
suite_cost: suite_cost
|
|
68
|
+
)
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def compute_suite_cost
|
|
72
|
+
@host_reports.sum { |_host, report| report.total_cost }
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def cost_exceeded?(suite_cost)
|
|
76
|
+
@maximum_cost && suite_cost > @maximum_cost
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def cost_abort_message(suite_cost)
|
|
80
|
+
"total cost $#{format("%.4f", suite_cost)} exceeds budget $#{format("%.4f", @maximum_cost)}"
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def score_each_report
|
|
84
|
+
passed_reports = []
|
|
85
|
+
all_passed = true
|
|
86
|
+
@host_reports.each do |host, report|
|
|
87
|
+
report_ok = report_meets_score?(report) && !check_regression(report)
|
|
88
|
+
all_passed = false unless report_ok
|
|
89
|
+
passed_reports << [host, report] if report_ok
|
|
90
|
+
end
|
|
91
|
+
[passed_reports, all_passed]
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def report_meets_score?(report)
|
|
95
|
+
if @minimum_score
|
|
96
|
+
report.score >= @minimum_score
|
|
97
|
+
else
|
|
98
|
+
report.passed?
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
def check_regression(report)
|
|
103
|
+
return false unless @fail_on_regression && report.baseline_exists?
|
|
104
|
+
|
|
105
|
+
diff = report.compare_with_baseline
|
|
106
|
+
if diff.regressed?
|
|
107
|
+
puts "\n REGRESSIONS DETECTED:"
|
|
108
|
+
puts " #{diff}"
|
|
109
|
+
true
|
|
110
|
+
else
|
|
111
|
+
false
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "rake"
|
|
4
4
|
require "rake/tasklib"
|
|
5
|
+
require_relative "rake_task/suite_gate"
|
|
5
6
|
|
|
6
7
|
module RubyLLM
|
|
7
8
|
module Contract
|
|
@@ -33,71 +34,49 @@ module RubyLLM
|
|
|
33
34
|
RubyLLM::Contract.load_evals!(*@eval_dirs)
|
|
34
35
|
|
|
35
36
|
context = @context.respond_to?(:call) ? @context.call : @context
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
if results.empty?
|
|
39
|
-
if @fail_on_empty
|
|
40
|
-
abort "No evals defined. Define evals with define_eval or set fail_on_empty = false."
|
|
41
|
-
else
|
|
42
|
-
puts "No evals defined."
|
|
43
|
-
next
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
gate_passed = true
|
|
48
|
-
suite_cost = 0.0
|
|
49
|
-
|
|
50
|
-
passed_reports = []
|
|
51
|
-
all_reports = []
|
|
52
|
-
|
|
53
|
-
results.each do |host, reports|
|
|
54
|
-
puts "\n#{host.name || host.to_s}"
|
|
55
|
-
reports.each_value do |report|
|
|
56
|
-
report.print_summary
|
|
57
|
-
suite_cost += report.total_cost
|
|
58
|
-
all_reports << [host, report]
|
|
59
|
-
report_ok = report_meets_score?(report) && !check_regression(report)
|
|
60
|
-
gate_passed = false unless report_ok
|
|
61
|
-
passed_reports << report if report_ok
|
|
62
|
-
end
|
|
63
|
-
end
|
|
37
|
+
host_reports = collect_host_reports(context)
|
|
38
|
+
next unless host_reports # empty path already handled
|
|
64
39
|
|
|
65
40
|
# Save history BEFORE gating — failures are valuable trend data (ADR-0016 F3)
|
|
66
|
-
save_all_history!(
|
|
41
|
+
save_all_history!(host_reports, context) if @track_history
|
|
67
42
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
43
|
+
verdict = SuiteGate.evaluate(
|
|
44
|
+
host_reports: host_reports,
|
|
45
|
+
minimum_score: @minimum_score,
|
|
46
|
+
maximum_cost: @maximum_cost,
|
|
47
|
+
fail_on_regression: @fail_on_regression
|
|
48
|
+
)
|
|
72
49
|
|
|
73
|
-
abort "\nEval suite FAILED" unless gate_passed
|
|
50
|
+
abort "\nEval suite FAILED: #{verdict.abort_reason}" unless verdict.passed?
|
|
74
51
|
|
|
75
52
|
# Save baselines only after ALL gates pass
|
|
76
|
-
passed_reports.each { |r| save_baseline!(r) } if @save_baseline
|
|
53
|
+
verdict.passed_reports.each { |_host, r| save_baseline!(r) } if @save_baseline
|
|
77
54
|
|
|
78
55
|
puts "\nAll evals passed."
|
|
79
56
|
end
|
|
80
57
|
end
|
|
81
58
|
|
|
82
|
-
def report_meets_score?(report)
|
|
83
|
-
if @minimum_score
|
84
|
-
report.score >= @minimum_score
|
|
85
|
-
else
|
|
86
|
-
report.passed?
|
|
87
|
-
end
|
|
88
|
-
end
|
|
59
|
+
def collect_host_reports(context)
|
|
60
|
+
results = RubyLLM::Contract.run_all_evals(context: context)
|
|
89
61
|
|
|
90
|
-
|
|
91
|
-
|
|
62
|
+
if results.empty?
|
|
63
|
+
if @fail_on_empty
|
|
64
|
+
abort "No evals defined. Define evals with define_eval or set fail_on_empty = false."
|
|
65
|
+
else
|
|
66
|
+
puts "No evals defined."
|
|
67
|
+
return nil
|
|
68
|
+
end
|
|
69
|
+
end
|
|
92
70
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
puts "\n
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
71
|
+
host_reports = []
|
|
72
|
+
results.each do |host, reports|
|
|
73
|
+
puts "\n#{host.name || host.to_s}"
|
|
74
|
+
reports.each_value do |report|
|
|
75
|
+
report.print_summary
|
|
76
|
+
host_reports << [host, report]
|
|
77
|
+
end
|
|
100
78
|
end
|
|
79
|
+
host_reports
|
|
101
80
|
end
|
|
102
81
|
|
|
103
82
|
def save_baseline!(report)
|
|
@@ -3,130 +3,16 @@
|
|
|
3
3
|
module RubyLLM
|
|
4
4
|
module Contract
|
|
5
5
|
module RSpec
|
|
6
|
+
# `stub_step`, `stub_steps`, `stub_all_steps` — provided by
|
|
7
|
+
# `Concerns::StubHelpers`. Shared implementation used by both RSpec
|
|
8
|
+
# and Minitest hosts; documentation and method signatures live in
|
|
9
|
+
# `concerns/stub_helpers.rb`.
|
|
10
|
+
#
|
|
11
|
+
# Cleanup between examples is handled by the `around(:each)` hook
|
|
12
|
+
# in `lib/ruby_llm/contract/rspec.rb`, which snapshots and restores
|
|
13
|
+
# `step_adapter_overrides` plus `configuration.default_adapter`.
|
|
6
14
|
module Helpers
|
|
7
|
-
|
|
8
|
-
#
|
|
9
|
-
# stub_step(ClassifyTicket, response: { priority: "high" })
|
|
10
|
-
# result = ClassifyTicket.run("test")
|
|
11
|
-
# result.parsed_output # => {priority: "high"}
|
|
12
|
-
#
|
|
13
|
-
# Only affects the specified step — other steps are not affected.
|
|
14
|
-
#
|
|
15
|
-
# With a block, the stub is scoped — cleaned up after the block:
|
|
16
|
-
#
|
|
17
|
-
# stub_step(ClassifyTicket, response: data) do
|
|
18
|
-
# # only stubbed inside this block
|
|
19
|
-
# end
|
|
20
|
-
# # ClassifyTicket no longer stubbed
|
|
21
|
-
#
|
|
22
|
-
# Without a block, the stub lives until the RSpec example ends.
|
|
23
|
-
#
|
|
24
|
-
def stub_step(step_class, response: nil, responses: nil, &block)
|
|
25
|
-
adapter = build_test_adapter(response: response, responses: responses)
|
|
26
|
-
|
|
27
|
-
if block
|
|
28
|
-
# Block form: use thread-local overrides with save/restore for real scoping
|
|
29
|
-
overrides = RubyLLM::Contract.step_adapter_overrides
|
|
30
|
-
previous = overrides[step_class]
|
|
31
|
-
overrides[step_class] = adapter
|
|
32
|
-
begin
|
|
33
|
-
yield
|
|
34
|
-
ensure
|
|
35
|
-
if previous
|
|
36
|
-
overrides[step_class] = previous
|
|
37
|
-
else
|
|
38
|
-
overrides.delete(step_class)
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
else
|
|
42
|
-
# Non-block: use RSpec allow (auto-cleaned after example)
|
|
43
|
-
allow(step_class).to receive(:run).and_wrap_original do |original, input, **kwargs|
|
|
44
|
-
context = kwargs[:context] || {}
|
|
45
|
-
unless context.key?(:adapter) || context.key?("adapter")
|
|
46
|
-
context = context.merge(adapter: adapter)
|
|
47
|
-
end
|
|
48
|
-
original.call(input, context: context)
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
# Stub multiple steps at once with different responses.
|
|
54
|
-
# Takes a hash of step_class => options. Requires a block.
|
|
55
|
-
#
|
|
56
|
-
# stub_steps(
|
|
57
|
-
# ClassifyTicket => { response: { priority: "high" } },
|
|
58
|
-
# RouteToTeam => { response: { team: "billing" } }
|
|
59
|
-
# ) do
|
|
60
|
-
# result = TicketPipeline.run("test")
|
|
61
|
-
# end
|
|
62
|
-
#
|
|
63
|
-
def stub_steps(stubs, &block)
|
|
64
|
-
raise ArgumentError, "stub_steps requires a block" unless block
|
|
65
|
-
|
|
66
|
-
overrides = RubyLLM::Contract.step_adapter_overrides
|
|
67
|
-
previous = {}
|
|
68
|
-
|
|
69
|
-
stubs.each do |step_class, opts|
|
|
70
|
-
opts = opts.transform_keys(&:to_sym)
|
|
71
|
-
adapter = build_test_adapter(**opts)
|
|
72
|
-
previous[step_class] = overrides[step_class]
|
|
73
|
-
overrides[step_class] = adapter
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
begin
|
|
77
|
-
yield
|
|
78
|
-
ensure
|
|
79
|
-
stubs.each_key do |step_class|
|
|
80
|
-
if previous[step_class]
|
|
81
|
-
overrides[step_class] = previous[step_class]
|
|
82
|
-
else
|
|
83
|
-
overrides.delete(step_class)
|
|
84
|
-
end
|
|
85
|
-
end
|
|
86
|
-
end
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
# Set a global test adapter for ALL steps.
|
|
90
|
-
#
|
|
91
|
-
# stub_all_steps(response: { default: true })
|
|
92
|
-
#
|
|
93
|
-
# Supports an optional block form — the previous adapter is restored
|
|
94
|
-
# after the block returns (even if it raises):
|
|
95
|
-
#
|
|
96
|
-
# stub_all_steps(response: { default: true }) do
|
|
97
|
-
# # all steps use test adapter
|
|
98
|
-
# end
|
|
99
|
-
# # original adapter restored
|
|
100
|
-
#
|
|
101
|
-
def stub_all_steps(response: nil, responses: nil, &block)
|
|
102
|
-
adapter = build_test_adapter(response: response, responses: responses)
|
|
103
|
-
|
|
104
|
-
if block
|
|
105
|
-
previous = RubyLLM::Contract.configuration.default_adapter
|
|
106
|
-
begin
|
|
107
|
-
RubyLLM::Contract.configuration.default_adapter = adapter
|
|
108
|
-
yield
|
|
109
|
-
ensure
|
|
110
|
-
RubyLLM::Contract.configuration.default_adapter = previous
|
|
111
|
-
end
|
|
112
|
-
else
|
|
113
|
-
RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
|
|
114
|
-
end
|
|
115
|
-
end
|
|
116
|
-
|
|
117
|
-
private
|
|
118
|
-
|
|
119
|
-
def build_test_adapter(response: nil, responses: nil)
|
|
120
|
-
if responses
|
|
121
|
-
Adapters::Test.new(responses: responses.map { |r| normalize_test_response(r) })
|
|
122
|
-
else
|
|
123
|
-
Adapters::Test.new(response: normalize_test_response(response))
|
|
124
|
-
end
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
def normalize_test_response(value)
|
|
128
|
-
value
|
|
129
|
-
end
|
|
15
|
+
include Concerns::StubHelpers
|
|
130
16
|
end
|
|
131
17
|
end
|
|
132
18
|
end
|
|
@@ -21,19 +21,30 @@ module RubyLLM
|
|
|
21
21
|
context: context).results.first
|
|
22
22
|
end
|
|
23
23
|
|
|
24
|
-
def estimate_cost(input:, model: nil)
|
|
24
|
+
def estimate_cost(input:, model: nil, attachment: nil)
|
|
25
25
|
model_name = estimated_model_name(model)
|
|
26
|
-
model_info = CostCalculator.
|
|
26
|
+
model_info = CostCalculator.find_model(model_name)
|
|
27
27
|
return nil unless model_info
|
|
28
28
|
|
|
29
|
-
|
|
29
|
+
text_tokens = TokenEstimator.estimate(build_messages(input))
|
|
30
|
+
attachment_tokens, attachment_error = resolve_attachment_tokens(attachment)
|
|
31
|
+
return nil if attachment_error
|
|
32
|
+
|
|
33
|
+
input_tokens = text_tokens + attachment_tokens
|
|
34
|
+
# NOTE: attachment tokens add to input only, not output. Vision-
|
|
35
|
+
# heavy outputs (long image descriptions) may exceed
|
|
36
|
+
# `output_tokens_estimate` — this method is a floor for budget
|
|
37
|
+
# planning, not a precise predictor. See multimodal_input.md.
|
|
30
38
|
output_tokens = max_output || DEFAULT_OUTPUT_TOKENS
|
|
31
39
|
|
|
32
40
|
{
|
|
33
41
|
model: model_name,
|
|
34
42
|
input_tokens: input_tokens,
|
|
35
43
|
output_tokens_estimate: output_tokens,
|
|
36
|
-
estimated_cost:
|
|
44
|
+
estimated_cost: CostCalculator.calculate(
|
|
45
|
+
model_name: model_name,
|
|
46
|
+
usage: { input_tokens: input_tokens, output_tokens: output_tokens }
|
|
47
|
+
)
|
|
37
48
|
}
|
|
38
49
|
end
|
|
39
50
|
|
|
@@ -49,7 +60,9 @@ module RubyLLM
|
|
|
49
60
|
end
|
|
50
61
|
end
|
|
51
62
|
|
|
52
|
-
def recommend(eval_name, candidates:,
|
|
63
|
+
def recommend(eval_name, candidates:, context: {},
|
|
64
|
+
min_score: Eval::DEFAULT_MIN_SCORE,
|
|
65
|
+
min_first_try_pass_rate: Eval::DEFAULT_MIN_FIRST_TRY_PASS_RATE)
|
|
53
66
|
comparison = compare_models(eval_name, candidates: candidates, context: context)
|
|
54
67
|
Eval::Recommender.new(
|
|
55
68
|
comparison: comparison,
|
|
@@ -59,7 +72,9 @@ module RubyLLM
|
|
|
59
72
|
).recommend
|
|
60
73
|
end
|
|
61
74
|
|
|
62
|
-
def optimize_retry_policy(candidates:, context: {},
|
|
75
|
+
def optimize_retry_policy(candidates:, context: {},
|
|
76
|
+
min_score: Eval::DEFAULT_MIN_SCORE,
|
|
77
|
+
runs: 1, production_mode: nil)
|
|
63
78
|
Eval::RetryOptimizer.new(
|
|
64
79
|
step: self,
|
|
65
80
|
candidates: candidates,
|
|
@@ -71,7 +86,7 @@ module RubyLLM
|
|
|
71
86
|
end
|
|
72
87
|
|
|
73
88
|
KNOWN_CONTEXT_KEYS = %i[adapter model temperature max_tokens provider assume_model_exists
|
|
74
|
-
reasoning_effort retry_policy_override].freeze
|
|
89
|
+
reasoning_effort retry_policy_override attachment].freeze
|
|
75
90
|
|
|
76
91
|
include Concerns::ContextHelpers
|
|
77
92
|
|
|
@@ -104,12 +119,23 @@ module RubyLLM
|
|
|
104
119
|
model || (self.model if respond_to?(:model)) || RubyLLM::Contract.configuration.default_model
|
|
105
120
|
end
|
|
106
121
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
122
|
+
# Returns [tokens, error?] where error is true when fail-closed should
|
|
123
|
+
# short-circuit the caller. Mirrors limit_checker.rb fail-closed policy
|
|
124
|
+
# so estimate_cost and runtime check_limits agree on the same input.
|
|
125
|
+
def resolve_attachment_tokens(attachment)
|
|
126
|
+
return [0, false] if attachment.nil?
|
|
127
|
+
|
|
128
|
+
estimate = attachment_token_estimate if respond_to?(:attachment_token_estimate)
|
|
129
|
+
return [estimate, false] unless estimate.nil?
|
|
130
|
+
|
|
131
|
+
mode = respond_to?(:on_unknown_attachment_size) ? on_unknown_attachment_size : :refuse
|
|
132
|
+
if mode == :warn
|
|
133
|
+
warn "[ruby_llm-contract] attachment present but attachment_token_estimate not " \
|
|
134
|
+
"declared on #{name || self} — estimate_cost proceeds without attachment cost"
|
|
135
|
+
return [0, false]
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
[0, true]
|
|
113
139
|
end
|
|
114
140
|
|
|
115
141
|
def estimate_eval_cost_for_model(cases, model_name)
|
|
@@ -159,7 +185,7 @@ module RubyLLM
|
|
|
159
185
|
|
|
160
186
|
def runtime_settings(context)
|
|
161
187
|
policy = context.key?(:retry_policy_override) ? context[:retry_policy_override] : retry_policy
|
|
162
|
-
extra = context.slice(:provider, :assume_model_exists, :max_tokens, :reasoning_effort)
|
|
188
|
+
extra = context.slice(:provider, :assume_model_exists, :max_tokens, :reasoning_effort, :attachment)
|
|
163
189
|
|
|
164
190
|
# Always pass the class-level `thinking` config to the adapter when
|
|
165
191
|
# set, so fields like `budget` survive a per-call `reasoning_effort`
|
|
@@ -215,18 +241,9 @@ module RubyLLM
|
|
|
215
241
|
# programmer bugs (NoMethodError, adapter-code ArgumentError) must propagate
|
|
216
242
|
# instead of being silently masked as :input_error.
|
|
217
243
|
def run_once(input, adapter:, model:, context_temperature: nil, extra_options: {})
|
|
218
|
-
effective_temp = context_temperature || temperature
|
|
219
244
|
runner =
|
|
220
245
|
begin
|
|
221
|
-
Runner.new(
|
|
222
|
-
input_type: input_type, output_type: output_type,
|
|
223
|
-
prompt_block: prompt, contract_definition: effective_contract,
|
|
224
|
-
adapter: adapter, model: model, output_schema: output_schema,
|
|
225
|
-
max_output: max_output, max_input: max_input, max_cost: max_cost,
|
|
226
|
-
on_unknown_pricing: on_unknown_pricing,
|
|
227
|
-
temperature: effective_temp, extra_options: extra_options,
|
|
228
|
-
observers: class_observers
|
|
229
|
-
)
|
|
246
|
+
Runner.new(config: build_runner_config(adapter, model, context_temperature, extra_options))
|
|
230
247
|
rescue ArgumentError => e
|
|
231
248
|
return Result.new(status: :input_error, raw_output: nil, parsed_output: nil,
|
|
232
249
|
validation_errors: [e.message])
|
|
@@ -235,6 +252,21 @@ module RubyLLM
|
|
|
235
252
|
runner.call(input)
|
|
236
253
|
end
|
|
237
254
|
|
|
255
|
+
def build_runner_config(adapter, model, context_temperature, extra_options)
|
|
256
|
+
RunnerConfig.build(
|
|
257
|
+
input_type: input_type, output_type: output_type,
|
|
258
|
+
prompt_block: prompt, contract_definition: effective_contract,
|
|
259
|
+
adapter: adapter, model: model, output_schema: output_schema,
|
|
260
|
+
max_output: max_output, max_input: max_input, max_cost: max_cost,
|
|
261
|
+
on_unknown_pricing: on_unknown_pricing,
|
|
262
|
+
attachment_token_estimate: attachment_token_estimate,
|
|
263
|
+
on_unknown_attachment_size: on_unknown_attachment_size,
|
|
264
|
+
temperature: context_temperature || temperature,
|
|
265
|
+
extra_options: extra_options,
|
|
266
|
+
observers: class_observers
|
|
267
|
+
)
|
|
268
|
+
end
|
|
269
|
+
|
|
238
270
|
def log_result(result)
|
|
239
271
|
logger = RubyLLM::Contract.configuration.logger
|
|
240
272
|
return unless logger
|