ruby_llm-contract 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/Gemfile.lock +2 -2
- data/README.md +27 -2
- data/lib/ruby_llm/contract/adapters/response.rb +4 -2
- data/lib/ruby_llm/contract/adapters/test.rb +3 -2
- data/lib/ruby_llm/contract/concerns/deep_freeze.rb +23 -0
- data/lib/ruby_llm/contract/concerns/eval_host.rb +10 -2
- data/lib/ruby_llm/contract/eval/baseline_diff.rb +88 -0
- data/lib/ruby_llm/contract/eval/dataset.rb +11 -4
- data/lib/ruby_llm/contract/eval/eval_definition.rb +11 -10
- data/lib/ruby_llm/contract/eval/model_comparison.rb +1 -1
- data/lib/ruby_llm/contract/eval/report.rb +71 -2
- data/lib/ruby_llm/contract/eval/runner.rb +3 -2
- data/lib/ruby_llm/contract/eval.rb +1 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +1 -1
- data/lib/ruby_llm/contract/pipeline/runner.rb +1 -1
- data/lib/ruby_llm/contract/pipeline/trace.rb +3 -2
- data/lib/ruby_llm/contract/prompt/node.rb +2 -2
- data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +2 -2
- data/lib/ruby_llm/contract/rake_task.rb +31 -4
- data/lib/ruby_llm/contract/rspec/helpers.rb +28 -8
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +23 -2
- data/lib/ruby_llm/contract/step/base.rb +10 -5
- data/lib/ruby_llm/contract/step/dsl.rb +1 -1
- data/lib/ruby_llm/contract/step/retry_executor.rb +3 -2
- data/lib/ruby_llm/contract/step/retry_policy.rb +6 -0
- data/lib/ruby_llm/contract/step/runner.rb +3 -1
- data/lib/ruby_llm/contract/step/trace.rb +5 -4
- data/lib/ruby_llm/contract/version.rb +1 -1
- data/lib/ruby_llm/contract.rb +21 -18
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b032109a7818caa3f68cae651f9f99210765d4257825f52a332944a6120ad522
|
|
4
|
+
data.tar.gz: 8f4c1bb95cbcf79236723e100becf8c8f2b87061bd7c29827152e4d716a99ce3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e84f8e58367e2eae1ea6a0a712e125be6b3edb361ce6feca984c659f15ca11ce658143adf7fdfcd09f5c1ff57d09fad31e431320f780dd08da7ab7499dd9b961
|
|
7
|
+
data.tar.gz: 29c98d8fb09a92df1a88136d7c67094784fdf2ae01ae9ec1aaa3fc5f1cd589fd27c7139c84663ba9e49c89e5537f98480eb451076c8a00dffcccfc3bf062f5d8
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.0 (2026-03-23)
|
|
4
|
+
|
|
5
|
+
Baseline regression detection — know when quality drops before users do.
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
- **`report.save_baseline!`** — serialize eval results to `.eval_baselines/` (JSON, git-tracked)
|
|
10
|
+
- **`report.compare_with_baseline`** — returns `BaselineDiff` with regressions, improvements, score_delta, new/removed cases
|
|
11
|
+
- **`diff.regressed?`** — true when any previously-passing case now fails
|
|
12
|
+
- **`without_regressions` RSpec chain** — `expect(Step).to pass_eval("x").without_regressions`
|
|
13
|
+
- **RakeTask `fail_on_regression`** — blocks CI when regressions detected
|
|
14
|
+
- **RakeTask `save_baseline`** — auto-save after successful run
|
|
15
|
+
- **Migration guide** — `docs/guide/migration.md` with 7 patterns for adopting the gem in existing Rails apps
|
|
16
|
+
|
|
17
|
+
### Stats
|
|
18
|
+
|
|
19
|
+
- 1086 tests, 0 failures
|
|
20
|
+
|
|
3
21
|
## 0.2.3 (2026-03-23)
|
|
4
22
|
|
|
5
23
|
Production hardening from senior Rails review panel.
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
ruby_llm-contract (0.2.3)
|
|
4
|
+
ruby_llm-contract (0.3.0)
|
|
5
5
|
dry-types (~> 1.7)
|
|
6
6
|
ruby_llm (~> 1.0)
|
|
7
7
|
ruby_llm-schema (~> 0.3)
|
|
@@ -165,7 +165,7 @@ CHECKSUMS
|
|
|
165
165
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
166
166
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
167
167
|
ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
|
|
168
|
-
ruby_llm-contract (0.2.3)
|
|
168
|
+
ruby_llm-contract (0.3.0)
|
|
169
169
|
ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
|
|
170
170
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
171
171
|
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
data/README.md
CHANGED
|
@@ -111,6 +111,30 @@ end
|
|
|
111
111
|
# bundle exec rake ruby_llm_contract:eval
|
|
112
112
|
```
|
|
113
113
|
|
|
114
|
+
## Detect quality drops
|
|
115
|
+
|
|
116
|
+
Save a baseline. Next run, see what regressed.
|
|
117
|
+
|
|
118
|
+
```ruby
|
|
119
|
+
report = ClassifyTicket.run_eval("regression", context: { model: "gpt-4.1-nano" })
|
|
120
|
+
report.save_baseline!(model: "gpt-4.1-nano")
|
|
121
|
+
|
|
122
|
+
# Later — after prompt change, model update, or provider weight shift:
|
|
123
|
+
report = ClassifyTicket.run_eval("regression", context: { model: "gpt-4.1-nano" })
|
|
124
|
+
diff = report.compare_with_baseline(model: "gpt-4.1-nano")
|
|
125
|
+
|
|
126
|
+
diff.regressed? # => true
|
|
127
|
+
diff.regressions # => [{case: "outage", baseline: {passed: true}, current: {passed: false}}]
|
|
128
|
+
diff.score_delta # => -0.33
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
```ruby
|
|
132
|
+
# CI: block merge if any previously-passing case now fails
|
|
133
|
+
expect(ClassifyTicket).to pass_eval("regression")
|
|
134
|
+
.with_context(model: "gpt-4.1-nano")
|
|
135
|
+
.without_regressions
|
|
136
|
+
```
|
|
137
|
+
|
|
114
138
|
## Predict cost before running
|
|
115
139
|
|
|
116
140
|
```ruby
|
|
@@ -140,12 +164,13 @@ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc).
|
|
|
140
164
|
| [Output Schema](docs/guide/output_schema.md) | Full schema reference + constraints |
|
|
141
165
|
| [Pipeline](docs/guide/pipeline.md) | Multi-step composition, timeout, fail-fast |
|
|
142
166
|
| [Testing](docs/guide/testing.md) | Test adapter, RSpec matchers |
|
|
167
|
+
| [Migration](docs/guide/migration.md) | Adopting the gem in existing Rails apps |
|
|
143
168
|
|
|
144
169
|
## Roadmap
|
|
145
170
|
|
|
146
|
-
**v0.
|
|
171
|
+
**v0.3 (current):** Baseline regression detection — `save_baseline!`, `compare_with_baseline`, `without_regressions`. Migration guide.
|
|
147
172
|
|
|
148
|
-
**v0.
|
|
173
|
+
**v0.2:** Model comparison, cost tracking, eval with `add_case`, CI gating, Rails Railtie.
|
|
149
174
|
|
|
150
175
|
**v0.4:** Auto-routing — learn which model works for which input pattern.
|
|
151
176
|
|
|
@@ -4,11 +4,13 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Adapters
|
|
6
6
|
class Response
|
|
7
|
+
include Concerns::DeepFreeze
|
|
8
|
+
|
|
7
9
|
attr_reader :content, :usage
|
|
8
10
|
|
|
9
11
|
def initialize(content:, usage: {})
|
|
10
|
-
@content = content
|
|
11
|
-
@usage = usage
|
|
12
|
+
@content = deep_dup_freeze(content)
|
|
13
|
+
@usage = deep_dup_freeze(usage)
|
|
12
14
|
freeze
|
|
13
15
|
end
|
|
14
16
|
end
|
|
@@ -4,8 +4,9 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Adapters
|
|
6
6
|
class Test < Base
|
|
7
|
-
def initialize(response: nil, responses: nil)
|
|
7
|
+
def initialize(response: nil, responses: nil, usage: nil)
|
|
8
8
|
super()
|
|
9
|
+
@usage = (usage || { input_tokens: 0, output_tokens: 0 }).dup.freeze
|
|
9
10
|
if responses
|
|
10
11
|
raise ArgumentError, "responses: must not be empty (use response: nil for nil content)" if responses.empty?
|
|
11
12
|
|
|
@@ -36,7 +37,7 @@ module RubyLLM
|
|
|
36
37
|
else
|
|
37
38
|
@response
|
|
38
39
|
end
|
|
39
|
-
Response.new(content: content, usage:
|
|
40
|
+
Response.new(content: content, usage: @usage)
|
|
40
41
|
end
|
|
41
42
|
end
|
|
42
43
|
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Concerns
|
|
6
|
+
# Deep-duplicate and freeze a value. Creates an independent frozen copy
|
|
7
|
+
# without mutating the original. Handles Hash, Array, String recursively.
|
|
8
|
+
module DeepFreeze
|
|
9
|
+
private
|
|
10
|
+
|
|
11
|
+
def deep_dup_freeze(obj)
|
|
12
|
+
case obj
|
|
13
|
+
when NilClass, Integer, Float, Symbol, TrueClass, FalseClass then obj
|
|
14
|
+
when Hash then obj.transform_values { |v| deep_dup_freeze(v) }.freeze
|
|
15
|
+
when Array then obj.map { |v| deep_dup_freeze(v) }.freeze
|
|
16
|
+
when String then obj.frozen? ? obj : obj.dup.freeze
|
|
17
|
+
else obj.frozen? ? obj : obj.dup.freeze
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -6,6 +6,7 @@ module RubyLLM
|
|
|
6
6
|
module EvalHost
|
|
7
7
|
def define_eval(name, &)
|
|
8
8
|
@eval_definitions ||= {}
|
|
9
|
+
@file_sourced_evals ||= Set.new
|
|
9
10
|
key = name.to_s
|
|
10
11
|
|
|
11
12
|
if @eval_definitions.key?(key) && !Thread.current[:ruby_llm_contract_reloading]
|
|
@@ -14,12 +15,16 @@ module RubyLLM
|
|
|
14
15
|
end
|
|
15
16
|
|
|
16
17
|
@eval_definitions[key] = Eval::EvalDefinition.new(key, step_class: self, &)
|
|
18
|
+
@file_sourced_evals.add(key) if Thread.current[:ruby_llm_contract_reloading]
|
|
17
19
|
Contract.register_eval_host(self)
|
|
18
20
|
register_subclasses(self)
|
|
19
21
|
end
|
|
20
22
|
|
|
21
|
-
def
|
|
22
|
-
@
|
|
23
|
+
def clear_file_sourced_evals!
|
|
24
|
+
return unless defined?(@file_sourced_evals) && defined?(@eval_definitions)
|
|
25
|
+
|
|
26
|
+
@file_sourced_evals.each { |key| @eval_definitions.delete(key) }
|
|
27
|
+
@file_sourced_evals.clear
|
|
23
28
|
end
|
|
24
29
|
|
|
25
30
|
def eval_names
|
|
@@ -31,6 +36,7 @@ module RubyLLM
|
|
|
31
36
|
end
|
|
32
37
|
|
|
33
38
|
def run_eval(name = nil, context: {})
|
|
39
|
+
context ||= {}
|
|
34
40
|
if name
|
|
35
41
|
run_single_eval(name, context)
|
|
36
42
|
else
|
|
@@ -39,6 +45,7 @@ module RubyLLM
|
|
|
39
45
|
end
|
|
40
46
|
|
|
41
47
|
def compare_models(eval_name, models:, context: {})
|
|
48
|
+
context ||= {}
|
|
42
49
|
reports = models.each_with_object({}) do |model, hash|
|
|
43
50
|
model_context = deep_dup_context(context).merge(model: model)
|
|
44
51
|
hash[model] = run_single_eval(eval_name, model_context)
|
|
@@ -75,6 +82,7 @@ module RubyLLM
|
|
|
75
82
|
end
|
|
76
83
|
|
|
77
84
|
def eval_context(defn, context)
|
|
85
|
+
context = (context || {}).transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }
|
|
78
86
|
return context if context[:adapter]
|
|
79
87
|
|
|
80
88
|
sample_adapter = defn.build_adapter
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
class BaselineDiff
|
|
7
|
+
attr_reader :baseline_score, :current_score
|
|
8
|
+
|
|
9
|
+
def initialize(baseline_cases:, current_cases:)
|
|
10
|
+
@baseline = index_by_name(baseline_cases)
|
|
11
|
+
@current = index_by_name(current_cases)
|
|
12
|
+
@baseline_score = baseline_cases.empty? ? 0.0 : baseline_cases.sum { |c| c[:score] } / baseline_cases.length
|
|
13
|
+
@current_score = current_cases.empty? ? 0.0 : current_cases.sum { |c| c[:score] } / current_cases.length
|
|
14
|
+
freeze
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def regressions
|
|
18
|
+
@baseline.filter_map do |name, baseline|
|
|
19
|
+
current = @current[name]
|
|
20
|
+
next unless current
|
|
21
|
+
next unless baseline[:passed] && !current[:passed]
|
|
22
|
+
|
|
23
|
+
{
|
|
24
|
+
case: name,
|
|
25
|
+
baseline: { passed: baseline[:passed], score: baseline[:score] },
|
|
26
|
+
current: { passed: current[:passed], score: current[:score] },
|
|
27
|
+
detail: current[:details]
|
|
28
|
+
}
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def improvements
|
|
33
|
+
@baseline.filter_map do |name, baseline|
|
|
34
|
+
current = @current[name]
|
|
35
|
+
next unless current
|
|
36
|
+
next unless !baseline[:passed] && current[:passed]
|
|
37
|
+
|
|
38
|
+
{
|
|
39
|
+
case: name,
|
|
40
|
+
baseline: { passed: baseline[:passed], score: baseline[:score] },
|
|
41
|
+
current: { passed: current[:passed], score: current[:score] }
|
|
42
|
+
}
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def score_delta
|
|
47
|
+
(current_score - baseline_score).round(4)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def regressed?
|
|
51
|
+
regressions.any?
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def improved?
|
|
55
|
+
improvements.any?
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def new_cases
|
|
59
|
+
(@current.keys - @baseline.keys)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def removed_cases
|
|
63
|
+
(@baseline.keys - @current.keys)
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def to_s
|
|
67
|
+
lines = ["Score: #{baseline_score.round(2)} → #{current_score.round(2)} (#{format_delta})"]
|
|
68
|
+
regressions.each { |r| lines << " REGRESSED #{r[:case]}: #{r[:detail]}" }
|
|
69
|
+
improvements.each { |r| lines << " IMPROVED #{r[:case]}" }
|
|
70
|
+
new_cases.each { |c| lines << " NEW #{c}" }
|
|
71
|
+
removed_cases.each { |c| lines << " REMOVED #{c}" }
|
|
72
|
+
lines.join("\n")
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
private
|
|
76
|
+
|
|
77
|
+
def index_by_name(cases)
|
|
78
|
+
cases.each_with_object({}) { |c, h| h[c[:name]] = c }
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def format_delta
|
|
82
|
+
d = score_delta
|
|
83
|
+
d >= 0 ? "+#{d}" : d.to_s
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
@@ -23,8 +23,13 @@ module RubyLLM
|
|
|
23
23
|
# dataset.case "name", input: {...}, expected_traits: {...}
|
|
24
24
|
# dataset.case "name", input: {...}, evaluator: proc
|
|
25
25
|
def add_case(name = nil, input:, expected: nil, expected_traits: nil, evaluator: nil)
|
|
26
|
+
case_name = name || "case_#{@cases.length + 1}"
|
|
27
|
+
if @cases.any? { |c| c.name == case_name }
|
|
28
|
+
raise ArgumentError, "Duplicate case name '#{case_name}'. Case names must be unique within a dataset."
|
|
29
|
+
end
|
|
30
|
+
|
|
26
31
|
@cases << Case.new(
|
|
27
|
-
name:
|
|
32
|
+
name: case_name,
|
|
28
33
|
input: input,
|
|
29
34
|
expected: expected,
|
|
30
35
|
expected_traits: expected_traits,
|
|
@@ -37,13 +42,15 @@ module RubyLLM
|
|
|
37
42
|
end
|
|
38
43
|
|
|
39
44
|
class Case
|
|
45
|
+
include Concerns::DeepFreeze
|
|
46
|
+
|
|
40
47
|
attr_reader :name, :input, :expected, :expected_traits, :evaluator
|
|
41
48
|
|
|
42
49
|
def initialize(name:, input:, expected: nil, expected_traits: nil, evaluator: nil)
|
|
43
50
|
@name = name
|
|
44
|
-
@input = input
|
|
45
|
-
@expected = expected
|
|
46
|
-
@expected_traits = expected_traits
|
|
51
|
+
@input = deep_dup_freeze(input)
|
|
52
|
+
@expected = deep_dup_freeze(expected)
|
|
53
|
+
@expected_traits = deep_dup_freeze(expected_traits)
|
|
47
54
|
@evaluator = evaluator
|
|
48
55
|
freeze
|
|
49
56
|
end
|
|
@@ -21,18 +21,19 @@ module RubyLLM
|
|
|
21
21
|
|
|
22
22
|
def sample_response(response)
|
|
23
23
|
@sample_response = response
|
|
24
|
+
@has_sample_response = true
|
|
24
25
|
pre_validate_sample! if @step_class
|
|
25
26
|
end
|
|
26
27
|
|
|
27
28
|
def build_adapter
|
|
28
|
-
return nil unless @
|
|
29
|
+
return nil unless defined?(@has_sample_response) && @has_sample_response
|
|
29
30
|
|
|
30
|
-
Adapters::Test.new(response: @sample_response
|
|
31
|
+
Adapters::Test.new(response: @sample_response)
|
|
31
32
|
end
|
|
32
33
|
|
|
33
34
|
def add_case(description, input: nil, expected: nil, expected_traits: nil, evaluator: nil)
|
|
34
|
-
case_input = input
|
|
35
|
-
raise ArgumentError, "add_case requires input (set default_input or pass input:)"
|
|
35
|
+
case_input = input.nil? ? @default_input : input
|
|
36
|
+
raise ArgumentError, "add_case requires input (set default_input or pass input:)" if case_input.nil?
|
|
36
37
|
|
|
37
38
|
@cases << {
|
|
38
39
|
name: description,
|
|
@@ -44,12 +45,12 @@ module RubyLLM
|
|
|
44
45
|
end
|
|
45
46
|
|
|
46
47
|
def verify(description, expected_or_proc = nil, input: nil, expect: nil)
|
|
47
|
-
if expected_or_proc && expect
|
|
48
|
+
if !expected_or_proc.nil? && !expect.nil?
|
|
48
49
|
raise ArgumentError, "verify accepts either a positional argument or expect: keyword, not both"
|
|
49
50
|
end
|
|
50
51
|
|
|
51
|
-
expected_or_proc = expect
|
|
52
|
-
case_input = input
|
|
52
|
+
expected_or_proc = expect unless expect.nil?
|
|
53
|
+
case_input = input.nil? ? @default_input : input
|
|
53
54
|
validate_verify_args!(expected_or_proc, case_input)
|
|
54
55
|
|
|
55
56
|
evaluator = expected_or_proc.is_a?(::Proc) ? expected_or_proc : nil
|
|
@@ -78,15 +79,15 @@ module RubyLLM
|
|
|
78
79
|
|
|
79
80
|
def effective_cases
|
|
80
81
|
return @cases if @cases.any?
|
|
81
|
-
return []
|
|
82
|
+
return [] if @default_input.nil?
|
|
82
83
|
|
|
83
84
|
# Zero-verify: auto-add a contract check case
|
|
84
85
|
[{ name: "contract check", input: @default_input, expected: nil, evaluator: nil }]
|
|
85
86
|
end
|
|
86
87
|
|
|
87
88
|
def validate_verify_args!(expected_or_proc, case_input)
|
|
88
|
-
raise ArgumentError, "verify requires either a positional argument or expect: keyword"
|
|
89
|
-
raise ArgumentError, "verify requires input (set default_input or pass input:)"
|
|
89
|
+
raise ArgumentError, "verify requires either a positional argument or expect: keyword" if expected_or_proc.nil?
|
|
90
|
+
raise ArgumentError, "verify requires input (set default_input or pass input:)" if case_input.nil?
|
|
90
91
|
end
|
|
91
92
|
|
|
92
93
|
def pre_validate_sample!
|
|
@@ -1,14 +1,18 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "json"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
3
6
|
module RubyLLM
|
|
4
7
|
module Contract
|
|
5
8
|
module Eval
|
|
6
9
|
class Report
|
|
7
10
|
attr_reader :dataset_name, :results
|
|
8
11
|
|
|
9
|
-
def initialize(dataset_name:, results:)
|
|
12
|
+
def initialize(dataset_name:, results:, step_name: nil)
|
|
10
13
|
@dataset_name = dataset_name
|
|
11
|
-
@
|
|
14
|
+
@step_name = step_name
|
|
15
|
+
@results = results.dup.freeze
|
|
12
16
|
freeze
|
|
13
17
|
end
|
|
14
18
|
|
|
@@ -78,6 +82,29 @@ module RubyLLM
|
|
|
78
82
|
lines.join("\n")
|
|
79
83
|
end
|
|
80
84
|
|
|
85
|
+
def save_baseline!(path: nil, model: nil)
|
|
86
|
+
file = path || default_baseline_path(model: model)
|
|
87
|
+
FileUtils.mkdir_p(File.dirname(file))
|
|
88
|
+
File.write(file, JSON.pretty_generate(serialize_for_baseline))
|
|
89
|
+
file
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def compare_with_baseline(path: nil, model: nil)
|
|
93
|
+
file = path || default_baseline_path(model: model)
|
|
94
|
+
raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
|
|
95
|
+
|
|
96
|
+
baseline_data = JSON.parse(File.read(file), symbolize_names: true)
|
|
97
|
+
validate_baseline!(baseline_data)
|
|
98
|
+
BaselineDiff.new(
|
|
99
|
+
baseline_cases: baseline_data[:cases],
|
|
100
|
+
current_cases: evaluated_results.map { |r| serialize_case(r) }
|
|
101
|
+
)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def baseline_exists?(path: nil, model: nil)
|
|
105
|
+
File.exist?(path || default_baseline_path(model: model))
|
|
106
|
+
end
|
|
107
|
+
|
|
81
108
|
def print_summary(io = $stdout)
|
|
82
109
|
io.puts summary
|
|
83
110
|
io.puts
|
|
@@ -106,6 +133,48 @@ module RubyLLM
|
|
|
106
133
|
results.reject { |r| r.step_status == :skipped }
|
|
107
134
|
end
|
|
108
135
|
|
|
136
|
+
def default_baseline_path(model: nil)
|
|
137
|
+
parts = [".eval_baselines"]
|
|
138
|
+
parts << sanitize_name(@step_name) if @step_name
|
|
139
|
+
name = sanitize_name(dataset_name)
|
|
140
|
+
name = "#{name}_#{sanitize_name(model)}" if model
|
|
141
|
+
parts << "#{name}.json"
|
|
142
|
+
File.join(*parts)
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def validate_baseline!(data)
|
|
146
|
+
if data[:dataset_name] && data[:dataset_name] != dataset_name
|
|
147
|
+
raise ArgumentError, "Baseline eval '#{data[:dataset_name]}' does not match '#{dataset_name}'"
|
|
148
|
+
end
|
|
149
|
+
if data[:step_name] && @step_name && data[:step_name] != @step_name
|
|
150
|
+
raise ArgumentError, "Baseline step '#{data[:step_name]}' does not match '#{@step_name}'"
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
def sanitize_name(name)
|
|
155
|
+
name.to_s.gsub(/[^a-zA-Z0-9_-]/, "_")
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def serialize_for_baseline
|
|
159
|
+
{
|
|
160
|
+
dataset_name: dataset_name,
|
|
161
|
+
step_name: @step_name,
|
|
162
|
+
score: score,
|
|
163
|
+
total_cost: total_cost,
|
|
164
|
+
cases: evaluated_results.map { |r| serialize_case(r) }
|
|
165
|
+
}
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def serialize_case(result)
|
|
169
|
+
{
|
|
170
|
+
name: result.name,
|
|
171
|
+
passed: result.passed?,
|
|
172
|
+
score: result.score,
|
|
173
|
+
details: result.details,
|
|
174
|
+
cost: result.cost
|
|
175
|
+
}
|
|
176
|
+
end
|
|
177
|
+
|
|
109
178
|
def format_cost(cost)
|
|
110
179
|
"$#{format("%.6f", cost)}"
|
|
111
180
|
end
|
|
@@ -19,7 +19,8 @@ module RubyLLM
|
|
|
19
19
|
|
|
20
20
|
def run
|
|
21
21
|
results = @dataset.cases.map { |test_case| evaluate_case(test_case) }
|
|
22
|
-
|
|
22
|
+
step_name = @step.respond_to?(:name) ? @step.name : @step.to_s
|
|
23
|
+
Report.new(dataset_name: @dataset.name, results: results, step_name: step_name)
|
|
23
24
|
end
|
|
24
25
|
|
|
25
26
|
private
|
|
@@ -81,7 +82,7 @@ module RubyLLM
|
|
|
81
82
|
evaluate_with_custom(step_result, test_case)
|
|
82
83
|
elsif test_case.expected_traits
|
|
83
84
|
evaluate_traits(step_result, test_case)
|
|
84
|
-
elsif test_case.expected
|
|
85
|
+
elsif !test_case.expected.nil?
|
|
85
86
|
evaluate_expected(step_result, test_case)
|
|
86
87
|
else
|
|
87
88
|
evaluate_contract_only
|
|
@@ -5,14 +5,15 @@ module RubyLLM
|
|
|
5
5
|
module Pipeline
|
|
6
6
|
class Trace
|
|
7
7
|
include Concerns::TraceEquality
|
|
8
|
+
include Concerns::DeepFreeze
|
|
8
9
|
|
|
9
10
|
attr_reader :trace_id, :total_latency_ms, :total_usage, :step_traces, :total_cost
|
|
10
11
|
|
|
11
12
|
def initialize(trace_id: nil, total_latency_ms: nil, total_usage: nil, step_traces: nil)
|
|
12
13
|
@trace_id = trace_id
|
|
13
14
|
@total_latency_ms = total_latency_ms
|
|
14
|
-
@total_usage = total_usage
|
|
15
|
-
@step_traces = step_traces
|
|
15
|
+
@total_usage = deep_dup_freeze(total_usage)
|
|
16
|
+
@step_traces = step_traces&.dup&.freeze
|
|
16
17
|
@total_cost = calculate_total_cost
|
|
17
18
|
freeze
|
|
18
19
|
end
|
|
@@ -8,8 +8,8 @@ module RubyLLM
|
|
|
8
8
|
attr_reader :input, :output
|
|
9
9
|
|
|
10
10
|
def initialize(input:, output:)
|
|
11
|
-
@input = input.freeze
|
|
12
|
-
@output = output.freeze
|
|
11
|
+
@input = input.frozen? ? input : input.dup.freeze
|
|
12
|
+
@output = output.frozen? ? output : output.dup.freeze
|
|
13
13
|
super(type: :example, content: nil)
|
|
14
14
|
end
|
|
15
15
|
|
|
@@ -6,7 +6,8 @@ require "rake/tasklib"
|
|
|
6
6
|
module RubyLLM
|
|
7
7
|
module Contract
|
|
8
8
|
class RakeTask < ::Rake::TaskLib
|
|
9
|
-
attr_accessor :name, :context, :fail_on_empty, :minimum_score, :maximum_cost,
|
|
9
|
+
attr_accessor :name, :context, :fail_on_empty, :minimum_score, :maximum_cost,
|
|
10
|
+
:eval_dirs, :save_baseline, :fail_on_regression
|
|
10
11
|
|
|
11
12
|
def initialize(name = :"ruby_llm_contract:eval", &block)
|
|
12
13
|
super()
|
|
@@ -16,6 +17,8 @@ module RubyLLM
|
|
|
16
17
|
@minimum_score = nil # nil = require 100%; float = threshold
|
|
17
18
|
@maximum_cost = nil # nil = no cost limit; float = budget cap (suite-level)
|
|
18
19
|
@eval_dirs = [] # directories to load eval files from (non-Rails)
|
|
20
|
+
@save_baseline = false
|
|
21
|
+
@fail_on_regression = false
|
|
19
22
|
block&.call(self)
|
|
20
23
|
define_task
|
|
21
24
|
end
|
|
@@ -26,8 +29,7 @@ module RubyLLM
|
|
|
26
29
|
desc "Run all ruby_llm-contract evals"
|
|
27
30
|
task(@name => task_prerequisites) do
|
|
28
31
|
require "ruby_llm/contract"
|
|
29
|
-
|
|
30
|
-
RubyLLM::Contract.load_evals!
|
|
32
|
+
RubyLLM::Contract.load_evals!(*@eval_dirs)
|
|
31
33
|
|
|
32
34
|
results = RubyLLM::Contract.run_all_evals(context: @context)
|
|
33
35
|
|
|
@@ -43,12 +45,16 @@ module RubyLLM
|
|
|
43
45
|
gate_passed = true
|
|
44
46
|
suite_cost = 0.0
|
|
45
47
|
|
|
48
|
+
passed_reports = []
|
|
49
|
+
|
|
46
50
|
results.each do |host, reports|
|
|
47
51
|
puts "\n#{host.name || host.to_s}"
|
|
48
52
|
reports.each_value do |report|
|
|
49
53
|
report.print_summary
|
|
50
54
|
suite_cost += report.total_cost
|
|
51
|
-
|
|
55
|
+
report_ok = report_meets_score?(report) && !check_regression(report)
|
|
56
|
+
gate_passed = false unless report_ok
|
|
57
|
+
passed_reports << report if report_ok
|
|
52
58
|
end
|
|
53
59
|
end
|
|
54
60
|
|
|
@@ -58,6 +64,9 @@ module RubyLLM
|
|
|
58
64
|
end
|
|
59
65
|
|
|
60
66
|
abort "\nEval suite FAILED" unless gate_passed
|
|
67
|
+
|
|
68
|
+
# Save baselines only after ALL gates pass
|
|
69
|
+
passed_reports.each { |r| save_baseline!(r) } if @save_baseline
|
|
61
70
|
puts "\nAll evals passed."
|
|
62
71
|
end
|
|
63
72
|
end
|
|
@@ -70,6 +79,24 @@ module RubyLLM
|
|
|
70
79
|
end
|
|
71
80
|
end
|
|
72
81
|
|
|
82
|
+
def check_regression(report)
|
|
83
|
+
return false unless @fail_on_regression && report.baseline_exists?
|
|
84
|
+
|
|
85
|
+
diff = report.compare_with_baseline
|
|
86
|
+
if diff.regressed?
|
|
87
|
+
puts "\n REGRESSIONS DETECTED:"
|
|
88
|
+
puts " #{diff}"
|
|
89
|
+
true
|
|
90
|
+
else
|
|
91
|
+
false
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def save_baseline!(report)
|
|
96
|
+
path = report.save_baseline!
|
|
97
|
+
puts " Baseline saved: #{path}"
|
|
98
|
+
end
|
|
99
|
+
|
|
73
100
|
def task_prerequisites
|
|
74
101
|
Rake::Task.task_defined?(:environment) ? [:environment] : []
|
|
75
102
|
end
|
|
@@ -10,18 +10,38 @@ module RubyLLM
|
|
|
10
10
|
# result = ClassifyTicket.run("test")
|
|
11
11
|
# result.parsed_output # => {priority: "high"}
|
|
12
12
|
#
|
|
13
|
-
#
|
|
14
|
-
# stub_step(ClassifyTicket, responses: [{ a: 1 }, { a: 2 }])
|
|
13
|
+
# Only affects the specified step — other steps are not affected.
|
|
15
14
|
#
|
|
16
15
|
def stub_step(step_class, response: nil, responses: nil)
|
|
17
|
-
adapter =
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
16
|
+
adapter = build_test_adapter(response: response, responses: responses)
|
|
17
|
+
allow(step_class).to receive(:run).and_wrap_original do |original, input, **kwargs|
|
|
18
|
+
context = (kwargs[:context] || {}).merge(adapter: adapter)
|
|
19
|
+
original.call(input, context: context)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Set a global test adapter for ALL steps.
|
|
24
|
+
#
|
|
25
|
+
# stub_all_steps(response: { default: true })
|
|
26
|
+
#
|
|
27
|
+
def stub_all_steps(response: nil, responses: nil)
|
|
28
|
+
adapter = build_test_adapter(response: response, responses: responses)
|
|
23
29
|
RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
|
|
24
30
|
end
|
|
31
|
+
|
|
32
|
+
private
|
|
33
|
+
|
|
34
|
+
def build_test_adapter(response: nil, responses: nil)
|
|
35
|
+
if responses
|
|
36
|
+
Adapters::Test.new(responses: responses.map { |r| normalize_test_response(r) })
|
|
37
|
+
else
|
|
38
|
+
Adapters::Test.new(response: normalize_test_response(response))
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def normalize_test_response(value)
|
|
43
|
+
value
|
|
44
|
+
end
|
|
25
45
|
end
|
|
26
46
|
end
|
|
27
47
|
end
|
|
@@ -64,12 +64,18 @@ RSpec::Matchers.define :pass_eval do |eval_name|
|
|
|
64
64
|
@maximum_cost = cost
|
|
65
65
|
end
|
|
66
66
|
|
|
67
|
+
chain :without_regressions do
|
|
68
|
+
@check_regressions = true
|
|
69
|
+
end
|
|
70
|
+
|
|
67
71
|
match do |step_or_pipeline|
|
|
68
72
|
@eval_name = eval_name
|
|
69
73
|
@context ||= {}
|
|
70
74
|
@minimum_score ||= nil
|
|
71
75
|
@maximum_cost ||= nil
|
|
76
|
+
@check_regressions ||= false
|
|
72
77
|
@error = nil
|
|
78
|
+
@diff = nil
|
|
73
79
|
@report = step_or_pipeline.run_eval(eval_name, context: @context)
|
|
74
80
|
|
|
75
81
|
score_ok = if @minimum_score
|
|
@@ -80,14 +86,29 @@ RSpec::Matchers.define :pass_eval do |eval_name|
|
|
|
80
86
|
|
|
81
87
|
cost_ok = @maximum_cost ? @report.total_cost <= @maximum_cost : true
|
|
82
88
|
|
|
83
|
-
|
|
89
|
+
regression_ok = if @check_regressions && @report.baseline_exists?
|
|
90
|
+
@diff = @report.compare_with_baseline
|
|
91
|
+
!@diff.regressed?
|
|
92
|
+
else
|
|
93
|
+
true
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
score_ok && cost_ok && regression_ok
|
|
84
97
|
rescue StandardError => e
|
|
85
98
|
@error = e
|
|
86
99
|
false
|
|
87
100
|
end
|
|
88
101
|
|
|
89
102
|
failure_message do
|
|
90
|
-
format_failure_message(@eval_name, @error, @report, @minimum_score, @maximum_cost)
|
|
103
|
+
msg = format_failure_message(@eval_name, @error, @report, @minimum_score, @maximum_cost)
|
|
104
|
+
if @diff&.regressed?
|
|
105
|
+
msg += "\n\nRegressions from baseline:\n"
|
|
106
|
+
@diff.regressions.each do |r|
|
|
107
|
+
msg += " #{r[:case]}: was PASS, now FAIL — #{r[:detail]}\n"
|
|
108
|
+
end
|
|
109
|
+
msg += " Score delta: #{@diff.score_delta}"
|
|
110
|
+
end
|
|
111
|
+
msg
|
|
91
112
|
end
|
|
92
113
|
|
|
93
114
|
failure_message_when_negated do
|
|
@@ -58,18 +58,23 @@ module RubyLLM
|
|
|
58
58
|
end
|
|
59
59
|
end
|
|
60
60
|
|
|
61
|
-
KNOWN_CONTEXT_KEYS = %i[adapter model temperature
|
|
61
|
+
KNOWN_CONTEXT_KEYS = %i[adapter model temperature provider assume_model_exists].freeze
|
|
62
62
|
|
|
63
63
|
def run(input, context: {})
|
|
64
|
+
context = (context || {}).transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }
|
|
64
65
|
warn_unknown_context_keys(context)
|
|
65
66
|
adapter = resolve_adapter(context)
|
|
66
67
|
default_model = context[:model] || model || RubyLLM::Contract.configuration.default_model
|
|
67
68
|
policy = retry_policy
|
|
68
69
|
|
|
70
|
+
ctx_temp = context[:temperature]
|
|
71
|
+
extra = context.slice(:provider, :assume_model_exists)
|
|
69
72
|
result = if policy
|
|
70
|
-
run_with_retry(input, adapter: adapter, default_model: default_model,
|
|
73
|
+
run_with_retry(input, adapter: adapter, default_model: default_model,
|
|
74
|
+
policy: policy, context_temperature: ctx_temp, extra_options: extra)
|
|
71
75
|
else
|
|
72
|
-
run_once(input, adapter: adapter, model: default_model,
|
|
76
|
+
run_once(input, adapter: adapter, model: default_model,
|
|
77
|
+
context_temperature: ctx_temp, extra_options: extra)
|
|
73
78
|
end
|
|
74
79
|
|
|
75
80
|
invoke_around_call(input, result)
|
|
@@ -101,14 +106,14 @@ module RubyLLM
|
|
|
101
106
|
"{ |c| c.default_adapter = ... } or pass context: { adapter: ... }"
|
|
102
107
|
end
|
|
103
108
|
|
|
104
|
-
def run_once(input, adapter:, model:, context_temperature: nil)
|
|
109
|
+
def run_once(input, adapter:, model:, context_temperature: nil, extra_options: {})
|
|
105
110
|
effective_temp = context_temperature || temperature
|
|
106
111
|
Runner.new(
|
|
107
112
|
input_type: input_type, output_type: output_type,
|
|
108
113
|
prompt_block: prompt, contract_definition: effective_contract,
|
|
109
114
|
adapter: adapter, model: model, output_schema: output_schema,
|
|
110
115
|
max_output: max_output, max_input: max_input, max_cost: max_cost,
|
|
111
|
-
temperature: effective_temp
|
|
116
|
+
temperature: effective_temp, extra_options: extra_options
|
|
112
117
|
).call(input)
|
|
113
118
|
rescue ArgumentError => e
|
|
114
119
|
Result.new(status: :input_error, raw_output: nil, parsed_output: nil,
|
|
@@ -168,7 +168,7 @@ module RubyLLM
|
|
|
168
168
|
end
|
|
169
169
|
|
|
170
170
|
def retry_policy(models: nil, attempts: nil, retry_on: nil, &block)
|
|
171
|
-
if block || models || attempts
|
|
171
|
+
if block || models || attempts || retry_on
|
|
172
172
|
return @retry_policy = RetryPolicy.new(models: models, attempts: attempts, retry_on: retry_on, &block)
|
|
173
173
|
end
|
|
174
174
|
|
|
@@ -8,12 +8,13 @@ module RubyLLM
|
|
|
8
8
|
module RetryExecutor
|
|
9
9
|
private
|
|
10
10
|
|
|
11
|
-
def run_with_retry(input, adapter:, default_model:, policy:)
|
|
11
|
+
def run_with_retry(input, adapter:, default_model:, policy:, context_temperature: nil, extra_options: {})
|
|
12
12
|
all_attempts = []
|
|
13
13
|
|
|
14
14
|
policy.max_attempts.times do |attempt_index|
|
|
15
15
|
model = policy.model_for_attempt(attempt_index, default_model)
|
|
16
|
-
result = run_once(input, adapter: adapter, model: model)
|
|
16
|
+
result = run_once(input, adapter: adapter, model: model,
|
|
17
|
+
context_temperature: context_temperature, extra_options: extra_options)
|
|
17
18
|
all_attempts << { attempt: attempt_index + 1, model: model, result: result }
|
|
18
19
|
break unless policy.retryable?(result)
|
|
19
20
|
end
|
|
@@ -15,6 +15,7 @@ module RubyLLM
|
|
|
15
15
|
if block
|
|
16
16
|
@max_attempts = 1
|
|
17
17
|
instance_eval(&block)
|
|
18
|
+
warn_no_retry! if @max_attempts == 1 && @models.empty?
|
|
18
19
|
else
|
|
19
20
|
apply_keywords(models: models, attempts: attempts, retry_on: retry_on)
|
|
20
21
|
end
|
|
@@ -65,6 +66,11 @@ module RubyLLM
|
|
|
65
66
|
@retryable_statuses = Array(retry_on).dup if retry_on
|
|
66
67
|
end
|
|
67
68
|
|
|
69
|
+
def warn_no_retry!
|
|
70
|
+
warn "[ruby_llm-contract] retry_policy has max_attempts=1 with no models. " \
|
|
71
|
+
"This means no actual retry will happen. Add `attempts 2` or `escalate %w[model1 model2]`."
|
|
72
|
+
end
|
|
73
|
+
|
|
68
74
|
def validate_max_attempts!
|
|
69
75
|
return if @max_attempts.is_a?(Integer) && @max_attempts >= 1
|
|
70
76
|
|
|
@@ -8,7 +8,7 @@ module RubyLLM
|
|
|
8
8
|
|
|
9
9
|
def initialize(input_type:, output_type:, prompt_block:, contract_definition:,
|
|
10
10
|
adapter:, model:, output_schema: nil, max_output: nil,
|
|
11
|
-
max_input: nil, max_cost: nil, temperature: nil)
|
|
11
|
+
max_input: nil, max_cost: nil, temperature: nil, extra_options: {})
|
|
12
12
|
@input_type = input_type
|
|
13
13
|
@output_type = output_type
|
|
14
14
|
@prompt_block = prompt_block
|
|
@@ -20,6 +20,7 @@ module RubyLLM
|
|
|
20
20
|
@max_input = max_input
|
|
21
21
|
@max_cost = max_cost
|
|
22
22
|
@temperature = temperature
|
|
23
|
+
@extra_options = extra_options
|
|
23
24
|
end
|
|
24
25
|
|
|
25
26
|
def call(input)
|
|
@@ -86,6 +87,7 @@ module RubyLLM
|
|
|
86
87
|
opts[:schema] = @output_schema if @output_schema
|
|
87
88
|
opts[:max_tokens] = @max_output if @max_output
|
|
88
89
|
opts[:temperature] = @temperature if @temperature
|
|
90
|
+
@extra_options.each { |k, v| opts[k] = v unless opts.key?(k) }
|
|
89
91
|
end
|
|
90
92
|
end
|
|
91
93
|
|
|
@@ -5,15 +5,16 @@ module RubyLLM
|
|
|
5
5
|
module Step
|
|
6
6
|
class Trace
|
|
7
7
|
include Concerns::TraceEquality
|
|
8
|
+
include Concerns::DeepFreeze
|
|
8
9
|
|
|
9
10
|
attr_reader :messages, :model, :latency_ms, :usage, :attempts, :cost
|
|
10
11
|
|
|
11
12
|
def initialize(messages: nil, model: nil, latency_ms: nil, usage: nil, attempts: nil, cost: nil)
|
|
12
|
-
@messages = messages
|
|
13
|
-
@model = model
|
|
13
|
+
@messages = deep_dup_freeze(messages)
|
|
14
|
+
@model = model.frozen? ? model : model&.dup&.freeze
|
|
14
15
|
@latency_ms = latency_ms
|
|
15
|
-
@usage = usage
|
|
16
|
-
@attempts = attempts
|
|
16
|
+
@usage = deep_dup_freeze(usage)
|
|
17
|
+
@attempts = deep_dup_freeze(attempts)
|
|
17
18
|
@cost = cost || CostCalculator.calculate(model_name: model, usage: usage)
|
|
18
19
|
freeze
|
|
19
20
|
end
|
data/lib/ruby_llm/contract.rb
CHANGED
|
@@ -40,25 +40,21 @@ module RubyLLM
|
|
|
40
40
|
@eval_hosts = []
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
-
def load_evals!(
|
|
44
|
-
dirs =
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
else
|
|
52
|
-
[]
|
|
53
|
-
end
|
|
43
|
+
def load_evals!(*dirs)
|
|
44
|
+
dirs = dirs.flatten.compact
|
|
45
|
+
if dirs.empty? && defined?(::Rails)
|
|
46
|
+
dirs = %w[app/steps/eval app/contracts/eval].filter_map do |path|
|
|
47
|
+
full = ::Rails.root.join(path)
|
|
48
|
+
full.to_s if full.exist?
|
|
49
|
+
end
|
|
50
|
+
end
|
|
54
51
|
|
|
55
52
|
return if dirs.empty?
|
|
56
53
|
|
|
57
|
-
# Clear
|
|
58
|
-
# Thread-local flag suppresses the "redefining" warning during reload.
|
|
54
|
+
# Clear file-sourced evals ONCE, then load ALL dirs.
|
|
59
55
|
Thread.current[:ruby_llm_contract_reloading] = true
|
|
60
56
|
eval_hosts.each do |host|
|
|
61
|
-
host.
|
|
57
|
+
host.clear_file_sourced_evals! if host.respond_to?(:clear_file_sourced_evals!)
|
|
62
58
|
end
|
|
63
59
|
|
|
64
60
|
dirs.each do |d|
|
|
@@ -70,11 +66,17 @@ module RubyLLM
|
|
|
70
66
|
|
|
71
67
|
private
|
|
72
68
|
|
|
73
|
-
# Filter
|
|
69
|
+
# Filter stale hosts, deduplicate by name (last wins), prune registry in-place
|
|
74
70
|
def live_eval_hosts
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
71
|
+
# Remove hosts without evals
|
|
72
|
+
@eval_hosts&.reject! { |h| !h.respond_to?(:eval_defined?) || !h.eval_defined? }
|
|
73
|
+
|
|
74
|
+
# Deduplicate: if two classes share a name (reload), keep the latest
|
|
75
|
+
seen = {}
|
|
76
|
+
@eval_hosts&.each { |h| seen[h.name || h.object_id] = h }
|
|
77
|
+
@eval_hosts = seen.values
|
|
78
|
+
|
|
79
|
+
@eval_hosts || []
|
|
78
80
|
end
|
|
79
81
|
|
|
80
82
|
def auto_create_adapter!
|
|
@@ -87,6 +89,7 @@ module RubyLLM
|
|
|
87
89
|
end
|
|
88
90
|
end
|
|
89
91
|
|
|
92
|
+
require_relative "contract/concerns/deep_freeze"
|
|
90
93
|
require_relative "contract/concerns/deep_symbolize"
|
|
91
94
|
require_relative "contract/concerns/eval_host"
|
|
92
95
|
require_relative "contract/concerns/trace_equality"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-contract
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.3
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Justyna
|
|
@@ -82,6 +82,7 @@ files:
|
|
|
82
82
|
- lib/ruby_llm/contract/adapters/response.rb
|
|
83
83
|
- lib/ruby_llm/contract/adapters/ruby_llm.rb
|
|
84
84
|
- lib/ruby_llm/contract/adapters/test.rb
|
|
85
|
+
- lib/ruby_llm/contract/concerns/deep_freeze.rb
|
|
85
86
|
- lib/ruby_llm/contract/concerns/deep_symbolize.rb
|
|
86
87
|
- lib/ruby_llm/contract/concerns/eval_host.rb
|
|
87
88
|
- lib/ruby_llm/contract/concerns/trace_equality.rb
|
|
@@ -97,6 +98,7 @@ files:
|
|
|
97
98
|
- lib/ruby_llm/contract/dsl.rb
|
|
98
99
|
- lib/ruby_llm/contract/errors.rb
|
|
99
100
|
- lib/ruby_llm/contract/eval.rb
|
|
101
|
+
- lib/ruby_llm/contract/eval/baseline_diff.rb
|
|
100
102
|
- lib/ruby_llm/contract/eval/case_result.rb
|
|
101
103
|
- lib/ruby_llm/contract/eval/contract_detail_builder.rb
|
|
102
104
|
- lib/ruby_llm/contract/eval/dataset.rb
|