ruby_llm-contract 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/Gemfile.lock +2 -2
- data/README.md +3 -3
- data/lib/ruby_llm/contract/adapters/base.rb +6 -0
- data/lib/ruby_llm/contract/adapters/test.rb +14 -0
- data/lib/ruby_llm/contract/concerns/context_helpers.rb +31 -0
- data/lib/ruby_llm/contract/concerns/eval_host.rb +15 -19
- data/lib/ruby_llm/contract/configuration.rb +2 -1
- data/lib/ruby_llm/contract/eval/dataset.rb +6 -4
- data/lib/ruby_llm/contract/eval/eval_definition.rb +5 -3
- data/lib/ruby_llm/contract/eval/eval_history.rb +79 -0
- data/lib/ruby_llm/contract/eval/report.rb +27 -0
- data/lib/ruby_llm/contract/eval/runner.rb +106 -5
- data/lib/ruby_llm/contract/eval.rb +1 -0
- data/lib/ruby_llm/contract/minitest.rb +46 -0
- data/lib/ruby_llm/contract/railtie.rb +10 -4
- data/lib/ruby_llm/contract/step/base.rb +17 -1
- data/lib/ruby_llm/contract/version.rb +1 -1
- data/lib/ruby_llm/contract.rb +2 -4
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a423ef1b370ae97651d256fdc3776bd895d1eebc81a2b1c4adac305292e2a7a0
|
|
4
|
+
data.tar.gz: 685ec9b00a369748ca897e38ae498e26d9fc31644aac8c41f096a704bceadd7d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 34ab0e678a2de57812a7b8391d406cabe3bb13cf0399669a9bcba18609fc69488d0ef4d2e4a6675436da71fc9b67d3f4c9e264e8fdbef475b07d020d9d8b9d34
|
|
7
|
+
data.tar.gz: 3aca7548473e4f6e32df442296344013d6081564b56fb5d0081aefc1ea0ab6129896ed852e723c2678548d265797e3568fc4bcf0ca219ca6e220c9ecba35bc9c
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,27 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.4.0 (2026-03-24)
|
|
4
|
+
|
|
5
|
+
Observability & Scale — see what changed, run it fast, debug it easily.
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
- **Structured logging** — `Contract.configure { |c| c.logger = Rails.logger }`. Auto-logs model, status, latency, tokens, cost on every `step.run`.
|
|
10
|
+
- **Batch eval concurrency** — `run_eval("regression", concurrency: 4)`. Parallel case execution via Concurrent::Future. 4x faster CI for large eval suites.
|
|
11
|
+
- **Eval history & trending** — `report.save_history!` appends to JSONL. `report.eval_history` returns `EvalHistory` with `score_trend`, `drift?`, run-by-run scores.
|
|
12
|
+
- **Pipeline per-step eval** — `add_case(..., step_expectations: { classify: { priority: "high" } })`. See which step in a pipeline regressed.
|
|
13
|
+
- **Minitest support** — `assert_satisfies_contract`, `assert_eval_passes`, `stub_step` for Minitest users. `require "ruby_llm/contract/minitest"`.
|
|
14
|
+
|
|
15
|
+
### Game changer continuity
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
v0.2: "Which model?" → compare_models (snapshot)
|
|
19
|
+
v0.3: "Did it change?" → baseline regression (binary)
|
|
20
|
+
v0.4: "Show me the trend" → eval history (time series)
|
|
21
|
+
"Which step changed?" → pipeline per-step eval
|
|
22
|
+
"Run it fast" → batch concurrency
|
|
23
|
+
```
|
|
24
|
+
|
|
3
25
|
## 0.3.7 (2026-03-24)
|
|
4
26
|
|
|
5
27
|
- **Trait missing key = error** — `expected_traits: { title: 0..5 }` on output `{}` now fails instead of silently passing.
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
ruby_llm-contract (0.3.7)
|
|
4
|
+
ruby_llm-contract (0.4.0)
|
|
5
5
|
dry-types (~> 1.7)
|
|
6
6
|
ruby_llm (~> 1.0)
|
|
7
7
|
ruby_llm-schema (~> 0.3)
|
|
@@ -165,7 +165,7 @@ CHECKSUMS
|
|
|
165
165
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
166
166
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
167
167
|
ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
|
|
168
|
-
ruby_llm-contract (0.3.7)
|
|
168
|
+
ruby_llm-contract (0.4.0)
|
|
169
169
|
ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
|
|
170
170
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
171
171
|
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
data/README.md
CHANGED
|
@@ -168,11 +168,11 @@ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc).
|
|
|
168
168
|
|
|
169
169
|
## Roadmap
|
|
170
170
|
|
|
171
|
-
**v0.
|
|
171
|
+
**v0.4 (current):** Observability & scale — eval history with trending, batch eval with concurrency, pipeline per-step eval, Minitest support, structured logging.
|
|
172
172
|
|
|
173
|
-
**v0.
|
|
173
|
+
**v0.3:** Baseline regression detection, migration guide, production hardening.
|
|
174
174
|
|
|
175
|
-
**v0.
|
|
175
|
+
**v0.5:** Prompt A/B testing — `compare_with(OtherStep)` for data-driven prompt engineering with regression safety. Cross-provider comparison docs.
|
|
176
176
|
|
|
177
177
|
## License
|
|
178
178
|
|
|
@@ -7,6 +7,12 @@ module RubyLLM
|
|
|
7
7
|
def call(messages:, **_options)
|
|
8
8
|
raise NotImplementedError, "Subclasses must implement #call"
|
|
9
9
|
end
|
|
10
|
+
|
|
11
|
+
# Override in stateful adapters to provide a fully independent copy
|
|
12
|
+
# for concurrent eval execution. Default: self (stateless adapters).
|
|
13
|
+
def clone_for_concurrency
|
|
14
|
+
self
|
|
15
|
+
end
|
|
10
16
|
end
|
|
11
17
|
end
|
|
12
18
|
end
|
|
@@ -29,6 +29,20 @@ module RubyLLM
|
|
|
29
29
|
|
|
30
30
|
public
|
|
31
31
|
|
|
32
|
+
# Exposes raw responses array for concurrent eval to split per-case
|
|
33
|
+
def responses_array
|
|
34
|
+
@responses
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Returns a fresh adapter with reset index for concurrent execution
|
|
38
|
+
def clone_for_concurrency
|
|
39
|
+
if @responses
|
|
40
|
+
self.class.new(responses: @responses.dup, usage: @usage.dup)
|
|
41
|
+
else
|
|
42
|
+
self.class.new(response: @response, usage: @usage.dup)
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
32
46
|
def call(messages:, **_options) # rubocop:disable Lint/UnusedMethodArgument
|
|
33
47
|
content = if @responses
|
|
34
48
|
c = @responses[@index] || @responses.last
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Concerns
|
|
6
|
+
# Shared helpers for context hash manipulation.
|
|
7
|
+
# Used by EvalHost, Runner, Step::Base.
|
|
8
|
+
module ContextHelpers
|
|
9
|
+
private
|
|
10
|
+
|
|
11
|
+
def safe_context(context)
|
|
12
|
+
(context || {}).transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def isolate_context(context)
|
|
16
|
+
context.transform_values do |v|
|
|
17
|
+
if v.respond_to?(:clone_for_concurrency)
|
|
18
|
+
v.clone_for_concurrency
|
|
19
|
+
elsif v.respond_to?(:dup)
|
|
20
|
+
v.dup
|
|
21
|
+
else
|
|
22
|
+
v
|
|
23
|
+
end
|
|
24
|
+
rescue TypeError
|
|
25
|
+
v
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -4,6 +4,7 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Concerns
|
|
6
6
|
module EvalHost
|
|
7
|
+
include ContextHelpers
|
|
7
8
|
def define_eval(name, &)
|
|
8
9
|
@eval_definitions ||= {}
|
|
9
10
|
@file_sourced_evals ||= Set.new
|
|
@@ -35,20 +36,20 @@ module RubyLLM
|
|
|
35
36
|
!all_eval_definitions.empty?
|
|
36
37
|
end
|
|
37
38
|
|
|
38
|
-
def run_eval(name = nil, context: {})
|
|
39
|
-
context
|
|
39
|
+
def run_eval(name = nil, context: {}, concurrency: nil)
|
|
40
|
+
context = safe_context(context)
|
|
40
41
|
if name
|
|
41
|
-
run_single_eval(name, context)
|
|
42
|
+
run_single_eval(name, context, concurrency: concurrency)
|
|
42
43
|
else
|
|
43
|
-
run_all_own_evals(context)
|
|
44
|
+
run_all_own_evals(context, concurrency: concurrency)
|
|
44
45
|
end
|
|
45
46
|
end
|
|
46
47
|
|
|
47
48
|
def compare_models(eval_name, models:, context: {})
|
|
48
|
-
context
|
|
49
|
+
context = safe_context(context)
|
|
49
50
|
models = models.uniq
|
|
50
51
|
reports = models.each_with_object({}) do |model, hash|
|
|
51
|
-
model_context = deep_dup_context(context).merge(model: model)
|
|
52
|
+
model_context = isolate_context(context).merge(model: model)
|
|
52
53
|
hash[model] = run_single_eval(eval_name, model_context)
|
|
53
54
|
end
|
|
54
55
|
Eval::ModelComparison.new(eval_name: eval_name, reports: reports)
|
|
@@ -66,24 +67,26 @@ module RubyLLM
|
|
|
66
67
|
inherited.merge(own)
|
|
67
68
|
end
|
|
68
69
|
|
|
69
|
-
def run_single_eval(name, context)
|
|
70
|
+
def run_single_eval(name, context, concurrency: nil)
|
|
70
71
|
defn = all_eval_definitions[name.to_s]
|
|
71
72
|
raise ArgumentError, "No eval '#{name}' defined. Available: #{all_eval_definitions.keys}" unless defn
|
|
72
73
|
|
|
73
74
|
effective_context = eval_context(defn, context)
|
|
74
|
-
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context)
|
|
75
|
+
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
|
|
76
|
+
concurrency: concurrency)
|
|
75
77
|
end
|
|
76
78
|
|
|
77
|
-
def run_all_own_evals(context)
|
|
79
|
+
def run_all_own_evals(context, concurrency: nil)
|
|
78
80
|
all_eval_definitions.transform_values do |defn|
|
|
79
|
-
isolated_context = deep_dup_context(context)
|
|
81
|
+
isolated_context = isolate_context(context)
|
|
80
82
|
effective_context = eval_context(defn, isolated_context)
|
|
81
|
-
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context)
|
|
83
|
+
Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context,
|
|
84
|
+
concurrency: concurrency)
|
|
82
85
|
end
|
|
83
86
|
end
|
|
84
87
|
|
|
85
88
|
def eval_context(defn, context)
|
|
86
|
-
context = (context
|
|
89
|
+
context = safe_context(context)
|
|
87
90
|
return context if context[:adapter]
|
|
88
91
|
|
|
89
92
|
sample_adapter = defn.build_adapter
|
|
@@ -105,13 +108,6 @@ module RubyLLM
|
|
|
105
108
|
end
|
|
106
109
|
end
|
|
107
110
|
|
|
108
|
-
def deep_dup_context(context)
|
|
109
|
-
context.transform_values do |v|
|
|
110
|
-
v.respond_to?(:dup) ? v.dup : v
|
|
111
|
-
rescue TypeError
|
|
112
|
-
v
|
|
113
|
-
end
|
|
114
|
-
end
|
|
115
111
|
end
|
|
116
112
|
end
|
|
117
113
|
end
|
|
@@ -10,11 +10,12 @@ module RubyLLM
|
|
|
10
10
|
# Then configure contract-specific options:
|
|
11
11
|
# RubyLLM::Contract.configure { |c| c.default_model = "gpt-4.1-mini" }
|
|
12
12
|
class Configuration
|
|
13
|
-
attr_accessor :default_adapter, :default_model
|
|
13
|
+
attr_accessor :default_adapter, :default_model, :logger
|
|
14
14
|
|
|
15
15
|
def initialize
|
|
16
16
|
@default_adapter = nil
|
|
17
17
|
@default_model = nil
|
|
18
|
+
@logger = nil
|
|
18
19
|
end
|
|
19
20
|
end
|
|
20
21
|
end
|
|
@@ -22,7 +22,7 @@ module RubyLLM
|
|
|
22
22
|
# dataset.case "name", input: {...}, expected: {...}
|
|
23
23
|
# dataset.case "name", input: {...}, expected_traits: {...}
|
|
24
24
|
# dataset.case "name", input: {...}, evaluator: proc
|
|
25
|
-
def add_case(name = nil, input:, expected: nil, expected_traits: nil, evaluator: nil)
|
|
25
|
+
def add_case(name = nil, input:, expected: nil, expected_traits: nil, evaluator: nil, step_expectations: nil)
|
|
26
26
|
case_name = name || "case_#{@cases.length + 1}"
|
|
27
27
|
if @cases.any? { |c| c.name == case_name }
|
|
28
28
|
raise ArgumentError, "Duplicate case name '#{case_name}'. Case names must be unique within a dataset."
|
|
@@ -33,7 +33,8 @@ module RubyLLM
|
|
|
33
33
|
input: input,
|
|
34
34
|
expected: expected,
|
|
35
35
|
expected_traits: expected_traits,
|
|
36
|
-
evaluator: evaluator
|
|
36
|
+
evaluator: evaluator,
|
|
37
|
+
step_expectations: step_expectations
|
|
37
38
|
)
|
|
38
39
|
end
|
|
39
40
|
|
|
@@ -44,14 +45,15 @@ module RubyLLM
|
|
|
44
45
|
class Case
|
|
45
46
|
include Concerns::DeepFreeze
|
|
46
47
|
|
|
47
|
-
attr_reader :name, :input, :expected, :expected_traits, :evaluator
|
|
48
|
+
attr_reader :name, :input, :expected, :expected_traits, :evaluator, :step_expectations
|
|
48
49
|
|
|
49
|
-
def initialize(name:, input:, expected: nil, expected_traits: nil, evaluator: nil)
|
|
50
|
+
def initialize(name:, input:, expected: nil, expected_traits: nil, evaluator: nil, step_expectations: nil)
|
|
50
51
|
@name = name
|
|
51
52
|
@input = deep_dup_freeze(input)
|
|
52
53
|
@expected = deep_dup_freeze(expected)
|
|
53
54
|
@expected_traits = deep_dup_freeze(expected_traits)
|
|
54
55
|
@evaluator = evaluator
|
|
56
|
+
@step_expectations = deep_dup_freeze(step_expectations)
|
|
55
57
|
freeze
|
|
56
58
|
end
|
|
57
59
|
end
|
|
@@ -31,7 +31,7 @@ module RubyLLM
|
|
|
31
31
|
Adapters::Test.new(response: @sample_response)
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
def add_case(description, input: nil, expected: nil, expected_traits: nil, evaluator: nil)
|
|
34
|
+
def add_case(description, input: nil, expected: nil, expected_traits: nil, evaluator: nil, step_expectations: nil)
|
|
35
35
|
case_input = input.nil? ? @default_input : input
|
|
36
36
|
raise ArgumentError, "add_case requires input (set default_input or pass input:)" if case_input.nil?
|
|
37
37
|
validate_unique_case_name!(description)
|
|
@@ -41,7 +41,8 @@ module RubyLLM
|
|
|
41
41
|
input: case_input,
|
|
42
42
|
expected: expected,
|
|
43
43
|
expected_traits: expected_traits,
|
|
44
|
-
evaluator: evaluator
|
|
44
|
+
evaluator: evaluator,
|
|
45
|
+
step_expectations: step_expectations
|
|
45
46
|
}
|
|
46
47
|
end
|
|
47
48
|
|
|
@@ -72,7 +73,8 @@ module RubyLLM
|
|
|
72
73
|
eval_cases.each do |eval_case|
|
|
73
74
|
add_case(eval_case[:name], input: eval_case[:input], expected: eval_case[:expected],
|
|
74
75
|
expected_traits: eval_case[:expected_traits],
|
|
75
|
-
evaluator: eval_case[:evaluator]
|
|
76
|
+
evaluator: eval_case[:evaluator],
|
|
77
|
+
step_expectations: eval_case[:step_expectations])
|
|
76
78
|
end
|
|
77
79
|
end
|
|
78
80
|
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
6
|
+
module RubyLLM
|
|
7
|
+
module Contract
|
|
8
|
+
module Eval
|
|
9
|
+
class EvalHistory
|
|
10
|
+
attr_reader :runs
|
|
11
|
+
|
|
12
|
+
def initialize(runs:)
|
|
13
|
+
@runs = runs.freeze
|
|
14
|
+
freeze
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def self.load(path)
|
|
18
|
+
return new(runs: []) unless File.exist?(path)
|
|
19
|
+
|
|
20
|
+
runs = File.readlines(path).filter_map do |line|
|
|
21
|
+
JSON.parse(line.strip, symbolize_names: true)
|
|
22
|
+
rescue JSON::ParserError
|
|
23
|
+
nil
|
|
24
|
+
end
|
|
25
|
+
new(runs: runs)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def self.append(path, run_data)
|
|
29
|
+
FileUtils.mkdir_p(File.dirname(path))
|
|
30
|
+
File.open(path, "a") { |f| f.puts(run_data.to_json) }
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def score_trend
|
|
34
|
+
return :unknown if runs.length < 2
|
|
35
|
+
|
|
36
|
+
scores = runs.map { |r| r[:score] }
|
|
37
|
+
recent = scores.last(3)
|
|
38
|
+
if recent.all? { |s| s >= scores.first }
|
|
39
|
+
:stable_or_improving
|
|
40
|
+
elsif recent.last < scores.max * 0.9
|
|
41
|
+
:declining
|
|
42
|
+
else
|
|
43
|
+
:stable_or_improving
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def drift?(threshold: 0.1)
|
|
48
|
+
return false if runs.length < 2
|
|
49
|
+
|
|
50
|
+
baseline_score = runs.first[:score]
|
|
51
|
+
current_score = runs.last[:score]
|
|
52
|
+
(baseline_score - current_score) > threshold
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def scores
|
|
56
|
+
runs.map { |r| r[:score] }
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def dates
|
|
60
|
+
runs.map { |r| r[:date] }
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def latest
|
|
64
|
+
runs.last
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def to_s
|
|
68
|
+
return "No history" if runs.empty?
|
|
69
|
+
|
|
70
|
+
lines = ["#{runs.length} runs"]
|
|
71
|
+
runs.last(5).each do |r|
|
|
72
|
+
lines << " #{r[:date]} score=#{r[:score].round(2)} cost=$#{format("%.6f", r[:total_cost] || r[:cost] || 0)}"
|
|
73
|
+
end
|
|
74
|
+
lines.join("\n")
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
@@ -82,6 +82,24 @@ module RubyLLM
|
|
|
82
82
|
lines.join("\n")
|
|
83
83
|
end
|
|
84
84
|
|
|
85
|
+
def save_history!(path: nil, model: nil)
|
|
86
|
+
file = path || default_history_path(model: model)
|
|
87
|
+
run_data = {
|
|
88
|
+
date: Time.now.strftime("%Y-%m-%d"),
|
|
89
|
+
score: score,
|
|
90
|
+
total_cost: total_cost,
|
|
91
|
+
pass_rate: pass_rate,
|
|
92
|
+
cases_count: evaluated_results.length
|
|
93
|
+
}
|
|
94
|
+
EvalHistory.append(file, run_data)
|
|
95
|
+
file
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def eval_history(path: nil, model: nil)
|
|
99
|
+
file = path || default_history_path(model: model)
|
|
100
|
+
EvalHistory.load(file)
|
|
101
|
+
end
|
|
102
|
+
|
|
85
103
|
def save_baseline!(path: nil, model: nil)
|
|
86
104
|
file = path || default_baseline_path(model: model)
|
|
87
105
|
FileUtils.mkdir_p(File.dirname(file))
|
|
@@ -133,6 +151,15 @@ module RubyLLM
|
|
|
133
151
|
results.reject { |r| r.step_status == :skipped }
|
|
134
152
|
end
|
|
135
153
|
|
|
154
|
+
def default_history_path(model: nil)
|
|
155
|
+
parts = [".eval_history"]
|
|
156
|
+
parts << sanitize_name(@step_name) if @step_name
|
|
157
|
+
name = sanitize_name(dataset_name)
|
|
158
|
+
name = "#{name}_#{sanitize_name(model)}" if model
|
|
159
|
+
parts << "#{name}.jsonl"
|
|
160
|
+
File.join(*parts)
|
|
161
|
+
end
|
|
162
|
+
|
|
136
163
|
def default_baseline_path(model: nil)
|
|
137
164
|
parts = [".eval_baselines"]
|
|
138
165
|
parts << sanitize_name(@step_name) if @step_name
|
|
@@ -6,31 +6,100 @@ module RubyLLM
|
|
|
6
6
|
class Runner
|
|
7
7
|
include TraitEvaluator
|
|
8
8
|
include ContractDetailBuilder
|
|
9
|
+
include Concerns::ContextHelpers
|
|
9
10
|
|
|
10
|
-
def self.run(step:, dataset:, context: {})
|
|
11
|
-
new(step: step, dataset: dataset, context: context).run
|
|
11
|
+
def self.run(step:, dataset:, context: {}, concurrency: nil)
|
|
12
|
+
new(step: step, dataset: dataset, context: context, concurrency: concurrency).run
|
|
12
13
|
end
|
|
13
14
|
|
|
14
|
-
def initialize(step:, dataset:, context: {})
|
|
15
|
+
def initialize(step:, dataset:, context: {}, concurrency: nil)
|
|
15
16
|
@step = step
|
|
16
17
|
@dataset = dataset
|
|
17
18
|
@context = context
|
|
19
|
+
@concurrency = concurrency
|
|
18
20
|
end
|
|
19
21
|
|
|
20
22
|
def run
|
|
21
|
-
results = @dataset.cases.map { |test_case| evaluate_case(test_case) }
|
|
23
|
+
results = if @concurrency && @concurrency > 1
|
|
24
|
+
run_concurrent
|
|
25
|
+
else
|
|
26
|
+
@dataset.cases.map { |test_case| evaluate_case(test_case) }
|
|
27
|
+
end
|
|
22
28
|
step_name = @step.respond_to?(:name) ? @step.name : @step.to_s
|
|
23
29
|
Report.new(dataset_name: @dataset.name, results: results, step_name: step_name)
|
|
24
30
|
end
|
|
25
31
|
|
|
26
32
|
private
|
|
27
33
|
|
|
34
|
+
def run_concurrent
|
|
35
|
+
require "concurrent"
|
|
36
|
+
pool = Concurrent::FixedThreadPool.new(@concurrency)
|
|
37
|
+
|
|
38
|
+
# Pre-build per-case contexts: if adapter has responses:, each case
|
|
39
|
+
# gets a single-response adapter with its own response (by index).
|
|
40
|
+
per_case_contexts = build_per_case_contexts
|
|
41
|
+
|
|
42
|
+
futures = @dataset.cases.each_with_index.map do |test_case, i|
|
|
43
|
+
ctx = per_case_contexts[i]
|
|
44
|
+
Concurrent::Future.execute(executor: pool) do
|
|
45
|
+
evaluate_case_with_context(test_case, ctx)
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
futures.map(&:value!)
|
|
49
|
+
ensure
|
|
50
|
+
pool&.shutdown
|
|
51
|
+
pool&.wait_for_termination(5)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def build_per_case_contexts
|
|
55
|
+
adapter = @context[:adapter]
|
|
56
|
+
responses = adapter.respond_to?(:responses_array) ? adapter.responses_array : nil
|
|
57
|
+
|
|
58
|
+
@dataset.cases.each_with_index.map do |_, i|
|
|
59
|
+
if responses
|
|
60
|
+
# Give each case its own single-response adapter
|
|
61
|
+
response = responses[i] || responses.last
|
|
62
|
+
per_case_adapter = Adapters::Test.new(response: response)
|
|
63
|
+
@context.merge(adapter: per_case_adapter)
|
|
64
|
+
else
|
|
65
|
+
isolate_context(@context)
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def evaluate_case_with_context(test_case, context)
|
|
71
|
+
run_result = @step.run(test_case.input, context: context)
|
|
72
|
+
step_result = normalize_result(run_result)
|
|
73
|
+
eval_result = dispatch_evaluation(step_result, test_case)
|
|
74
|
+
|
|
75
|
+
result = build_case_result(test_case, step_result, eval_result)
|
|
76
|
+
|
|
77
|
+
if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
|
|
78
|
+
run_result.respond_to?(:outputs_by_step)
|
|
79
|
+
evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
|
|
80
|
+
else
|
|
81
|
+
result
|
|
82
|
+
end
|
|
83
|
+
rescue RubyLLM::Contract::Error => e
|
|
84
|
+
raise unless e.message.include?("No adapter configured")
|
|
85
|
+
|
|
86
|
+
skipped_result(test_case, e.message)
|
|
87
|
+
end
|
|
88
|
+
|
|
28
89
|
def evaluate_case(test_case)
|
|
29
90
|
run_result = @step.run(test_case.input, context: @context)
|
|
30
91
|
step_result = normalize_result(run_result)
|
|
31
92
|
eval_result = dispatch_evaluation(step_result, test_case)
|
|
32
93
|
|
|
33
|
-
build_case_result(test_case, step_result, eval_result)
|
|
94
|
+
result = build_case_result(test_case, step_result, eval_result)
|
|
95
|
+
|
|
96
|
+
# Pipeline per-step evaluation
|
|
97
|
+
if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
|
|
98
|
+
run_result.respond_to?(:outputs_by_step)
|
|
99
|
+
evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
|
|
100
|
+
else
|
|
101
|
+
result
|
|
102
|
+
end
|
|
34
103
|
rescue RubyLLM::Contract::Error => e
|
|
35
104
|
raise unless e.message.include?("No adapter configured")
|
|
36
105
|
|
|
@@ -145,6 +214,38 @@ module RubyLLM
|
|
|
145
214
|
)
|
|
146
215
|
end
|
|
147
216
|
|
|
217
|
+
def evaluate_step_expectations(result, outputs_by_step, expectations)
|
|
218
|
+
step_results = {}
|
|
219
|
+
all_passed = true
|
|
220
|
+
|
|
221
|
+
expectations.each do |step_alias, expected|
|
|
222
|
+
output = outputs_by_step[step_alias]
|
|
223
|
+
if output.nil?
|
|
224
|
+
step_results[step_alias] = { passed: false, details: "step not executed" }
|
|
225
|
+
all_passed = false
|
|
226
|
+
else
|
|
227
|
+
eval_res = dispatch_expected_evaluator(output: output, expected: expected, input: nil)
|
|
228
|
+
step_results[step_alias] = { passed: eval_res.passed, score: eval_res.score, details: eval_res.details }
|
|
229
|
+
all_passed = false unless eval_res.passed
|
|
230
|
+
end
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
# Rebuild CaseResult with step_results metadata
|
|
234
|
+
failed_steps = step_results.select { |_, v| !v[:passed] }
|
|
235
|
+
failure_details = failed_steps.map { |k, v| "#{k}: #{v[:details]}" }.join("; ")
|
|
236
|
+
|
|
237
|
+
CaseResult.new(
|
|
238
|
+
name: result.name, input: result.input, output: result.output,
|
|
239
|
+
expected: result.expected,
|
|
240
|
+
step_status: all_passed ? result.step_status : :step_expectation_failed,
|
|
241
|
+
score: all_passed ? result.score : 0.0,
|
|
242
|
+
passed: result.passed? && all_passed,
|
|
243
|
+
label: all_passed ? result.label : "FAIL",
|
|
244
|
+
details: all_passed ? result.details : "step expectations failed: #{failure_details}",
|
|
245
|
+
duration_ms: result.duration_ms, cost: result.cost
|
|
246
|
+
)
|
|
247
|
+
end
|
|
248
|
+
|
|
148
249
|
def skipped_result(test_case, reason)
|
|
149
250
|
CaseResult.new(
|
|
150
251
|
name: test_case.name,
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ruby_llm/contract"
|
|
4
|
+
|
|
5
|
+
module RubyLLM
|
|
6
|
+
module Contract
|
|
7
|
+
module MinitestHelpers
|
|
8
|
+
def assert_satisfies_contract(result, msg = nil)
|
|
9
|
+
assert result.ok?, msg || "Expected step result to satisfy contract, " \
|
|
10
|
+
"but got status: #{result.status}. Errors: #{result.validation_errors.join(", ")}"
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def refute_satisfies_contract(result, msg = nil)
|
|
14
|
+
refute result.ok?, msg || "Expected step result NOT to satisfy contract, but it passed"
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def assert_eval_passes(step, eval_name, minimum_score: nil, maximum_cost: nil, context: {}, msg: nil)
|
|
18
|
+
report = step.run_eval(eval_name, context: context)
|
|
19
|
+
|
|
20
|
+
if minimum_score
|
|
21
|
+
assert report.score >= minimum_score,
|
|
22
|
+
msg || "Expected #{eval_name} eval score >= #{minimum_score}, got #{report.score.round(2)} (#{report.pass_rate})"
|
|
23
|
+
else
|
|
24
|
+
assert report.passed?,
|
|
25
|
+
msg || "Expected #{eval_name} eval to pass, got #{report.score.round(2)} (#{report.pass_rate})"
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
if maximum_cost
|
|
29
|
+
assert report.total_cost <= maximum_cost,
|
|
30
|
+
msg || "Expected #{eval_name} eval cost <= $#{format("%.4f", maximum_cost)}, got $#{format("%.4f", report.total_cost)}"
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
report
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def stub_step(step_class, response: nil, responses: nil)
|
|
37
|
+
adapter = if responses
|
|
38
|
+
Adapters::Test.new(responses: responses)
|
|
39
|
+
else
|
|
40
|
+
Adapters::Test.new(response: response)
|
|
41
|
+
end
|
|
42
|
+
RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -3,15 +3,21 @@
|
|
|
3
3
|
module RubyLLM
|
|
4
4
|
module Contract
|
|
5
5
|
class Railtie < ::Rails::Railtie
|
|
6
|
-
#
|
|
7
|
-
# constants
|
|
8
|
-
|
|
6
|
+
# Ignore eval/ subdirs BEFORE Zeitwerk setup — eval files don't define
|
|
7
|
+
# constants, they call define_eval on existing Step classes.
|
|
8
|
+
initializer "ruby_llm_contract.ignore_eval_dirs", before: :set_autoload_paths do |app|
|
|
9
|
+
%w[app/contracts/eval app/steps/eval].each do |path|
|
|
10
|
+
full = app.root.join(path)
|
|
11
|
+
next unless full.exist?
|
|
12
|
+
|
|
13
|
+
Rails.autoloaders.each { |loader| loader.ignore(full.to_s) }
|
|
14
|
+
end
|
|
15
|
+
end
|
|
9
16
|
|
|
10
17
|
config.after_initialize do
|
|
11
18
|
RubyLLM::Contract.load_evals!
|
|
12
19
|
end
|
|
13
20
|
|
|
14
|
-
# Re-load eval files on code reload in development (Spring, zeitwerk:check, etc.)
|
|
15
21
|
config.to_prepare do
|
|
16
22
|
RubyLLM::Contract.load_evals!
|
|
17
23
|
end
|
|
@@ -60,8 +60,10 @@ module RubyLLM
|
|
|
60
60
|
|
|
61
61
|
KNOWN_CONTEXT_KEYS = %i[adapter model temperature max_tokens provider assume_model_exists].freeze
|
|
62
62
|
|
|
63
|
+
include Concerns::ContextHelpers
|
|
64
|
+
|
|
63
65
|
def run(input, context: {})
|
|
64
|
-
context = (context
|
|
66
|
+
context = safe_context(context)
|
|
65
67
|
warn_unknown_context_keys(context)
|
|
66
68
|
adapter = resolve_adapter(context)
|
|
67
69
|
default_model = context[:model] || model || RubyLLM::Contract.configuration.default_model
|
|
@@ -77,6 +79,7 @@ module RubyLLM
|
|
|
77
79
|
context_temperature: ctx_temp, extra_options: extra)
|
|
78
80
|
end
|
|
79
81
|
|
|
82
|
+
log_result(result)
|
|
80
83
|
invoke_around_call(input, result)
|
|
81
84
|
end
|
|
82
85
|
|
|
@@ -121,6 +124,19 @@ module RubyLLM
|
|
|
121
124
|
validation_errors: [e.message])
|
|
122
125
|
end
|
|
123
126
|
|
|
127
|
+
def log_result(result)
|
|
128
|
+
logger = RubyLLM::Contract.configuration.logger
|
|
129
|
+
return unless logger
|
|
130
|
+
|
|
131
|
+
trace = result.trace
|
|
132
|
+
msg = "[ruby_llm-contract] #{name || self} " \
|
|
133
|
+
"model=#{trace.model} status=#{result.status} " \
|
|
134
|
+
"latency=#{trace.latency_ms}ms " \
|
|
135
|
+
"tokens=#{trace.usage&.dig(:input_tokens) || 0}+#{trace.usage&.dig(:output_tokens) || 0} " \
|
|
136
|
+
"cost=$#{format("%.6f", trace.cost || 0)}"
|
|
137
|
+
logger.info(msg)
|
|
138
|
+
end
|
|
139
|
+
|
|
124
140
|
def invoke_around_call(input, result)
|
|
125
141
|
return result unless around_call
|
|
126
142
|
|
data/lib/ruby_llm/contract.rb
CHANGED
|
@@ -88,10 +88,7 @@ module RubyLLM
|
|
|
88
88
|
full = ::Rails.root.join(path)
|
|
89
89
|
next unless full.exist?
|
|
90
90
|
|
|
91
|
-
#
|
|
92
|
-
# constants and are loaded separately by load_evals!
|
|
93
|
-
eval_dir = full.join("eval")
|
|
94
|
-
::Rails.autoloaders.main.ignore(eval_dir.to_s) if eval_dir.exist?
|
|
91
|
+
# eval/ subdirs already ignored by Railtie initializer (before Zeitwerk setup)
|
|
95
92
|
::Rails.autoloaders.main.eager_load_dir(full.to_s)
|
|
96
93
|
rescue StandardError
|
|
97
94
|
nil
|
|
@@ -108,6 +105,7 @@ module RubyLLM
|
|
|
108
105
|
end
|
|
109
106
|
end
|
|
110
107
|
|
|
108
|
+
require_relative "contract/concerns/context_helpers"
|
|
111
109
|
require_relative "contract/concerns/deep_freeze"
|
|
112
110
|
require_relative "contract/concerns/deep_symbolize"
|
|
113
111
|
require_relative "contract/concerns/eval_host"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-contract
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.7
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Justyna
|
|
@@ -83,6 +83,7 @@ files:
|
|
|
83
83
|
- lib/ruby_llm/contract/adapters/response.rb
|
|
84
84
|
- lib/ruby_llm/contract/adapters/ruby_llm.rb
|
|
85
85
|
- lib/ruby_llm/contract/adapters/test.rb
|
|
86
|
+
- lib/ruby_llm/contract/concerns/context_helpers.rb
|
|
86
87
|
- lib/ruby_llm/contract/concerns/deep_freeze.rb
|
|
87
88
|
- lib/ruby_llm/contract/concerns/deep_symbolize.rb
|
|
88
89
|
- lib/ruby_llm/contract/concerns/eval_host.rb
|
|
@@ -104,6 +105,7 @@ files:
|
|
|
104
105
|
- lib/ruby_llm/contract/eval/contract_detail_builder.rb
|
|
105
106
|
- lib/ruby_llm/contract/eval/dataset.rb
|
|
106
107
|
- lib/ruby_llm/contract/eval/eval_definition.rb
|
|
108
|
+
- lib/ruby_llm/contract/eval/eval_history.rb
|
|
107
109
|
- lib/ruby_llm/contract/eval/evaluation_result.rb
|
|
108
110
|
- lib/ruby_llm/contract/eval/evaluator/exact.rb
|
|
109
111
|
- lib/ruby_llm/contract/eval/evaluator/json_includes.rb
|
|
@@ -114,6 +116,7 @@ files:
|
|
|
114
116
|
- lib/ruby_llm/contract/eval/report.rb
|
|
115
117
|
- lib/ruby_llm/contract/eval/runner.rb
|
|
116
118
|
- lib/ruby_llm/contract/eval/trait_evaluator.rb
|
|
119
|
+
- lib/ruby_llm/contract/minitest.rb
|
|
117
120
|
- lib/ruby_llm/contract/pipeline.rb
|
|
118
121
|
- lib/ruby_llm/contract/pipeline/base.rb
|
|
119
122
|
- lib/ruby_llm/contract/pipeline/result.rb
|