ruby_llm-contract 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +55 -0
- data/CHANGELOG.md +76 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +176 -0
- data/LICENSE +21 -0
- data/README.md +154 -0
- data/Rakefile +8 -0
- data/examples/00_basics.rb +500 -0
- data/examples/01_classify_threads.rb +220 -0
- data/examples/02_generate_comment.rb +203 -0
- data/examples/03_target_audience.rb +201 -0
- data/examples/04_real_llm.rb +410 -0
- data/examples/05_output_schema.rb +258 -0
- data/examples/07_keyword_extraction.rb +239 -0
- data/examples/08_translation.rb +353 -0
- data/examples/09_eval_dataset.rb +287 -0
- data/examples/10_reddit_full_showcase.rb +363 -0
- data/examples/README.md +140 -0
- data/lib/ruby_llm/contract/adapters/base.rb +13 -0
- data/lib/ruby_llm/contract/adapters/response.rb +17 -0
- data/lib/ruby_llm/contract/adapters/ruby_llm.rb +94 -0
- data/lib/ruby_llm/contract/adapters/test.rb +44 -0
- data/lib/ruby_llm/contract/adapters.rb +6 -0
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +17 -0
- data/lib/ruby_llm/contract/concerns/eval_host.rb +109 -0
- data/lib/ruby_llm/contract/concerns/trace_equality.rb +15 -0
- data/lib/ruby_llm/contract/concerns/usage_aggregator.rb +43 -0
- data/lib/ruby_llm/contract/configuration.rb +21 -0
- data/lib/ruby_llm/contract/contract/definition.rb +39 -0
- data/lib/ruby_llm/contract/contract/invariant.rb +23 -0
- data/lib/ruby_llm/contract/contract/parser.rb +143 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +239 -0
- data/lib/ruby_llm/contract/contract/validator.rb +104 -0
- data/lib/ruby_llm/contract/contract.rb +7 -0
- data/lib/ruby_llm/contract/cost_calculator.rb +38 -0
- data/lib/ruby_llm/contract/dsl.rb +13 -0
- data/lib/ruby_llm/contract/errors.rb +19 -0
- data/lib/ruby_llm/contract/eval/case_result.rb +76 -0
- data/lib/ruby_llm/contract/eval/contract_detail_builder.rb +47 -0
- data/lib/ruby_llm/contract/eval/dataset.rb +53 -0
- data/lib/ruby_llm/contract/eval/eval_definition.rb +112 -0
- data/lib/ruby_llm/contract/eval/evaluation_result.rb +27 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +20 -0
- data/lib/ruby_llm/contract/eval/evaluator/json_includes.rb +58 -0
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +40 -0
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +27 -0
- data/lib/ruby_llm/contract/eval/model_comparison.rb +80 -0
- data/lib/ruby_llm/contract/eval/pipeline_result_adapter.rb +15 -0
- data/lib/ruby_llm/contract/eval/report.rb +115 -0
- data/lib/ruby_llm/contract/eval/runner.rb +162 -0
- data/lib/ruby_llm/contract/eval/trait_evaluator.rb +75 -0
- data/lib/ruby_llm/contract/eval.rb +16 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +62 -0
- data/lib/ruby_llm/contract/pipeline/result.rb +131 -0
- data/lib/ruby_llm/contract/pipeline/runner.rb +139 -0
- data/lib/ruby_llm/contract/pipeline/trace.rb +72 -0
- data/lib/ruby_llm/contract/pipeline.rb +6 -0
- data/lib/ruby_llm/contract/prompt/ast.rb +38 -0
- data/lib/ruby_llm/contract/prompt/builder.rb +47 -0
- data/lib/ruby_llm/contract/prompt/node.rb +25 -0
- data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +27 -0
- data/lib/ruby_llm/contract/prompt/nodes/rule_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/section_node.rb +26 -0
- data/lib/ruby_llm/contract/prompt/nodes/system_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/user_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes.rb +7 -0
- data/lib/ruby_llm/contract/prompt/renderer.rb +76 -0
- data/lib/ruby_llm/contract/railtie.rb +20 -0
- data/lib/ruby_llm/contract/rake_task.rb +78 -0
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +96 -0
- data/lib/ruby_llm/contract/rspec/satisfy_contract.rb +31 -0
- data/lib/ruby_llm/contract/rspec.rb +6 -0
- data/lib/ruby_llm/contract/step/base.rb +138 -0
- data/lib/ruby_llm/contract/step/dsl.rb +144 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +64 -0
- data/lib/ruby_llm/contract/step/result.rb +38 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +90 -0
- data/lib/ruby_llm/contract/step/retry_policy.rb +76 -0
- data/lib/ruby_llm/contract/step/runner.rb +126 -0
- data/lib/ruby_llm/contract/step/trace.rb +70 -0
- data/lib/ruby_llm/contract/step.rb +10 -0
- data/lib/ruby_llm/contract/token_estimator.rb +19 -0
- data/lib/ruby_llm/contract/types.rb +11 -0
- data/lib/ruby_llm/contract/version.rb +7 -0
- data/lib/ruby_llm/contract.rb +108 -0
- data/ruby_llm-contract.gemspec +33 -0
- metadata +172 -0
data/examples/README.md
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# Examples
|
|
2
|
+
|
|
3
|
+
## 00_basics.rb — From zero to ruby_llm-contract
|
|
4
|
+
|
|
5
|
+
Step-by-step tutorial covering every feature. Start here.
|
|
6
|
+
|
|
7
|
+
| Step | Feature | What it shows |
|
|
8
|
+
|------|---------|---------------|
|
|
9
|
+
| 1 | Plain string prompt | Simplest case — `user "{input}"` and nothing else |
|
|
10
|
+
| 2 | System + user | Separate instructions from data |
|
|
11
|
+
| 3 | Rules + output_schema | Requirements as statements + declarative output structure |
|
|
12
|
+
| 4 | Invariants | Custom business logic on top of schema |
|
|
13
|
+
| 5 | Examples | Few-shot (example input/output pairs) |
|
|
14
|
+
| 6 | Sections | Labeled context blocks (heredoc replacement, with before/after) |
|
|
15
|
+
| 7 | Hash input | Multiple fields with auto-interpolation |
|
|
16
|
+
| 8 | 2-arity invariants | Cross-validate output against input |
|
|
17
|
+
| 9 | Context override | Per-run adapter and model switching |
|
|
18
|
+
| 10 | StepResult | Full inspection: status, output, errors, trace |
|
|
19
|
+
| 11 | Pipeline | Chain steps with fail-fast data threading |
|
|
20
|
+
|
|
21
|
+
Every step has a corresponding test in `spec/integration/examples_00_basics_spec.rb`.
|
|
22
|
+
|
|
23
|
+
## 01_classify_threads.rb — Thread classification
|
|
24
|
+
|
|
25
|
+
Real-world before/after: classify Reddit threads as PROMO/FILLER/SKIP.
|
|
26
|
+
Shows ID matching, enum validation, score consistency invariants.
|
|
27
|
+
|
|
28
|
+
## 02_generate_comment.rb — Comment generation
|
|
29
|
+
|
|
30
|
+
Real-world before/after: generate Reddit comments with persona.
|
|
31
|
+
Shows sections, banned openings, link presence, length constraints, 2-arity invariants.
|
|
32
|
+
|
|
33
|
+
## 03_target_audience.rb — Audience profiling
|
|
34
|
+
|
|
35
|
+
Real-world before/after: generate target audience profiles.
|
|
36
|
+
Shows cascade failure prevention, locale validation, structural invariants.
|
|
37
|
+
|
|
38
|
+
## 04_real_llm.rb — Real LLM calls via ruby_llm
|
|
39
|
+
|
|
40
|
+
Connect to real LLM providers (OpenAI, Anthropic, Google, etc.) using Adapters::RubyLLM.
|
|
41
|
+
Shows configuration, model switching, temperature/max_tokens control, provider-agnostic steps.
|
|
42
|
+
|
|
43
|
+
| Step | Feature | What it shows |
|
|
44
|
+
|------|---------|---------------|
|
|
45
|
+
| 1 | Configure ruby_llm | Set API keys for your provider |
|
|
46
|
+
| 2 | Set RubyLLM adapter | Swap Test adapter for production |
|
|
47
|
+
| 3 | Define a step | Identical to Test adapter — provider-agnostic |
|
|
48
|
+
| 4 | Run with real LLM | Real call, real tokens, full contract enforcement |
|
|
49
|
+
| 5 | Compare models | A/B test different models per call |
|
|
50
|
+
| 6 | Generation params | Temperature, max_tokens forwarding |
|
|
51
|
+
| 7 | Switch providers | Same step, different provider — just change model name |
|
|
52
|
+
| 8 | Error handling | Contract enforcement with real LLM responses |
|
|
53
|
+
| 9 | Full power | Every feature combined in AnalyzeTicket |
|
|
54
|
+
| 10 | Pipeline | Chain steps with real LLM calls |
|
|
55
|
+
|
|
56
|
+
**Requires:** `export OPENAI_API_KEY=sk-...` (or another provider key)
|
|
57
|
+
|
|
58
|
+
## 05_output_schema.rb — Declarative output schema
|
|
59
|
+
|
|
60
|
+
Replace manual invariants with a schema DSL (ruby_llm-schema).
|
|
61
|
+
|
|
62
|
+
| Step | Feature | What it shows |
|
|
63
|
+
|------|---------|---------------|
|
|
64
|
+
| 1 | Before (invariants) | Manual enum, range, required checks |
|
|
65
|
+
| 2 | After (schema) | Same constraints in declarative DSL |
|
|
66
|
+
| 3 | Schema + invariants | Schema for structure, invariants for business logic |
|
|
67
|
+
| 4 | Complex schema | Nested objects, arrays, constraints |
|
|
68
|
+
| 5 | Provider-agnostic | Same schema works with Test and RubyLLM adapters |
|
|
69
|
+
| 6 | Pipeline + schemas | Fully typed multi-step composition |
|
|
70
|
+
|
|
71
|
+
## Running
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Test adapter — no API keys needed:
|
|
75
|
+
ruby examples/00_basics.rb
|
|
76
|
+
ruby examples/01_classify_threads.rb
|
|
77
|
+
ruby examples/02_generate_comment.rb
|
|
78
|
+
ruby examples/03_target_audience.rb
|
|
79
|
+
ruby examples/05_output_schema.rb
|
|
80
|
+
|
|
81
|
+
# Real LLM — requires API key:
|
|
82
|
+
ruby examples/04_real_llm.rb
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## 06_reddit_promo.rb — Real-world Reddit promo pipeline
|
|
86
|
+
|
|
87
|
+
3-step pipeline from the reddit_promo_planner case study:
|
|
88
|
+
|
|
89
|
+
| Step | Role | Invariants catch |
|
|
90
|
+
|------|------|------------------|
|
|
91
|
+
| 1 | TargetAudience | `locale: "USA"` instead of `"en"`, vague summary |
|
|
92
|
+
| 2 | ClassifyThreads | PROMO with score 2, SKIP with score 8 |
|
|
93
|
+
| 3 | GenerateComment | `{PRODUCT}` instead of URL, banned openings |
|
|
94
|
+
|
|
95
|
+
Runs with the test adapter by default. Set `REAL_LLM=1` to run against Ollama, and `MODEL=gemma:latest` to pick the model.
|
|
96
|
+
|
|
97
|
+
## 07_keyword_extraction.rb — Keyword extraction with probability
|
|
98
|
+
|
|
99
|
+
Extract up to 15 keywords from an article, each with relevance probability.
|
|
100
|
+
|
|
101
|
+
| Feature | What it shows |
|
|
102
|
+
|---------|---------------|
|
|
103
|
+
| Array schema | `min_items: 1, max_items: 15` with nested objects |
|
|
104
|
+
| Number range | `probability: 0.0–1.0` |
|
|
105
|
+
| Sorting invariant | Schema can't express "sorted descending" |
|
|
106
|
+
| Uniqueness invariant | Schema can't express "no duplicates" |
|
|
107
|
+
| Cross-validation | Keywords must appear in source text (catches hallucination) |
|
|
108
|
+
| Pipeline | Keywords → Related Topics |
|
|
109
|
+
|
|
110
|
+
## 08_translation.rb — Translation pipeline with quality review
|
|
111
|
+
|
|
112
|
+
3-step pipeline: extract segments → translate → review quality.
|
|
113
|
+
|
|
114
|
+
| Step | LLM Skill | Invariants catch |
|
|
115
|
+
|------|-----------|------------------|
|
|
116
|
+
| Extract | Analysis | Duplicate keys, wrong target_lang |
|
|
117
|
+
| Translate | Creative | Missing segments, too long, echoed back untranslated |
|
|
118
|
+
| Review | Evaluation | Inconsistent counts, failed reviews without issues |
|
|
119
|
+
|
|
120
|
+
## Running
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
# Test adapter — no API keys needed:
|
|
124
|
+
ruby examples/00_basics.rb
|
|
125
|
+
ruby examples/01_classify_threads.rb
|
|
126
|
+
ruby examples/02_generate_comment.rb
|
|
127
|
+
ruby examples/03_target_audience.rb
|
|
128
|
+
ruby examples/05_output_schema.rb
|
|
129
|
+
ruby examples/06_reddit_promo.rb
|
|
130
|
+
ruby examples/07_keyword_extraction.rb
|
|
131
|
+
ruby examples/08_translation.rb
|
|
132
|
+
|
|
133
|
+
# Real LLM — requires Ollama or API key:
|
|
134
|
+
ruby examples/04_real_llm.rb
|
|
135
|
+
REAL_LLM=1 ruby examples/06_reddit_promo.rb
|
|
136
|
+
REAL_LLM=1 MODEL=llama3.2:3b ruby examples/06_reddit_promo.rb
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Examples 00–03 and 05–08 use the test adapter by default — no API keys needed.
|
|
140
|
+
Example 04, and example 06 when run with `REAL_LLM=1`, require Ollama or an API key.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Adapters
|
|
6
|
+
# Immutable value object describing a single adapter reply.
#
# content - the payload returned by the adapter, stored exactly as given.
# usage   - token-usage Hash, stored exactly as given (defaults to an empty Hash).
class Response
  attr_reader :content
  attr_reader :usage

  def initialize(content:, usage: {})
    @content, @usage = content, usage
    # Shallow freeze: the instance variables can no longer be reassigned.
    freeze
  end
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ruby_llm"
|
|
4
|
+
|
|
5
|
+
module RubyLLM
|
|
6
|
+
module Contract
|
|
7
|
+
module Adapters
|
|
8
|
+
# Adapter that runs a prompt through the ruby_llm gem's chat API and wraps
# the reply in a Response.
class RubyLLM < Base
  # Maps option keys to the chat method that applies them; each listed
  # method is called with the option's value as its single argument.
  CHAT_OPTION_METHODS = {
    temperature: :with_temperature,
    schema: :with_schema
  }.freeze

  # Sends the messages to the model and returns a Response.
  #
  # messages - Enumerable of Hashes with :role and :content keys. Entries
  #            whose role is :system become the chat instructions; all other
  #            entries form the conversation, the last one being the message
  #            that is actually asked.
  # options  - Recognised keys: :model, :provider, :assume_model_exists,
  #            :temperature, :schema, :max_tokens.
  def call(messages:, **options)
    system_contents, conversation = partition_messages(messages)
    conversation = fallback_conversation(system_contents, conversation)

    chat = build_chat(options, system_contents)
    # Everything except the final message is replayed as history.
    add_history(chat, conversation[0..-2])

    response = chat.ask(conversation.last&.fetch(:content, ""))
    build_response(response)
  end

  private

  # When the prompt has only system/section/rule nodes and no user message,
  # pop the last system message and use it as the user ask.
  # Note: this removes the popped entry from +system_contents+ in place, so
  # it is not also sent as an instruction.
  def fallback_conversation(system_contents, conversation)
    return conversation unless conversation.empty?

    content = system_contents.any? ? system_contents.pop : ""
    [{ role: :user, content: content }]
  end

  # Creates the chat object, installs the joined system contents as
  # instructions (when there are any) and applies the per-call options.
  def build_chat(options, system_contents)
    chat = ::RubyLLM.chat(**chat_constructor_options(options))
    chat.with_instructions(system_contents.join("\n\n")) if system_contents.any?
    apply_chat_options(chat, options)
    chat
  end

  # Keyword arguments for ::RubyLLM.chat. :model is always passed (possibly
  # nil); :provider and :assume_model_exists only when truthy.
  def chat_constructor_options(options)
    opts = { model: options[:model] }
    opts[:provider] = options[:provider] if options[:provider]
    opts[:assume_model_exists] = options[:assume_model_exists] if options[:assume_model_exists]
    opts
  end

  # Applies every truthy option listed in CHAT_OPTION_METHODS, then forwards
  # :max_tokens through with_params when given.
  def apply_chat_options(chat, options)
    CHAT_OPTION_METHODS.each do |key, method_name|
      chat.public_send(method_name, options[key]) if options[key]
    end
    chat.with_params(max_tokens: options[:max_tokens]) if options[:max_tokens]
  end

  # Wraps the provider reply in a Response.
  #
  # Hash and Array content is passed through untouched, matching what the
  # Test adapter does for canned responses. Array#to_s would produce Ruby
  # inspect output, which is not a faithful serialisation of the data.
  # Any other content is coerced to a String. Missing token counts default
  # to 0.
  def build_response(response)
    content = response.content
    content = content.to_s unless content.is_a?(Hash) || content.is_a?(Array)

    Response.new(
      content: content,
      usage: {
        input_tokens: response.input_tokens || 0,
        output_tokens: response.output_tokens || 0
      }
    )
  end

  # Splits messages into [system contents, remaining messages], preserving
  # the original order within each group.
  def partition_messages(messages)
    system_messages, conversation = messages.partition { |msg| msg[:role] == :system }
    [system_messages.map { |msg| msg[:content] }, conversation]
  end

  # Replays earlier conversation messages onto the chat; nil is tolerated.
  def add_history(chat, messages)
    messages&.each do |msg|
      chat.add_message(role: msg[:role], content: msg[:content])
    end
  end
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Adapters
|
|
6
|
+
# Adapter that returns canned content instead of contacting a provider.
#
# Construct it with either:
#   response:  - one value, returned on every call; or
#   responses: - a non-empty list, returned in order; once the list is
#                exhausted the last entry keeps being returned.
#
# Hash and Array values are kept as they are, nil becomes "", and anything
# else is converted with to_s. Reported token usage is always zero.
class Test < Base
  def initialize(response: nil, responses: nil)
    super()

    unless responses
      @response = normalize_response(response)
      return
    end

    raise ArgumentError, "responses: must not be empty (use response: nil for nil content)" if responses.empty?

    @responses = responses.map { |item| normalize_response(item) }
    @index = 0
  end

  # The messages and options are accepted for interface compatibility and ignored.
  def call(messages:, **_options) # rubocop:disable Lint/UnusedMethodArgument
    Response.new(content: next_content, usage: { input_tokens: 0, output_tokens: 0 })
  end

  private

  # Content for the current call; advances the cursor in list mode.
  def next_content
    return @response unless @responses

    current = @responses[@index] || @responses.last
    @index += 1
    current
  end

  def normalize_response(response)
    return "" if response.nil?
    return response if response.is_a?(Hash) || response.is_a?(Array)

    response.to_s
  end
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Concerns
|
|
6
|
+
# Mixin providing recursive key symbolization for nested Hash/Array data.
module DeepSymbolize
  # Returns a copy of +obj+ in which every Hash key that responds to
  # +to_sym+ has been converted to a Symbol, descending into nested Hashes
  # and Arrays. Keys that cannot be symbolized (e.g. Integers) are kept
  # unchanged instead of raising NoMethodError. Any value that is neither a
  # Hash nor an Array is returned as is.
  def deep_symbolize(obj)
    case obj
    when Hash
      # Single pass: convert the key and recurse into the value together.
      obj.each_with_object({}) do |(key, val), result|
        sym_key = key.respond_to?(:to_sym) ? key.to_sym : key
        result[sym_key] = deep_symbolize(val)
      end
    when Array then obj.map { |val| deep_symbolize(val) }
    else obj
    end
  end
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Concerns
|
|
6
|
+
# Class-level registry of named evals. The methods read +superclass+ and
# keep definitions in an instance variable of the receiver, so a class that
# gains them sees its ancestors' definitions merged with its own.
module EvalHost
  # Registers (or replaces) the eval called +name+ on this class. A warning
  # is printed when an existing name is replaced, unless the
  # :ruby_llm_contract_reloading thread-local flag is set.
  def define_eval(name, &)
    eval_key = name.to_s
    registry = (@eval_definitions ||= {})

    replacing = registry.key?(eval_key)
    if replacing && !Thread.current[:ruby_llm_contract_reloading]
      warn "[ruby_llm-contract] Redefining eval '#{eval_key}' on #{self}. " \
           "This replaces the previous definition."
    end

    registry[eval_key] = Eval::EvalDefinition.new(eval_key, step_class: self, &)
    Contract.register_eval_host(self)
    register_subclasses(self)
  end

  # Drops the definitions stored on this class (inherited ones are unaffected).
  def clear_eval_definitions!
    @eval_definitions = {}
  end

  # Names of all evals visible from this class, inherited ones included.
  def eval_names
    all_eval_definitions.keys
  end

  # True when at least one eval is visible from this class.
  def eval_defined?
    all_eval_definitions.any?
  end

  # Runs the named eval, or every visible eval when +name+ is omitted
  # (in which case a Hash of name => result is returned).
  def run_eval(name = nil, context: {})
    return run_all_own_evals(context) unless name

    run_single_eval(name, context)
  end

  # Runs the same eval once per model, each with its own copy of +context+
  # plus a :model entry, and wraps the results in an Eval::ModelComparison.
  def compare_models(eval_name, models:, context: {})
    reports = {}
    models.each do |model|
      per_model_context = deep_dup_context(context).merge(model: model)
      reports[model] = run_single_eval(eval_name, per_model_context)
    end
    Eval::ModelComparison.new(eval_name: eval_name, reports: reports)
  end

  private

  # Inherited definitions overlaid with the ones stored on this class.
  def all_eval_definitions
    parent = superclass
    inherited = parent.respond_to?(:all_eval_definitions, true) ? parent.send(:all_eval_definitions) : {}
    own = defined?(@eval_definitions) ? @eval_definitions : {}
    inherited.merge(own)
  end

  def run_single_eval(name, context)
    definitions = all_eval_definitions
    defn = definitions[name.to_s]
    raise ArgumentError, "No eval '#{name}' defined. Available: #{definitions.keys}" unless defn

    execute_eval(defn, context)
  end

  # Runs every visible eval, each against its own copy of the context.
  def run_all_own_evals(context)
    all_eval_definitions.transform_values do |defn|
      execute_eval(defn, deep_dup_context(context))
    end
  end

  # Resolves the effective context first, then builds the dataset and runs.
  def execute_eval(defn, context)
    effective_context = eval_context(defn, context)
    Eval::Runner.run(step: self, dataset: defn.build_dataset, context: effective_context)
  end

  # Uses the caller's adapter when present; otherwise falls back to the
  # adapter built by the definition, if it provides one.
  def eval_context(defn, context)
    return context if context[:adapter]

    sample_adapter = defn.build_adapter
    sample_adapter ? context.merge(adapter: sample_adapter) : context
  end

  # Registers every descendant of +klass+ as an eval host. Falls back to an
  # ObjectSpace scan when +klass+ does not respond to +subclasses+.
  def register_subclasses(klass)
    if klass.respond_to?(:subclasses)
      klass.subclasses.each do |child|
        Contract.register_eval_host(child)
        register_subclasses(child)
      end
    else
      ObjectSpace.each_object(Class) { |candidate| Contract.register_eval_host(candidate) if candidate < klass }
    end
  end

  # Copies the context with each value duplicated one level deep.
  def deep_dup_context(context)
    context.transform_values { |value| dup_context_value(value) }
  end

  # Duplicates +value+ when possible, otherwise returns it unchanged.
  def dup_context_value(value)
    value.respond_to?(:dup) ? value.dup : value
  rescue TypeError
    value
  end
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Concerns
|
|
6
|
+
# Mixin giving +==+ semantics based on the host's +to_h+.
module TraceEquality
  # Equal to a Hash when +to_h+ matches it; equal to another object only
  # when that object is an instance of the receiver's class (or a subclass)
  # and both +to_h+ results match.
  def ==(other)
    case other
    when Hash then to_h == other
    else other.is_a?(self.class) && to_h == other.to_h
    end
  end
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Concerns
|
|
6
|
+
# Private helpers for totalling token usage across trace entries.
# A trace entry may expose usage through a +usage+ method or through
# +[:usage]+; entries whose usage is not a Hash count as zero.
module UsageAggregator
  private

  # Reads the usage from one entry, preferring the +usage+ method over
  # +[:usage]+. Returns nil when the entry supports neither.
  def extract_usage(trace_entry)
    return trace_entry.usage if trace_entry.respond_to?(:usage)

    trace_entry[:usage] if trace_entry.respond_to?(:[])
  end

  # Combined input + output token count over all entries.
  def sum_tokens(traces)
    traces.sum do |entry|
      usage = extract_usage(entry)
      usage.is_a?(Hash) ? (usage[:input_tokens] || 0) + (usage[:output_tokens] || 0) : 0
    end
  end

  # Separate input and output totals as { input_tokens:, output_tokens: }.
  def aggregate_usage(traces)
    totals = { input_tokens: 0, output_tokens: 0 }

    traces.each do |entry|
      usage = extract_usage(entry)
      next unless usage.is_a?(Hash)

      totals[:input_tokens] += usage[:input_tokens] || 0
      totals[:output_tokens] += usage[:output_tokens] || 0
    end

    totals
  end
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
# Settings specific to ruby_llm-contract.
#
# Provider API keys are not held here; set them on RubyLLM directly:
#   RubyLLM.configure { |c| c.openai_api_key = ENV["OPENAI_API_KEY"] }
#
# Contract-specific options are then set with:
#   RubyLLM::Contract.configure { |c| c.default_model = "gpt-4.1-mini" }
class Configuration
  attr_accessor :default_adapter
  attr_accessor :default_model

  # Both settings start out unset (nil).
  def initialize
    @default_adapter = @default_model = nil
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
# Frozen description of a contract: a parse strategy plus an ordered list
# of invariants. The optional block is evaluated against the new instance,
# so +parse+, +invariant+ and +validate+ act as DSL calls inside it; once
# the block has run, both the invariant list and the definition are frozen.
class Definition
  attr_reader :parse_strategy, :invariants

  # Same as +new+; forwards the block.
  def self.build(&)
    new(&)
  end

  # Builds a new definition from +base+: its invariants followed by
  # +extra_invariants+, using +parse_override+ when given and the base
  # parse strategy otherwise.
  def self.merge(base, extra_invariants: [], parse_override: nil)
    strategy = parse_override || base.parse_strategy
    combined = base.invariants + extra_invariants

    new do
      parse(strategy)
      combined.each { |inv| @invariants << inv }
    end
  end

  def initialize(&block)
    @parse_strategy = :text
    @invariants = []
    instance_eval(&block) if block
    @invariants.freeze
    freeze
  end

  # DSL: selects the parse strategy (defaults to :text).
  def parse(strategy)
    @parse_strategy = strategy
  end

  # DSL: appends an invariant made of a description and a checking block.
  def invariant(description, &block)
    @invariants.push(Invariant.new(description, block))
  end
  alias_method :validate, :invariant
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
# A named check applied to parsed output.
#
# The check is a block taking either the parsed output alone, or the parsed
# output followed by the original input, which lets it cross-validate
# output against input.
class Invariant
  attr_reader :description

  # description - human-readable statement of what the check enforces.
  # block       - Proc/lambda implementing the check.
  def initialize(description, block)
    @description = description
    @block = block
    freeze
  end

  # Runs the check and returns whatever the block returns.
  #
  # parsed_output - the value to validate.
  # input         - the original input; passed as second argument only when
  #                 the block can accept one.
  def call(parsed_output, input: nil)
    return @block.call(parsed_output, input) if accepts_input?

    @block.call(parsed_output)
  end

  private

  # True when the block can take a second positional argument.
  #
  # Proc#arity is negative when optional or splat parameters are present
  # (-n - 1 for n required arguments). So -2 and below means "at least one
  # required argument, more allowed", e.g. |out, input = nil| or
  # ->(out, input, *rest). Checking only `arity >= 2` would withhold the
  # input from such blocks, and call two-required-argument lambdas with a
  # single argument.
  def accepts_input?
    arity = @block.arity
    arity >= 2 || arity <= -2
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|