ruby_llm-contract 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +55 -0
- data/CHANGELOG.md +76 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +176 -0
- data/LICENSE +21 -0
- data/README.md +154 -0
- data/Rakefile +8 -0
- data/examples/00_basics.rb +500 -0
- data/examples/01_classify_threads.rb +220 -0
- data/examples/02_generate_comment.rb +203 -0
- data/examples/03_target_audience.rb +201 -0
- data/examples/04_real_llm.rb +410 -0
- data/examples/05_output_schema.rb +258 -0
- data/examples/07_keyword_extraction.rb +239 -0
- data/examples/08_translation.rb +353 -0
- data/examples/09_eval_dataset.rb +287 -0
- data/examples/10_reddit_full_showcase.rb +363 -0
- data/examples/README.md +140 -0
- data/lib/ruby_llm/contract/adapters/base.rb +13 -0
- data/lib/ruby_llm/contract/adapters/response.rb +17 -0
- data/lib/ruby_llm/contract/adapters/ruby_llm.rb +94 -0
- data/lib/ruby_llm/contract/adapters/test.rb +44 -0
- data/lib/ruby_llm/contract/adapters.rb +6 -0
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +17 -0
- data/lib/ruby_llm/contract/concerns/eval_host.rb +109 -0
- data/lib/ruby_llm/contract/concerns/trace_equality.rb +15 -0
- data/lib/ruby_llm/contract/concerns/usage_aggregator.rb +43 -0
- data/lib/ruby_llm/contract/configuration.rb +21 -0
- data/lib/ruby_llm/contract/contract/definition.rb +39 -0
- data/lib/ruby_llm/contract/contract/invariant.rb +23 -0
- data/lib/ruby_llm/contract/contract/parser.rb +143 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +239 -0
- data/lib/ruby_llm/contract/contract/validator.rb +104 -0
- data/lib/ruby_llm/contract/contract.rb +7 -0
- data/lib/ruby_llm/contract/cost_calculator.rb +38 -0
- data/lib/ruby_llm/contract/dsl.rb +13 -0
- data/lib/ruby_llm/contract/errors.rb +19 -0
- data/lib/ruby_llm/contract/eval/case_result.rb +76 -0
- data/lib/ruby_llm/contract/eval/contract_detail_builder.rb +47 -0
- data/lib/ruby_llm/contract/eval/dataset.rb +53 -0
- data/lib/ruby_llm/contract/eval/eval_definition.rb +112 -0
- data/lib/ruby_llm/contract/eval/evaluation_result.rb +27 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +20 -0
- data/lib/ruby_llm/contract/eval/evaluator/json_includes.rb +58 -0
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +40 -0
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +27 -0
- data/lib/ruby_llm/contract/eval/model_comparison.rb +80 -0
- data/lib/ruby_llm/contract/eval/pipeline_result_adapter.rb +15 -0
- data/lib/ruby_llm/contract/eval/report.rb +115 -0
- data/lib/ruby_llm/contract/eval/runner.rb +162 -0
- data/lib/ruby_llm/contract/eval/trait_evaluator.rb +75 -0
- data/lib/ruby_llm/contract/eval.rb +16 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +62 -0
- data/lib/ruby_llm/contract/pipeline/result.rb +131 -0
- data/lib/ruby_llm/contract/pipeline/runner.rb +139 -0
- data/lib/ruby_llm/contract/pipeline/trace.rb +72 -0
- data/lib/ruby_llm/contract/pipeline.rb +6 -0
- data/lib/ruby_llm/contract/prompt/ast.rb +38 -0
- data/lib/ruby_llm/contract/prompt/builder.rb +47 -0
- data/lib/ruby_llm/contract/prompt/node.rb +25 -0
- data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +27 -0
- data/lib/ruby_llm/contract/prompt/nodes/rule_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/section_node.rb +26 -0
- data/lib/ruby_llm/contract/prompt/nodes/system_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/user_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes.rb +7 -0
- data/lib/ruby_llm/contract/prompt/renderer.rb +76 -0
- data/lib/ruby_llm/contract/railtie.rb +20 -0
- data/lib/ruby_llm/contract/rake_task.rb +78 -0
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +96 -0
- data/lib/ruby_llm/contract/rspec/satisfy_contract.rb +31 -0
- data/lib/ruby_llm/contract/rspec.rb +6 -0
- data/lib/ruby_llm/contract/step/base.rb +138 -0
- data/lib/ruby_llm/contract/step/dsl.rb +144 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +64 -0
- data/lib/ruby_llm/contract/step/result.rb +38 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +90 -0
- data/lib/ruby_llm/contract/step/retry_policy.rb +76 -0
- data/lib/ruby_llm/contract/step/runner.rb +126 -0
- data/lib/ruby_llm/contract/step/trace.rb +70 -0
- data/lib/ruby_llm/contract/step.rb +10 -0
- data/lib/ruby_llm/contract/token_estimator.rb +19 -0
- data/lib/ruby_llm/contract/types.rb +11 -0
- data/lib/ruby_llm/contract/version.rb +7 -0
- data/lib/ruby_llm/contract.rb +108 -0
- data/ruby_llm-contract.gemspec +33 -0
- metadata +172 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: dc8b1278c5464978cfc50d87ac90cbde94c2a5920b00996365a2e366bb27f1e6
|
|
4
|
+
data.tar.gz: daf51e9b66472464137d371439f503317b84706f167fa74c2118635ae82823b1
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 7899b4c2df5e7824a5104c24698b728b017e996cced82022c26d81167e8876085fcec93396d13ce67aed7034cfbd0cfbde2e0ebd76376a8dd198d6a561d273d7
|
|
7
|
+
data.tar.gz: 7ca10ee16ea71eda609439546b9f02b20e184e9c6a8861d88a172c0e75ca611cf3c2bddf8827b820b31aa9cc5baf88bdaf9b708732b78f3e6321438030186ef8
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 3.2
|
|
3
|
+
NewCops: enable
|
|
4
|
+
SuggestExtensions: false
|
|
5
|
+
|
|
6
|
+
Style/Documentation:
|
|
7
|
+
Enabled: false
|
|
8
|
+
|
|
9
|
+
Style/StringLiterals:
|
|
10
|
+
EnforcedStyle: double_quotes
|
|
11
|
+
|
|
12
|
+
Style/StringLiteralsInInterpolation:
|
|
13
|
+
EnforcedStyle: double_quotes
|
|
14
|
+
|
|
15
|
+
Metrics/BlockLength:
|
|
16
|
+
Exclude:
|
|
17
|
+
- 'spec/**/*'
|
|
18
|
+
- '*.gemspec'
|
|
19
|
+
|
|
20
|
+
Metrics/MethodLength:
|
|
21
|
+
Max: 25
|
|
22
|
+
|
|
23
|
+
Layout/LineLength:
|
|
24
|
+
Max: 120
|
|
25
|
+
|
|
26
|
+
Style/OneClassPerFile:
|
|
27
|
+
Exclude:
|
|
28
|
+
- 'spec/**/*'
|
|
29
|
+
- 'examples/**/*'
|
|
30
|
+
|
|
31
|
+
Lint/UnusedBlockArgument:
|
|
32
|
+
Exclude:
|
|
33
|
+
- 'spec/**/*'
|
|
34
|
+
|
|
35
|
+
Naming/VariableNumber:
|
|
36
|
+
Exclude:
|
|
37
|
+
- 'spec/**/*'
|
|
38
|
+
- 'examples/**/*'
|
|
39
|
+
|
|
40
|
+
AllCops:
|
|
41
|
+
Exclude:
|
|
42
|
+
- 'internal/**/*'
|
|
43
|
+
|
|
44
|
+
Metrics/ClassLength:
|
|
45
|
+
Max: 130
|
|
46
|
+
|
|
47
|
+
Metrics/AbcSize:
|
|
48
|
+
Max: 30
|
|
49
|
+
|
|
50
|
+
Metrics/ParameterLists:
|
|
51
|
+
Max: 11
|
|
52
|
+
MaxOptionalParameters: 9
|
|
53
|
+
|
|
54
|
+
Style/FormatStringToken:
|
|
55
|
+
Enabled: false
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.2.0 (2026-03-23)
|
|
4
|
+
|
|
5
|
+
Contracts for LLM quality. Know which model to use, what it costs, and when accuracy drops.
|
|
6
|
+
|
|
7
|
+
### Breaking changes
|
|
8
|
+
|
|
9
|
+
- **`report.results` returns `CaseResult` objects** instead of hashes. Use `result.name`, `result.passed?`, `result.score` instead of `result[:case_name]`, `result[:passed]`. Call `CaseResult#to_h` for backward compatibility.
|
|
10
|
+
- **`report.print_summary`** replaces `report.pretty_print` (avoids shadowing `Kernel#pretty_print`).
|
|
11
|
+
|
|
12
|
+
### Features
|
|
13
|
+
|
|
14
|
+
- **`add_case` in `define_eval`** — `add_case "billing", input: "...", expected: { priority: "high" }` with partial matching. Supports `expected_traits:` for regex/range matching.
|
|
15
|
+
- **`CaseResult` value objects** — `result.name`, `result.passed?`, `result.output`, `result.expected`, `result.mismatches` (structured diff), `result.cost`, `result.duration_ms`.
|
|
16
|
+
- **`report.failures`** — returns only failed cases. `report.skipped` counts skipped (offline) cases.
|
|
17
|
+
- **Model comparison** — `Step.compare_models("eval", models: %w[nano mini full])` runs same eval across models. Returns table with score/cost/latency per model. `comparison.best_for(min_score: 0.95)` returns cheapest model meeting threshold.
|
|
18
|
+
- **Cost tracking** — `report.total_cost`, `report.avg_latency_ms`, per-case `result.cost`. Pipeline eval uses total pipeline cost, not just last step.
|
|
19
|
+
- **Cost prediction** — `Step.estimate_cost(input:, model:)` and `Step.estimate_eval_cost("eval", models: [...])` predict spend before API calls.
|
|
20
|
+
- **CI gating** — `pass_eval("regression").with_minimum_score(0.8).with_maximum_cost(0.01)`. RakeTask with suite-level `minimum_score` and `maximum_cost`.
|
|
21
|
+
- **`RubyLLM::Contract.run_all_evals`** — discovers all Steps/Pipelines with evals, runs them all. Includes inherited evals.
|
|
22
|
+
- **`RubyLLM::Contract::RakeTask`** — `rake ruby_llm_contract:eval` with `minimum_score`, `maximum_cost`, `fail_on_empty`, `eval_dirs`.
|
|
23
|
+
- **Rails Railtie** — auto-loads eval files via `config.after_initialize` + `config.to_prepare` (supports development reload).
|
|
24
|
+
- **Offline mode** — cases without adapter return `:skipped` instead of crashing. Skipped cases excluded from score/passed.
|
|
25
|
+
- **Safe `define_eval`** — warns on duplicate name; suppressed during reload.
|
|
26
|
+
|
|
27
|
+
### Fixes
|
|
28
|
+
|
|
29
|
+
- **P1: Eval files not autoloaded by Rails** — Railtie uses `load` (not Zeitwerk). Hooks into reloader for dev.
|
|
30
|
+
- **P2: report.results returns raw Hashes** — now returns `CaseResult` objects.
|
|
31
|
+
- **P3: No way to run all evals at once** — `Contract.run_all_evals` + Rake task.
|
|
32
|
+
- **P4: String vs symbol key mismatch** — warns when `validate` or `verify` proc returns nil.
|
|
33
|
+
- **Pipeline eval cost** — uses `Pipeline::Trace#total_cost` (all steps), not just last step.
|
|
34
|
+
- **Reload lifecycle** — `load_evals!` clears definitions before re-loading. Registry filters stale hosts.
|
|
35
|
+
- **Adapter isolation** — `compare_models` and `run_all_own_evals` deep-dup context per run.
|
|
36
|
+
|
|
37
|
+
### Verified with real API
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
Model Score Cost Avg Latency
|
|
41
|
+
---------------------------------------------------------
|
|
42
|
+
gpt-4.1-nano 0.67 $0.000032 687ms
|
|
43
|
+
gpt-4.1-mini 1.00 $0.000102 1070ms
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Stats
|
|
47
|
+
|
|
48
|
+
- 1077 tests, 0 failures
|
|
49
|
+
- 3 architecture review rounds, 32 findings fixed
|
|
50
|
+
- Verified with real OpenAI API (gpt-4.1-nano, gpt-4.1-mini)
|
|
51
|
+
|
|
52
|
+
## 0.1.0 (2026-03-20)
|
|
53
|
+
|
|
54
|
+
Initial release.
|
|
55
|
+
|
|
56
|
+
### Features
|
|
57
|
+
|
|
58
|
+
- **Step abstraction** — `RubyLLM::Contract::Step::Base` with prompt DSL, typed input/output
|
|
59
|
+
- **Output schema** — declarative structure via ruby_llm-schema, sent to provider for enforcement
|
|
60
|
+
- **Validate** — business logic checks (1-arity and 2-arity with input cross-validation)
|
|
61
|
+
- **Retry with model escalation** — start cheap, auto-escalate on contract failure or network error
|
|
62
|
+
- **Preflight limits** — `max_input`, `max_cost`, `max_output` refuse before calling the LLM
|
|
63
|
+
- **Pipeline** — multi-step composition with fail-fast, timeout, token budget
|
|
64
|
+
- **Eval** — offline contract verification with `define_eval`, `run_eval`, zero-verify auto-case
|
|
65
|
+
- **Adapters** — RubyLLM (production), Test (deterministic specs)
|
|
66
|
+
- **RSpec matchers** — `satisfy_contract`, `pass_eval`
|
|
67
|
+
- **Structured trace** — model, latency, tokens, cost, attempt log per step
|
|
68
|
+
|
|
69
|
+
### Robustness
|
|
70
|
+
|
|
71
|
+
- 1005 tests, 0 failures
|
|
72
|
+
- 42 bugs found and fixed via 10 rounds of adversarial testing
|
|
73
|
+
- 0 RuboCop offenses
|
|
74
|
+
- Parser handles: markdown code fences, UTF-8 BOM, JSON extraction from prose
|
|
75
|
+
- SchemaValidator: full nested validation, additionalProperties, minItems/maxItems, minLength/maxLength
|
|
76
|
+
- Deep-frozen parsed_output prevents mutation via shared references
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
ruby_llm-contract (0.2.0)
|
|
5
|
+
dry-types (~> 1.7)
|
|
6
|
+
ruby_llm (~> 1.0)
|
|
7
|
+
ruby_llm-schema (~> 0.3)
|
|
8
|
+
|
|
9
|
+
GEM
|
|
10
|
+
remote: https://rubygems.org/
|
|
11
|
+
specs:
|
|
12
|
+
addressable (2.8.9)
|
|
13
|
+
public_suffix (>= 2.0.2, < 8.0)
|
|
14
|
+
ast (2.4.3)
|
|
15
|
+
base64 (0.3.0)
|
|
16
|
+
bigdecimal (4.0.1)
|
|
17
|
+
concurrent-ruby (1.3.6)
|
|
18
|
+
diff-lcs (1.6.2)
|
|
19
|
+
dry-core (1.2.0)
|
|
20
|
+
concurrent-ruby (~> 1.0)
|
|
21
|
+
logger
|
|
22
|
+
zeitwerk (~> 2.6)
|
|
23
|
+
dry-inflector (1.3.1)
|
|
24
|
+
dry-logic (1.6.0)
|
|
25
|
+
bigdecimal
|
|
26
|
+
concurrent-ruby (~> 1.0)
|
|
27
|
+
dry-core (~> 1.1)
|
|
28
|
+
zeitwerk (~> 2.6)
|
|
29
|
+
dry-types (1.9.1)
|
|
30
|
+
bigdecimal (>= 3.0)
|
|
31
|
+
concurrent-ruby (~> 1.0)
|
|
32
|
+
dry-core (~> 1.0)
|
|
33
|
+
dry-inflector (~> 1.0)
|
|
34
|
+
dry-logic (~> 1.4)
|
|
35
|
+
zeitwerk (~> 2.6)
|
|
36
|
+
event_stream_parser (1.0.0)
|
|
37
|
+
faraday (2.14.1)
|
|
38
|
+
faraday-net_http (>= 2.0, < 3.5)
|
|
39
|
+
json
|
|
40
|
+
logger
|
|
41
|
+
faraday-multipart (1.2.0)
|
|
42
|
+
multipart-post (~> 2.0)
|
|
43
|
+
faraday-net_http (3.4.2)
|
|
44
|
+
net-http (~> 0.5)
|
|
45
|
+
faraday-retry (2.4.0)
|
|
46
|
+
faraday (~> 2.0)
|
|
47
|
+
json (2.19.2)
|
|
48
|
+
json-schema (6.2.0)
|
|
49
|
+
addressable (~> 2.8)
|
|
50
|
+
bigdecimal (>= 3.1, < 5)
|
|
51
|
+
language_server-protocol (3.17.0.5)
|
|
52
|
+
lint_roller (1.1.0)
|
|
53
|
+
logger (1.7.0)
|
|
54
|
+
marcel (1.1.0)
|
|
55
|
+
mcp (0.9.0)
|
|
56
|
+
json-schema (>= 4.1)
|
|
57
|
+
multipart-post (2.4.1)
|
|
58
|
+
net-http (0.9.1)
|
|
59
|
+
uri (>= 0.11.1)
|
|
60
|
+
parallel (1.27.0)
|
|
61
|
+
parser (3.3.10.2)
|
|
62
|
+
ast (~> 2.4.1)
|
|
63
|
+
racc
|
|
64
|
+
prism (1.9.0)
|
|
65
|
+
public_suffix (7.0.5)
|
|
66
|
+
racc (1.8.1)
|
|
67
|
+
rainbow (3.1.1)
|
|
68
|
+
rake (13.3.1)
|
|
69
|
+
regexp_parser (2.11.3)
|
|
70
|
+
rspec (3.13.2)
|
|
71
|
+
rspec-core (~> 3.13.0)
|
|
72
|
+
rspec-expectations (~> 3.13.0)
|
|
73
|
+
rspec-mocks (~> 3.13.0)
|
|
74
|
+
rspec-core (3.13.6)
|
|
75
|
+
rspec-support (~> 3.13.0)
|
|
76
|
+
rspec-expectations (3.13.5)
|
|
77
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
78
|
+
rspec-support (~> 3.13.0)
|
|
79
|
+
rspec-mocks (3.13.8)
|
|
80
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
81
|
+
rspec-support (~> 3.13.0)
|
|
82
|
+
rspec-support (3.13.7)
|
|
83
|
+
rubocop (1.85.1)
|
|
84
|
+
json (~> 2.3)
|
|
85
|
+
language_server-protocol (~> 3.17.0.2)
|
|
86
|
+
lint_roller (~> 1.1.0)
|
|
87
|
+
mcp (~> 0.6)
|
|
88
|
+
parallel (~> 1.10)
|
|
89
|
+
parser (>= 3.3.0.2)
|
|
90
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
91
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
|
92
|
+
rubocop-ast (>= 1.49.0, < 2.0)
|
|
93
|
+
ruby-progressbar (~> 1.7)
|
|
94
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
|
95
|
+
rubocop-ast (1.49.1)
|
|
96
|
+
parser (>= 3.3.7.2)
|
|
97
|
+
prism (~> 1.7)
|
|
98
|
+
ruby-progressbar (1.13.0)
|
|
99
|
+
ruby_llm (1.14.0)
|
|
100
|
+
base64
|
|
101
|
+
event_stream_parser (~> 1)
|
|
102
|
+
faraday (>= 1.10.0)
|
|
103
|
+
faraday-multipart (>= 1)
|
|
104
|
+
faraday-net_http (>= 1)
|
|
105
|
+
faraday-retry (>= 1)
|
|
106
|
+
marcel (~> 1)
|
|
107
|
+
ruby_llm-schema (~> 0)
|
|
108
|
+
zeitwerk (~> 2)
|
|
109
|
+
ruby_llm-schema (0.3.0)
|
|
110
|
+
unicode-display_width (3.2.0)
|
|
111
|
+
unicode-emoji (~> 4.1)
|
|
112
|
+
unicode-emoji (4.2.0)
|
|
113
|
+
uri (1.1.1)
|
|
114
|
+
zeitwerk (2.7.5)
|
|
115
|
+
|
|
116
|
+
PLATFORMS
|
|
117
|
+
arm64-darwin-25
|
|
118
|
+
ruby
|
|
119
|
+
|
|
120
|
+
DEPENDENCIES
|
|
121
|
+
rake (~> 13.0)
|
|
122
|
+
rspec (~> 3.13)
|
|
123
|
+
rubocop (~> 1.75)
|
|
124
|
+
ruby_llm-contract!
|
|
125
|
+
|
|
126
|
+
CHECKSUMS
|
|
127
|
+
addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
|
|
128
|
+
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
129
|
+
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
130
|
+
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
131
|
+
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
132
|
+
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
133
|
+
dry-core (1.2.0) sha256=0cc5a7da88df397f153947eeeae42e876e999c1e30900f3c536fb173854e96a1
|
|
134
|
+
dry-inflector (1.3.1) sha256=7fb0c2bb04f67638f25c52e7ba39ab435d922a3a5c3cd196120f63accb682dcc
|
|
135
|
+
dry-logic (1.6.0) sha256=da6fedbc0f90fc41f9b0cc7e6f05f5d529d1efaef6c8dcc8e0733f685745cea2
|
|
136
|
+
dry-types (1.9.1) sha256=baebeecdb9f8395d6c9d227b62011279440943e3ef2468fe8ccc1ba11467f178
|
|
137
|
+
event_stream_parser (1.0.0) sha256=a2683bab70126286f8184dc88f7968ffc4028f813161fb073ec90d171f7de3c8
|
|
138
|
+
faraday (2.14.1) sha256=a43cceedc1e39d188f4d2cdd360a8aaa6a11da0c407052e426ba8d3fb42ef61c
|
|
139
|
+
faraday-multipart (1.2.0) sha256=7d89a949693714176f612323ca13746a2ded204031a6ba528adee788694ef757
|
|
140
|
+
faraday-net_http (3.4.2) sha256=f147758260d3526939bf57ecf911682f94926a3666502e24c69992765875906c
|
|
141
|
+
faraday-retry (2.4.0) sha256=7b79c48fb7e56526faf247b12d94a680071ff40c9fda7cf1ec1549439ad11ebe
|
|
142
|
+
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
143
|
+
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
144
|
+
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
145
|
+
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
146
|
+
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
147
|
+
marcel (1.1.0) sha256=fdcfcfa33cc52e93c4308d40e4090a5d4ea279e160a7f6af988260fa970e0bee
|
|
148
|
+
mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
|
|
149
|
+
multipart-post (2.4.1) sha256=9872d03a8e552020ca096adadbf5e3cb1cd1cdd6acd3c161136b8a5737cdb4a8
|
|
150
|
+
net-http (0.9.1) sha256=25ba0b67c63e89df626ed8fac771d0ad24ad151a858af2cc8e6a716ca4336996
|
|
151
|
+
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
152
|
+
parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
|
|
153
|
+
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
154
|
+
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
|
155
|
+
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
156
|
+
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
157
|
+
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
158
|
+
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
159
|
+
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
160
|
+
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
161
|
+
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
162
|
+
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
163
|
+
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
164
|
+
rubocop (1.85.1) sha256=3dbcf9e961baa4c376eeeb2a03913dca5e3987033b04d38fa538aa1e7406cc77
|
|
165
|
+
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
166
|
+
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
167
|
+
ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
|
|
168
|
+
ruby_llm-contract (0.2.0)
|
|
169
|
+
ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
|
|
170
|
+
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
171
|
+
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
|
172
|
+
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
|
|
173
|
+
zeitwerk (2.7.5) sha256=d8da92128c09ea6ec62c949011b00ed4a20242b255293dd66bf41545398f73dd
|
|
174
|
+
|
|
175
|
+
BUNDLED WITH
|
|
176
|
+
4.0.3
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Justyna
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# ruby_llm-contract
|
|
2
|
+
|
|
3
|
+
Contracts for LLM quality. Know which model to use, what it costs, and when accuracy drops.
|
|
4
|
+
|
|
5
|
+
Companion gem for [ruby_llm](https://github.com/crmne/ruby_llm).
|
|
6
|
+
|
|
7
|
+
## The problem
|
|
8
|
+
|
|
9
|
+
You call an LLM. It returns bad JSON, wrong values, or costs 4x more than it should. You switch models and quality drops silently. You have no data to decide which model to use.
|
|
10
|
+
|
|
11
|
+
## The fix
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
class ClassifyTicket < RubyLLM::Contract::Step::Base
|
|
15
|
+
prompt do
|
|
16
|
+
system "You are a support ticket classifier."
|
|
17
|
+
rule "Return valid JSON only, no markdown."
|
|
18
|
+
rule "Use exactly one priority: low, medium, high, urgent."
|
|
19
|
+
example input: "My invoice is wrong", output: '{"priority": "high"}'
|
|
20
|
+
user "{input}"
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
output_schema do
|
|
24
|
+
string :priority, enum: %w[low medium high urgent]
|
|
25
|
+
string :category
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
validate("urgent needs justification") { |o, input| o[:priority] != "urgent" || input.length > 20 }
|
|
29
|
+
retry_policy models: %w[gpt-4.1-nano gpt-4.1-mini gpt-4.1]
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
result = ClassifyTicket.run("I was charged twice")
|
|
33
|
+
result.ok? # => true
|
|
34
|
+
result.parsed_output # => {priority: "high", category: "billing"}
|
|
35
|
+
result.trace[:cost] # => 0.000032
|
|
36
|
+
result.trace[:model] # => "gpt-4.1-nano"
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Bad JSON? Auto-retry. Wrong value? Escalate to a smarter model. Schema violated? Caught client-side even if the provider ignores it. All with cost tracking.
|
|
40
|
+
|
|
41
|
+
## Which model should I use?
|
|
42
|
+
|
|
43
|
+
Define test cases. Compare models. Get data.
|
|
44
|
+
|
|
45
|
+
```ruby
|
|
46
|
+
ClassifyTicket.define_eval("regression") do
|
|
47
|
+
add_case "billing", input: "I was charged twice", expected: { priority: "high" }
|
|
48
|
+
add_case "feature", input: "Add dark mode please", expected: { priority: "low" }
|
|
49
|
+
add_case "outage", input: "Database is down", expected: { priority: "urgent" }
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
comparison = ClassifyTicket.compare_models("regression",
|
|
53
|
+
models: %w[gpt-4.1-nano gpt-4.1-mini])
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Real output from real API calls:
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
Model Score Cost Avg Latency
|
|
60
|
+
---------------------------------------------------------
|
|
61
|
+
gpt-4.1-nano 0.67 $0.000032 687ms
|
|
62
|
+
gpt-4.1-mini 1.00 $0.000102 1070ms
|
|
63
|
+
|
|
64
|
+
Cheapest at 100%: gpt-4.1-mini
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
```ruby
|
|
68
|
+
comparison.best_for(min_score: 0.95) # => "gpt-4.1-mini"
|
|
69
|
+
|
|
70
|
+
# Inspect failures
|
|
71
|
+
comparison.reports["gpt-4.1-nano"].failures.each do |f|
|
|
72
|
+
puts "#{f.name}: expected #{f.expected}, got #{f.output}"
|
|
73
|
+
puts " mismatches: #{f.mismatches}"
|
|
74
|
+
# => outage: expected {priority: "urgent"}, got {priority: "high"}
|
|
75
|
+
# mismatches: {priority: {expected: "urgent", got: "high"}}
|
|
76
|
+
end
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Pipeline
|
|
80
|
+
|
|
81
|
+
Chain steps with fail-fast. A hallucination in step 1 stops the pipeline before step 2 spends tokens.
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
class TicketPipeline < RubyLLM::Contract::Pipeline::Base
|
|
85
|
+
step ClassifyTicket, as: :classify
|
|
86
|
+
step RouteToTeam, as: :route
|
|
87
|
+
step DraftResponse, as: :draft
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
result = TicketPipeline.run("I was charged twice")
|
|
91
|
+
result.ok? # => true
|
|
92
|
+
result.outputs_by_step[:classify] # => {priority: "high", category: "billing"}
|
|
93
|
+
result.trace.total_cost # => 0.000128
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## CI gate
|
|
97
|
+
|
|
98
|
+
```ruby
|
|
99
|
+
# RSpec — block merge if accuracy drops or cost spikes
|
|
100
|
+
expect(ClassifyTicket).to pass_eval("regression")
|
|
101
|
+
.with_context(model: "gpt-4.1-mini")
|
|
102
|
+
.with_minimum_score(0.8)
|
|
103
|
+
.with_maximum_cost(0.01)
|
|
104
|
+
|
|
105
|
+
# Rake — run all evals across all steps
|
|
106
|
+
require "ruby_llm/contract/rake_task"
|
|
107
|
+
RubyLLM::Contract::RakeTask.new do |t|
|
|
108
|
+
t.minimum_score = 0.8
|
|
109
|
+
t.maximum_cost = 0.05
|
|
110
|
+
end
|
|
111
|
+
# bundle exec rake ruby_llm_contract:eval
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Predict cost before running
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
ClassifyTicket.estimate_eval_cost("regression", models: %w[gpt-4.1-nano gpt-4.1-mini])
|
|
118
|
+
# => { "gpt-4.1-nano" => 0.000024, "gpt-4.1-mini" => 0.000096 }
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Install
|
|
122
|
+
|
|
123
|
+
```ruby
|
|
124
|
+
gem "ruby_llm-contract"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
```ruby
|
|
128
|
+
RubyLLM.configure { |c| c.openai_api_key = ENV["OPENAI_API_KEY"] }
|
|
129
|
+
RubyLLM::Contract.configure { |c| c.default_model = "gpt-4.1-mini" }
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc.).
|
|
133
|
+
|
|
134
|
+
## Docs
|
|
135
|
+
|
|
136
|
+
| Guide | |
|
|
137
|
+
|-------|-|
|
|
138
|
+
| [Getting Started](docs/guide/getting_started.md) | Features walkthrough, model escalation, eval |
|
|
139
|
+
| [Best Practices](docs/guide/best_practices.md) | 6 patterns for bulletproof validates |
|
|
140
|
+
| [Output Schema](docs/guide/output_schema.md) | Full schema reference + constraints |
|
|
141
|
+
| [Pipeline](docs/guide/pipeline.md) | Multi-step composition, timeout, fail-fast |
|
|
142
|
+
| [Testing](docs/guide/testing.md) | Test adapter, RSpec matchers |
|
|
143
|
+
|
|
144
|
+
## Roadmap
|
|
145
|
+
|
|
146
|
+
**v0.2 (current):** Model comparison, cost tracking, eval with `add_case`, CI gating, Rails Railtie.
|
|
147
|
+
|
|
148
|
+
**v0.3:** Regression baselines — compare eval results with previous run, detect quality drift.
|
|
149
|
+
|
|
150
|
+
**v0.4:** Auto-routing — learn which model works for which input pattern.
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
MIT
|