ruby_llm-contract 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +55 -0
  4. data/CHANGELOG.md +76 -0
  5. data/Gemfile +11 -0
  6. data/Gemfile.lock +176 -0
  7. data/LICENSE +21 -0
  8. data/README.md +154 -0
  9. data/Rakefile +8 -0
  10. data/examples/00_basics.rb +500 -0
  11. data/examples/01_classify_threads.rb +220 -0
  12. data/examples/02_generate_comment.rb +203 -0
  13. data/examples/03_target_audience.rb +201 -0
  14. data/examples/04_real_llm.rb +410 -0
  15. data/examples/05_output_schema.rb +258 -0
  16. data/examples/07_keyword_extraction.rb +239 -0
  17. data/examples/08_translation.rb +353 -0
  18. data/examples/09_eval_dataset.rb +287 -0
  19. data/examples/10_reddit_full_showcase.rb +363 -0
  20. data/examples/README.md +140 -0
  21. data/lib/ruby_llm/contract/adapters/base.rb +13 -0
  22. data/lib/ruby_llm/contract/adapters/response.rb +17 -0
  23. data/lib/ruby_llm/contract/adapters/ruby_llm.rb +94 -0
  24. data/lib/ruby_llm/contract/adapters/test.rb +44 -0
  25. data/lib/ruby_llm/contract/adapters.rb +6 -0
  26. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +17 -0
  27. data/lib/ruby_llm/contract/concerns/eval_host.rb +109 -0
  28. data/lib/ruby_llm/contract/concerns/trace_equality.rb +15 -0
  29. data/lib/ruby_llm/contract/concerns/usage_aggregator.rb +43 -0
  30. data/lib/ruby_llm/contract/configuration.rb +21 -0
  31. data/lib/ruby_llm/contract/contract/definition.rb +39 -0
  32. data/lib/ruby_llm/contract/contract/invariant.rb +23 -0
  33. data/lib/ruby_llm/contract/contract/parser.rb +143 -0
  34. data/lib/ruby_llm/contract/contract/schema_validator.rb +239 -0
  35. data/lib/ruby_llm/contract/contract/validator.rb +104 -0
  36. data/lib/ruby_llm/contract/contract.rb +7 -0
  37. data/lib/ruby_llm/contract/cost_calculator.rb +38 -0
  38. data/lib/ruby_llm/contract/dsl.rb +13 -0
  39. data/lib/ruby_llm/contract/errors.rb +19 -0
  40. data/lib/ruby_llm/contract/eval/case_result.rb +76 -0
  41. data/lib/ruby_llm/contract/eval/contract_detail_builder.rb +47 -0
  42. data/lib/ruby_llm/contract/eval/dataset.rb +53 -0
  43. data/lib/ruby_llm/contract/eval/eval_definition.rb +112 -0
  44. data/lib/ruby_llm/contract/eval/evaluation_result.rb +27 -0
  45. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +20 -0
  46. data/lib/ruby_llm/contract/eval/evaluator/json_includes.rb +58 -0
  47. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +40 -0
  48. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +27 -0
  49. data/lib/ruby_llm/contract/eval/model_comparison.rb +80 -0
  50. data/lib/ruby_llm/contract/eval/pipeline_result_adapter.rb +15 -0
  51. data/lib/ruby_llm/contract/eval/report.rb +115 -0
  52. data/lib/ruby_llm/contract/eval/runner.rb +162 -0
  53. data/lib/ruby_llm/contract/eval/trait_evaluator.rb +75 -0
  54. data/lib/ruby_llm/contract/eval.rb +16 -0
  55. data/lib/ruby_llm/contract/pipeline/base.rb +62 -0
  56. data/lib/ruby_llm/contract/pipeline/result.rb +131 -0
  57. data/lib/ruby_llm/contract/pipeline/runner.rb +139 -0
  58. data/lib/ruby_llm/contract/pipeline/trace.rb +72 -0
  59. data/lib/ruby_llm/contract/pipeline.rb +6 -0
  60. data/lib/ruby_llm/contract/prompt/ast.rb +38 -0
  61. data/lib/ruby_llm/contract/prompt/builder.rb +47 -0
  62. data/lib/ruby_llm/contract/prompt/node.rb +25 -0
  63. data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +27 -0
  64. data/lib/ruby_llm/contract/prompt/nodes/rule_node.rb +15 -0
  65. data/lib/ruby_llm/contract/prompt/nodes/section_node.rb +26 -0
  66. data/lib/ruby_llm/contract/prompt/nodes/system_node.rb +15 -0
  67. data/lib/ruby_llm/contract/prompt/nodes/user_node.rb +15 -0
  68. data/lib/ruby_llm/contract/prompt/nodes.rb +7 -0
  69. data/lib/ruby_llm/contract/prompt/renderer.rb +76 -0
  70. data/lib/ruby_llm/contract/railtie.rb +20 -0
  71. data/lib/ruby_llm/contract/rake_task.rb +78 -0
  72. data/lib/ruby_llm/contract/rspec/pass_eval.rb +96 -0
  73. data/lib/ruby_llm/contract/rspec/satisfy_contract.rb +31 -0
  74. data/lib/ruby_llm/contract/rspec.rb +6 -0
  75. data/lib/ruby_llm/contract/step/base.rb +138 -0
  76. data/lib/ruby_llm/contract/step/dsl.rb +144 -0
  77. data/lib/ruby_llm/contract/step/limit_checker.rb +64 -0
  78. data/lib/ruby_llm/contract/step/result.rb +38 -0
  79. data/lib/ruby_llm/contract/step/retry_executor.rb +90 -0
  80. data/lib/ruby_llm/contract/step/retry_policy.rb +76 -0
  81. data/lib/ruby_llm/contract/step/runner.rb +126 -0
  82. data/lib/ruby_llm/contract/step/trace.rb +70 -0
  83. data/lib/ruby_llm/contract/step.rb +10 -0
  84. data/lib/ruby_llm/contract/token_estimator.rb +19 -0
  85. data/lib/ruby_llm/contract/types.rb +11 -0
  86. data/lib/ruby_llm/contract/version.rb +7 -0
  87. data/lib/ruby_llm/contract.rb +108 -0
  88. data/ruby_llm-contract.gemspec +33 -0
  89. metadata +172 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: dc8b1278c5464978cfc50d87ac90cbde94c2a5920b00996365a2e366bb27f1e6
4
+ data.tar.gz: daf51e9b66472464137d371439f503317b84706f167fa74c2118635ae82823b1
5
+ SHA512:
6
+ metadata.gz: 7899b4c2df5e7824a5104c24698b728b017e996cced82022c26d81167e8876085fcec93396d13ce67aed7034cfbd0cfbde2e0ebd76376a8dd198d6a561d273d7
7
+ data.tar.gz: 7ca10ee16ea71eda609439546b9f02b20e184e9c6a8861d88a172c0e75ca611cf3c2bddf8827b820b31aa9cc5baf88bdaf9b708732b78f3e6321438030186ef8
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,55 @@
1
+ AllCops:
2
+ TargetRubyVersion: 3.2
3
+ NewCops: enable
4
+ SuggestExtensions: false
5
+
6
+ Style/Documentation:
7
+ Enabled: false
8
+
9
+ Style/StringLiterals:
10
+ EnforcedStyle: double_quotes
11
+
12
+ Style/StringLiteralsInInterpolation:
13
+ EnforcedStyle: double_quotes
14
+
15
+ Metrics/BlockLength:
16
+ Exclude:
17
+ - 'spec/**/*'
18
+ - '*.gemspec'
19
+
20
+ Metrics/MethodLength:
21
+ Max: 25
22
+
23
+ Layout/LineLength:
24
+ Max: 120
25
+
26
+ Style/OneClassPerFile:
27
+ Exclude:
28
+ - 'spec/**/*'
29
+ - 'examples/**/*'
30
+
31
+ Lint/UnusedBlockArgument:
32
+ Exclude:
33
+ - 'spec/**/*'
34
+
35
+ Naming/VariableNumber:
36
+ Exclude:
37
+ - 'spec/**/*'
38
+ - 'examples/**/*'
39
+
40
+ AllCops:
41
+ Exclude:
42
+ - 'internal/**/*'
43
+
44
+ Metrics/ClassLength:
45
+ Max: 130
46
+
47
+ Metrics/AbcSize:
48
+ Max: 30
49
+
50
+ Metrics/ParameterLists:
51
+ Max: 11
52
+ MaxOptionalParameters: 9
53
+
54
+ Style/FormatStringToken:
55
+ Enabled: false
data/CHANGELOG.md ADDED
@@ -0,0 +1,76 @@
1
+ # Changelog
2
+
3
+ ## 0.2.0 (2026-03-23)
4
+
5
+ Contracts for LLM quality. Know which model to use, what it costs, and when accuracy drops.
6
+
7
+ ### Breaking changes
8
+
9
+ - **`report.results` returns `CaseResult` objects** instead of hashes. Use `result.name`, `result.passed?`, `result.score` instead of `result[:case_name]`, `result[:passed]`. `CaseResult#to_h` is available for backward compatibility.
10
+ - **`report.print_summary`** replaces `report.pretty_print` (avoids shadowing `Kernel#pretty_print`).
11
+
12
+ ### Features
13
+
14
+ - **`add_case` in `define_eval`** — `add_case "billing", input: "...", expected: { priority: "high" }` with partial matching. Supports `expected_traits:` for regex/range matching.
15
+ - **`CaseResult` value objects** — `result.name`, `result.passed?`, `result.output`, `result.expected`, `result.mismatches` (structured diff), `result.cost`, `result.duration_ms`.
16
+ - **`report.failures`** — returns only failed cases. `report.skipped` counts skipped (offline) cases.
17
+ - **Model comparison** — `Step.compare_models("eval", models: %w[nano mini full])` runs the same eval across models. Returns a table with score/cost/latency per model. `comparison.best_for(min_score: 0.95)` returns the cheapest model meeting the threshold.
18
+ - **Cost tracking** — `report.total_cost`, `report.avg_latency_ms`, per-case `result.cost`. Pipeline eval uses total pipeline cost, not just last step.
19
+ - **Cost prediction** — `Step.estimate_cost(input:, model:)` and `Step.estimate_eval_cost("eval", models: [...])` predict spend before API calls.
20
+ - **CI gating** — `pass_eval("regression").with_minimum_score(0.8).with_maximum_cost(0.01)`. RakeTask with suite-level `minimum_score` and `maximum_cost`.
21
+ - **`RubyLLM::Contract.run_all_evals`** — discovers all Steps/Pipelines with evals, runs them all. Includes inherited evals.
22
+ - **`RubyLLM::Contract::RakeTask`** — `rake ruby_llm_contract:eval` with `minimum_score`, `maximum_cost`, `fail_on_empty`, `eval_dirs`.
23
+ - **Rails Railtie** — auto-loads eval files via `config.after_initialize` + `config.to_prepare` (supports development reload).
24
+ - **Offline mode** — cases without an adapter return `:skipped` instead of crashing. Skipped cases are excluded from score/passed.
25
+ - **Safe `define_eval`** — warns on duplicate name; suppressed during reload.
26
+
27
+ ### Fixes
28
+
29
+ - **P1: Eval files not autoloaded by Rails** — Railtie uses `load` (not Zeitwerk). Hooks into reloader for dev.
30
+ - **P2: report.results returns raw Hashes** — now returns `CaseResult` objects.
31
+ - **P3: No way to run all evals at once** — `Contract.run_all_evals` + Rake task.
32
+ - **P4: String vs symbol key mismatch** — warns when `validate` or `verify` proc returns nil.
33
+ - **Pipeline eval cost** — uses `Pipeline::Trace#total_cost` (all steps), not just last step.
34
+ - **Reload lifecycle** — `load_evals!` clears definitions before re-loading. Registry filters stale hosts.
35
+ - **Adapter isolation** — `compare_models` and `run_all_own_evals` deep-dup context per run.
36
+
37
+ ### Verified with real API
38
+
39
+ ```
40
+ Model Score Cost Avg Latency
41
+ ---------------------------------------------------------
42
+ gpt-4.1-nano 0.67 $0.000032 687ms
43
+ gpt-4.1-mini 1.00 $0.000102 1070ms
44
+ ```
45
+
46
+ ### Stats
47
+
48
+ - 1077 tests, 0 failures
49
+ - 3 architecture review rounds, 32 findings fixed
50
+ - Verified with real OpenAI API (gpt-4.1-nano, gpt-4.1-mini)
51
+
52
+ ## 0.1.0 (2026-03-20)
53
+
54
+ Initial release.
55
+
56
+ ### Features
57
+
58
+ - **Step abstraction** — `RubyLLM::Contract::Step::Base` with prompt DSL, typed input/output
59
+ - **Output schema** — declarative structure via ruby_llm-schema, sent to provider for enforcement
60
+ - **Validate** — business logic checks (1-arity and 2-arity with input cross-validation)
61
+ - **Retry with model escalation** — start cheap, auto-escalate on contract failure or network error
62
+ - **Preflight limits** — `max_input`, `max_cost`, `max_output` refuse before calling the LLM
63
+ - **Pipeline** — multi-step composition with fail-fast, timeout, token budget
64
+ - **Eval** — offline contract verification with `define_eval`, `run_eval`, zero-verify auto-case
65
+ - **Adapters** — RubyLLM (production), Test (deterministic specs)
66
+ - **RSpec matchers** — `satisfy_contract`, `pass_eval`
67
+ - **Structured trace** — model, latency, tokens, cost, attempt log per step
68
+
69
+ ### Robustness
70
+
71
+ - 1005 tests, 0 failures
72
+ - 42 bugs found and fixed via 10 rounds of adversarial testing
73
+ - 0 RuboCop offenses
74
+ - Parser handles: markdown code fences, UTF-8 BOM, JSON extraction from prose
75
+ - SchemaValidator: full nested validation, additionalProperties, minItems/maxItems, minLength/maxLength
76
+ - Deep-frozen parsed_output prevents mutation via shared references
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ group :development, :test do
8
+ gem "rake", "~> 13.0"
9
+ gem "rspec", "~> 3.13"
10
+ gem "rubocop", "~> 1.75"
11
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,176 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ ruby_llm-contract (0.2.0)
5
+ dry-types (~> 1.7)
6
+ ruby_llm (~> 1.0)
7
+ ruby_llm-schema (~> 0.3)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ addressable (2.8.9)
13
+ public_suffix (>= 2.0.2, < 8.0)
14
+ ast (2.4.3)
15
+ base64 (0.3.0)
16
+ bigdecimal (4.0.1)
17
+ concurrent-ruby (1.3.6)
18
+ diff-lcs (1.6.2)
19
+ dry-core (1.2.0)
20
+ concurrent-ruby (~> 1.0)
21
+ logger
22
+ zeitwerk (~> 2.6)
23
+ dry-inflector (1.3.1)
24
+ dry-logic (1.6.0)
25
+ bigdecimal
26
+ concurrent-ruby (~> 1.0)
27
+ dry-core (~> 1.1)
28
+ zeitwerk (~> 2.6)
29
+ dry-types (1.9.1)
30
+ bigdecimal (>= 3.0)
31
+ concurrent-ruby (~> 1.0)
32
+ dry-core (~> 1.0)
33
+ dry-inflector (~> 1.0)
34
+ dry-logic (~> 1.4)
35
+ zeitwerk (~> 2.6)
36
+ event_stream_parser (1.0.0)
37
+ faraday (2.14.1)
38
+ faraday-net_http (>= 2.0, < 3.5)
39
+ json
40
+ logger
41
+ faraday-multipart (1.2.0)
42
+ multipart-post (~> 2.0)
43
+ faraday-net_http (3.4.2)
44
+ net-http (~> 0.5)
45
+ faraday-retry (2.4.0)
46
+ faraday (~> 2.0)
47
+ json (2.19.2)
48
+ json-schema (6.2.0)
49
+ addressable (~> 2.8)
50
+ bigdecimal (>= 3.1, < 5)
51
+ language_server-protocol (3.17.0.5)
52
+ lint_roller (1.1.0)
53
+ logger (1.7.0)
54
+ marcel (1.1.0)
55
+ mcp (0.9.0)
56
+ json-schema (>= 4.1)
57
+ multipart-post (2.4.1)
58
+ net-http (0.9.1)
59
+ uri (>= 0.11.1)
60
+ parallel (1.27.0)
61
+ parser (3.3.10.2)
62
+ ast (~> 2.4.1)
63
+ racc
64
+ prism (1.9.0)
65
+ public_suffix (7.0.5)
66
+ racc (1.8.1)
67
+ rainbow (3.1.1)
68
+ rake (13.3.1)
69
+ regexp_parser (2.11.3)
70
+ rspec (3.13.2)
71
+ rspec-core (~> 3.13.0)
72
+ rspec-expectations (~> 3.13.0)
73
+ rspec-mocks (~> 3.13.0)
74
+ rspec-core (3.13.6)
75
+ rspec-support (~> 3.13.0)
76
+ rspec-expectations (3.13.5)
77
+ diff-lcs (>= 1.2.0, < 2.0)
78
+ rspec-support (~> 3.13.0)
79
+ rspec-mocks (3.13.8)
80
+ diff-lcs (>= 1.2.0, < 2.0)
81
+ rspec-support (~> 3.13.0)
82
+ rspec-support (3.13.7)
83
+ rubocop (1.85.1)
84
+ json (~> 2.3)
85
+ language_server-protocol (~> 3.17.0.2)
86
+ lint_roller (~> 1.1.0)
87
+ mcp (~> 0.6)
88
+ parallel (~> 1.10)
89
+ parser (>= 3.3.0.2)
90
+ rainbow (>= 2.2.2, < 4.0)
91
+ regexp_parser (>= 2.9.3, < 3.0)
92
+ rubocop-ast (>= 1.49.0, < 2.0)
93
+ ruby-progressbar (~> 1.7)
94
+ unicode-display_width (>= 2.4.0, < 4.0)
95
+ rubocop-ast (1.49.1)
96
+ parser (>= 3.3.7.2)
97
+ prism (~> 1.7)
98
+ ruby-progressbar (1.13.0)
99
+ ruby_llm (1.14.0)
100
+ base64
101
+ event_stream_parser (~> 1)
102
+ faraday (>= 1.10.0)
103
+ faraday-multipart (>= 1)
104
+ faraday-net_http (>= 1)
105
+ faraday-retry (>= 1)
106
+ marcel (~> 1)
107
+ ruby_llm-schema (~> 0)
108
+ zeitwerk (~> 2)
109
+ ruby_llm-schema (0.3.0)
110
+ unicode-display_width (3.2.0)
111
+ unicode-emoji (~> 4.1)
112
+ unicode-emoji (4.2.0)
113
+ uri (1.1.1)
114
+ zeitwerk (2.7.5)
115
+
116
+ PLATFORMS
117
+ arm64-darwin-25
118
+ ruby
119
+
120
+ DEPENDENCIES
121
+ rake (~> 13.0)
122
+ rspec (~> 3.13)
123
+ rubocop (~> 1.75)
124
+ ruby_llm-contract!
125
+
126
+ CHECKSUMS
127
+ addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
128
+ ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
129
+ base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
130
+ bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
131
+ concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
132
+ diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
133
+ dry-core (1.2.0) sha256=0cc5a7da88df397f153947eeeae42e876e999c1e30900f3c536fb173854e96a1
134
+ dry-inflector (1.3.1) sha256=7fb0c2bb04f67638f25c52e7ba39ab435d922a3a5c3cd196120f63accb682dcc
135
+ dry-logic (1.6.0) sha256=da6fedbc0f90fc41f9b0cc7e6f05f5d529d1efaef6c8dcc8e0733f685745cea2
136
+ dry-types (1.9.1) sha256=baebeecdb9f8395d6c9d227b62011279440943e3ef2468fe8ccc1ba11467f178
137
+ event_stream_parser (1.0.0) sha256=a2683bab70126286f8184dc88f7968ffc4028f813161fb073ec90d171f7de3c8
138
+ faraday (2.14.1) sha256=a43cceedc1e39d188f4d2cdd360a8aaa6a11da0c407052e426ba8d3fb42ef61c
139
+ faraday-multipart (1.2.0) sha256=7d89a949693714176f612323ca13746a2ded204031a6ba528adee788694ef757
140
+ faraday-net_http (3.4.2) sha256=f147758260d3526939bf57ecf911682f94926a3666502e24c69992765875906c
141
+ faraday-retry (2.4.0) sha256=7b79c48fb7e56526faf247b12d94a680071ff40c9fda7cf1ec1549439ad11ebe
142
+ json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
143
+ json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
144
+ language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
145
+ lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
146
+ logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
147
+ marcel (1.1.0) sha256=fdcfcfa33cc52e93c4308d40e4090a5d4ea279e160a7f6af988260fa970e0bee
148
+ mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
149
+ multipart-post (2.4.1) sha256=9872d03a8e552020ca096adadbf5e3cb1cd1cdd6acd3c161136b8a5737cdb4a8
150
+ net-http (0.9.1) sha256=25ba0b67c63e89df626ed8fac771d0ad24ad151a858af2cc8e6a716ca4336996
151
+ parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
152
+ parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
153
+ prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
154
+ public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
155
+ racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
156
+ rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
157
+ rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
158
+ regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
159
+ rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
160
+ rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
161
+ rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
162
+ rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
163
+ rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
164
+ rubocop (1.85.1) sha256=3dbcf9e961baa4c376eeeb2a03913dca5e3987033b04d38fa538aa1e7406cc77
165
+ rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
166
+ ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
167
+ ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
168
+ ruby_llm-contract (0.2.0)
169
+ ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
170
+ unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
171
+ unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
172
+ uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
173
+ zeitwerk (2.7.5) sha256=d8da92128c09ea6ec62c949011b00ed4a20242b255293dd66bf41545398f73dd
174
+
175
+ BUNDLED WITH
176
+ 4.0.3
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Justyna
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,154 @@
1
+ # ruby_llm-contract
2
+
3
+ Contracts for LLM quality. Know which model to use, what it costs, and when accuracy drops.
4
+
5
+ Companion gem for [ruby_llm](https://github.com/crmne/ruby_llm).
6
+
7
+ ## The problem
8
+
9
+ You call an LLM. It returns bad JSON, wrong values, or costs 4x more than it should. You switch models and quality drops silently. You have no data to decide which model to use.
10
+
11
+ ## The fix
12
+
13
+ ```ruby
14
+ class ClassifyTicket < RubyLLM::Contract::Step::Base
15
+ prompt do
16
+ system "You are a support ticket classifier."
17
+ rule "Return valid JSON only, no markdown."
18
+ rule "Use exactly one priority: low, medium, high, urgent."
19
+ example input: "My invoice is wrong", output: '{"priority": "high"}'
20
+ user "{input}"
21
+ end
22
+
23
+ output_schema do
24
+ string :priority, enum: %w[low medium high urgent]
25
+ string :category
26
+ end
27
+
28
+ validate("urgent needs justification") { |o, input| o[:priority] != "urgent" || input.length > 20 }
29
+ retry_policy models: %w[gpt-4.1-nano gpt-4.1-mini gpt-4.1]
30
+ end
31
+
32
+ result = ClassifyTicket.run("I was charged twice")
33
+ result.ok? # => true
34
+ result.parsed_output # => {priority: "high", category: "billing"}
35
+ result.trace[:cost] # => 0.000032
36
+ result.trace[:model] # => "gpt-4.1-nano"
37
+ ```
38
+
39
+ Bad JSON? Auto-retry. Wrong value? Escalate to a smarter model. Schema violated? Caught client-side even if the provider ignores it. All with cost tracking.
40
+
41
+ ## Which model should I use?
42
+
43
+ Define test cases. Compare models. Get data.
44
+
45
+ ```ruby
46
+ ClassifyTicket.define_eval("regression") do
47
+ add_case "billing", input: "I was charged twice", expected: { priority: "high" }
48
+ add_case "feature", input: "Add dark mode please", expected: { priority: "low" }
49
+ add_case "outage", input: "Database is down", expected: { priority: "urgent" }
50
+ end
51
+
52
+ comparison = ClassifyTicket.compare_models("regression",
53
+ models: %w[gpt-4.1-nano gpt-4.1-mini])
54
+ ```
55
+
56
+ Real output from real API calls:
57
+
58
+ ```
59
+ Model Score Cost Avg Latency
60
+ ---------------------------------------------------------
61
+ gpt-4.1-nano 0.67 $0.000032 687ms
62
+ gpt-4.1-mini 1.00 $0.000102 1070ms
63
+
64
+ Cheapest at 100%: gpt-4.1-mini
65
+ ```
66
+
67
+ ```ruby
68
+ comparison.best_for(min_score: 0.95) # => "gpt-4.1-mini"
69
+
70
+ # Inspect failures
71
+ comparison.reports["gpt-4.1-nano"].failures.each do |f|
72
+ puts "#{f.name}: expected #{f.expected}, got #{f.output}"
73
+ puts " mismatches: #{f.mismatches}"
74
+ # => outage: expected {priority: "urgent"}, got {priority: "high"}
75
+ # mismatches: {priority: {expected: "urgent", got: "high"}}
76
+ end
77
+ ```
78
+
79
+ ## Pipeline
80
+
81
+ Chain steps with fail-fast. A hallucination in step 1 stops the pipeline before step 2 spends tokens.
82
+
83
+ ```ruby
84
+ class TicketPipeline < RubyLLM::Contract::Pipeline::Base
85
+ step ClassifyTicket, as: :classify
86
+ step RouteToTeam, as: :route
87
+ step DraftResponse, as: :draft
88
+ end
89
+
90
+ result = TicketPipeline.run("I was charged twice")
91
+ result.ok? # => true
92
+ result.outputs_by_step[:classify] # => {priority: "high", category: "billing"}
93
+ result.trace.total_cost # => 0.000128
94
+ ```
95
+
96
+ ## CI gate
97
+
98
+ ```ruby
99
+ # RSpec — block merge if accuracy drops or cost spikes
100
+ expect(ClassifyTicket).to pass_eval("regression")
101
+ .with_context(model: "gpt-4.1-mini")
102
+ .with_minimum_score(0.8)
103
+ .with_maximum_cost(0.01)
104
+
105
+ # Rake — run all evals across all steps
106
+ require "ruby_llm/contract/rake_task"
107
+ RubyLLM::Contract::RakeTask.new do |t|
108
+ t.minimum_score = 0.8
109
+ t.maximum_cost = 0.05
110
+ end
111
+ # bundle exec rake ruby_llm_contract:eval
112
+ ```
113
+
114
+ ## Predict cost before running
115
+
116
+ ```ruby
117
+ ClassifyTicket.estimate_eval_cost("regression", models: %w[gpt-4.1-nano gpt-4.1-mini])
118
+ # => { "gpt-4.1-nano" => 0.000024, "gpt-4.1-mini" => 0.000096 }
119
+ ```
120
+
121
+ ## Install
122
+
123
+ ```ruby
124
+ gem "ruby_llm-contract"
125
+ ```
126
+
127
+ ```ruby
128
+ RubyLLM.configure { |c| c.openai_api_key = ENV["OPENAI_API_KEY"] }
129
+ RubyLLM::Contract.configure { |c| c.default_model = "gpt-4.1-mini" }
130
+ ```
131
+
132
+ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc.).
133
+
134
+ ## Docs
135
+
136
+ | Guide | |
137
+ |-------|-|
138
+ | [Getting Started](docs/guide/getting_started.md) | Features walkthrough, model escalation, eval |
139
+ | [Best Practices](docs/guide/best_practices.md) | 6 patterns for bulletproof validates |
140
+ | [Output Schema](docs/guide/output_schema.md) | Full schema reference + constraints |
141
+ | [Pipeline](docs/guide/pipeline.md) | Multi-step composition, timeout, fail-fast |
142
+ | [Testing](docs/guide/testing.md) | Test adapter, RSpec matchers |
143
+
144
+ ## Roadmap
145
+
146
+ **v0.2 (current):** Model comparison, cost tracking, eval with `add_case`, CI gating, Rails Railtie.
147
+
148
+ **v0.3:** Regression baselines — compare eval results with previous run, detect quality drift.
149
+
150
+ **v0.4:** Auto-routing — learn which model works for which input pattern.
151
+
152
+ ## License
153
+
154
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec