ruby_llm-contract 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +55 -0
  4. data/CHANGELOG.md +76 -0
  5. data/Gemfile +11 -0
  6. data/Gemfile.lock +176 -0
  7. data/LICENSE +21 -0
  8. data/README.md +154 -0
  9. data/Rakefile +8 -0
  10. data/examples/00_basics.rb +500 -0
  11. data/examples/01_classify_threads.rb +220 -0
  12. data/examples/02_generate_comment.rb +203 -0
  13. data/examples/03_target_audience.rb +201 -0
  14. data/examples/04_real_llm.rb +410 -0
  15. data/examples/05_output_schema.rb +258 -0
  16. data/examples/07_keyword_extraction.rb +239 -0
  17. data/examples/08_translation.rb +353 -0
  18. data/examples/09_eval_dataset.rb +287 -0
  19. data/examples/10_reddit_full_showcase.rb +363 -0
  20. data/examples/README.md +140 -0
  21. data/lib/ruby_llm/contract/adapters/base.rb +13 -0
  22. data/lib/ruby_llm/contract/adapters/response.rb +17 -0
  23. data/lib/ruby_llm/contract/adapters/ruby_llm.rb +94 -0
  24. data/lib/ruby_llm/contract/adapters/test.rb +44 -0
  25. data/lib/ruby_llm/contract/adapters.rb +6 -0
  26. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +17 -0
  27. data/lib/ruby_llm/contract/concerns/eval_host.rb +109 -0
  28. data/lib/ruby_llm/contract/concerns/trace_equality.rb +15 -0
  29. data/lib/ruby_llm/contract/concerns/usage_aggregator.rb +43 -0
  30. data/lib/ruby_llm/contract/configuration.rb +21 -0
  31. data/lib/ruby_llm/contract/contract/definition.rb +39 -0
  32. data/lib/ruby_llm/contract/contract/invariant.rb +23 -0
  33. data/lib/ruby_llm/contract/contract/parser.rb +143 -0
  34. data/lib/ruby_llm/contract/contract/schema_validator.rb +239 -0
  35. data/lib/ruby_llm/contract/contract/validator.rb +104 -0
  36. data/lib/ruby_llm/contract/contract.rb +7 -0
  37. data/lib/ruby_llm/contract/cost_calculator.rb +38 -0
  38. data/lib/ruby_llm/contract/dsl.rb +13 -0
  39. data/lib/ruby_llm/contract/errors.rb +19 -0
  40. data/lib/ruby_llm/contract/eval/case_result.rb +76 -0
  41. data/lib/ruby_llm/contract/eval/contract_detail_builder.rb +47 -0
  42. data/lib/ruby_llm/contract/eval/dataset.rb +53 -0
  43. data/lib/ruby_llm/contract/eval/eval_definition.rb +112 -0
  44. data/lib/ruby_llm/contract/eval/evaluation_result.rb +27 -0
  45. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +20 -0
  46. data/lib/ruby_llm/contract/eval/evaluator/json_includes.rb +58 -0
  47. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +40 -0
  48. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +27 -0
  49. data/lib/ruby_llm/contract/eval/model_comparison.rb +80 -0
  50. data/lib/ruby_llm/contract/eval/pipeline_result_adapter.rb +15 -0
  51. data/lib/ruby_llm/contract/eval/report.rb +115 -0
  52. data/lib/ruby_llm/contract/eval/runner.rb +162 -0
  53. data/lib/ruby_llm/contract/eval/trait_evaluator.rb +75 -0
  54. data/lib/ruby_llm/contract/eval.rb +16 -0
  55. data/lib/ruby_llm/contract/pipeline/base.rb +62 -0
  56. data/lib/ruby_llm/contract/pipeline/result.rb +131 -0
  57. data/lib/ruby_llm/contract/pipeline/runner.rb +139 -0
  58. data/lib/ruby_llm/contract/pipeline/trace.rb +72 -0
  59. data/lib/ruby_llm/contract/pipeline.rb +6 -0
  60. data/lib/ruby_llm/contract/prompt/ast.rb +38 -0
  61. data/lib/ruby_llm/contract/prompt/builder.rb +47 -0
  62. data/lib/ruby_llm/contract/prompt/node.rb +25 -0
  63. data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +27 -0
  64. data/lib/ruby_llm/contract/prompt/nodes/rule_node.rb +15 -0
  65. data/lib/ruby_llm/contract/prompt/nodes/section_node.rb +26 -0
  66. data/lib/ruby_llm/contract/prompt/nodes/system_node.rb +15 -0
  67. data/lib/ruby_llm/contract/prompt/nodes/user_node.rb +15 -0
  68. data/lib/ruby_llm/contract/prompt/nodes.rb +7 -0
  69. data/lib/ruby_llm/contract/prompt/renderer.rb +76 -0
  70. data/lib/ruby_llm/contract/railtie.rb +20 -0
  71. data/lib/ruby_llm/contract/rake_task.rb +78 -0
  72. data/lib/ruby_llm/contract/rspec/pass_eval.rb +96 -0
  73. data/lib/ruby_llm/contract/rspec/satisfy_contract.rb +31 -0
  74. data/lib/ruby_llm/contract/rspec.rb +6 -0
  75. data/lib/ruby_llm/contract/step/base.rb +138 -0
  76. data/lib/ruby_llm/contract/step/dsl.rb +144 -0
  77. data/lib/ruby_llm/contract/step/limit_checker.rb +64 -0
  78. data/lib/ruby_llm/contract/step/result.rb +38 -0
  79. data/lib/ruby_llm/contract/step/retry_executor.rb +90 -0
  80. data/lib/ruby_llm/contract/step/retry_policy.rb +76 -0
  81. data/lib/ruby_llm/contract/step/runner.rb +126 -0
  82. data/lib/ruby_llm/contract/step/trace.rb +70 -0
  83. data/lib/ruby_llm/contract/step.rb +10 -0
  84. data/lib/ruby_llm/contract/token_estimator.rb +19 -0
  85. data/lib/ruby_llm/contract/types.rb +11 -0
  86. data/lib/ruby_llm/contract/version.rb +7 -0
  87. data/lib/ruby_llm/contract.rb +108 -0
  88. data/ruby_llm-contract.gemspec +33 -0
  89. metadata +172 -0
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# A named, ordered list of evaluation cases assembled through a block DSL.
#
#   Dataset.define("smoke") do
#     add_case "greets", input: { name: "Ada" }, expected: { greeting: "Hi Ada" }
#     add_case input: { name: "Bob" }, expected_traits: { polite: true }
#   end
class Dataset
  attr_reader :name, :cases

  # Convenience constructor: forwards the name and the DSL block to +new+.
  #
  # @param name [String] dataset label
  # @return [Dataset]
  def self.define(name = "unnamed", &block)
    new(name, &block)
  end

  # @param name [String] dataset label
  # @yield optional block run with +instance_eval+, so bare +add_case+
  #   calls inside it register cases on this dataset
  def initialize(name = "unnamed", &block)
    @name = name
    @cases = []
    instance_eval(&block) if block
  end

  # DSL entry point: builds a Case and appends it to the dataset.
  #
  # @param name [String, nil] case label; when omitted the label is
  #   "case_N", N being the 1-based position of the new case
  # @param input [Object] required input stored on the case
  # @param expected [Object, nil] stored on the case as-is
  # @param expected_traits [Object, nil] stored on the case as-is
  # @param evaluator [Object, nil] stored on the case as-is
  # @return [Array<Case>] the dataset's case list after the append
  def add_case(name = nil, input:, expected: nil, expected_traits: nil, evaluator: nil)
    label = name || "case_#{@cases.length + 1}"
    new_case = Case.new(
      name: label,
      input: input,
      expected: expected,
      expected_traits: expected_traits,
      evaluator: evaluator
    )
    @cases << new_case
  end
end
38
+
39
# Immutable record for a single evaluation case. Instances freeze
# themselves at the end of construction, so the attributes cannot be
# reassigned afterwards.
class Case
  # Label of the case.
  attr_reader :name
  # Input stored for the case.
  attr_reader :input
  # Optional expectation values (nil when not supplied).
  attr_reader :expected, :expected_traits
  # Optional evaluator object (nil when not supplied).
  attr_reader :evaluator

  # @param name [Object] required label
  # @param input [Object] required input
  # @param expected [Object, nil]
  # @param expected_traits [Object, nil]
  # @param evaluator [Object, nil]
  def initialize(name:, input:, expected: nil, expected_traits: nil, evaluator: nil)
    @evaluator = evaluator
    @expected_traits = expected_traits
    @expected = expected
    @input = input
    @name = name
    freeze
  end
end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# Collects the cases of one named eval through a block DSL and converts
# them into a Dataset. When a sample response is configured it can also
# build a test adapter that replays that response.
class EvalDefinition
  attr_reader :name, :cases

  # @param name [String] eval name; reused as the dataset name
  # @param step_class [Class, nil] when given and it responds to
  #   +output_schema+, sample responses are checked against that schema
  # @yield optional DSL block, evaluated with +instance_eval+
  def initialize(name, step_class: nil, &block)
    @name = name
    @step_class = step_class
    @default_input = nil
    @sample_response = nil
    @cases = []
    # Guarded: instance_eval without a block (and without a string) raises
    # ArgumentError, which made a block-less definition impossible.
    instance_eval(&block) if block
  end

  # DSL: sets the input used by cases that do not pass their own +input:+.
  def default_input(input)
    @default_input = input
  end

  # DSL: sets the canned response. When a step class was supplied the
  # response is validated against its schema immediately.
  #
  # @raise [ArgumentError] if the sample violates the step's output schema
  def sample_response(response)
    @sample_response = response
    pre_validate_sample! if @step_class
  end

  # @return [Adapters::Test, nil] adapter replaying the sample response
  #   (serialized with +to_json+ unless it is already a String), or nil
  #   when no sample response was set
  def build_adapter
    return nil unless @sample_response

    Adapters::Test.new(response: @sample_response.is_a?(String) ? @sample_response : @sample_response.to_json)
  end

  # DSL: registers a case with explicit expectation fields.
  #
  # @raise [ArgumentError] when neither +input:+ nor a default input exists
  def add_case(description, input: nil, expected: nil, expected_traits: nil, evaluator: nil)
    case_input = input || @default_input
    raise ArgumentError, "add_case requires input (set default_input or pass input:)" unless case_input

    @cases << {
      name: description,
      input: case_input,
      expected: expected,
      expected_traits: expected_traits,
      evaluator: evaluator
    }
  end

  # DSL: registers a case from a single expectation, given either
  # positionally or via +expect:+. A Proc becomes the case's evaluator;
  # any other value becomes its +expected+.
  #
  # @raise [ArgumentError] when both forms are given, when neither is
  #   given, or when no input is available
  def verify(description, expected_or_proc = nil, input: nil, expect: nil)
    if expected_or_proc && expect
      raise ArgumentError, "verify accepts either a positional argument or expect: keyword, not both"
    end

    expected_or_proc = expect if expect
    case_input = input || @default_input
    validate_verify_args!(expected_or_proc, case_input)

    evaluator = expected_or_proc.is_a?(::Proc) ? expected_or_proc : nil

    @cases << {
      name: description,
      input: case_input,
      expected: evaluator ? nil : expected_or_proc,
      evaluator: evaluator
    }
  end

  # @return [Dataset] dataset named after this eval, holding one case per
  #   entry of the effective case list
  def build_dataset
    # Locals are captured here because the block below runs with
    # instance_eval on the Dataset, where this object's ivars are not visible.
    eval_cases = effective_cases
    eval_name = @name
    Dataset.define(eval_name) do
      eval_cases.each do |eval_case|
        add_case(eval_case[:name], input: eval_case[:input], expected: eval_case[:expected],
                                   expected_traits: eval_case[:expected_traits],
                                   evaluator: eval_case[:evaluator])
      end
    end
  end

  private

  # Registered cases, or a single "contract check" case built from the
  # default input when nothing was registered, or [] when there is no
  # default input either.
  def effective_cases
    return @cases if @cases.any?
    return [] unless @default_input

    # Zero-verify: auto-add a contract check case
    [{ name: "contract check", input: @default_input, expected: nil, evaluator: nil }]
  end

  def validate_verify_args!(expected_or_proc, case_input)
    raise ArgumentError, "verify requires either a positional argument or expect: keyword" unless expected_or_proc
    raise ArgumentError, "verify requires input (set default_input or pass input:)" unless case_input
  end

  # Raises ArgumentError when the sample response fails the step's output
  # schema. Does nothing when the step exposes no schema or when the
  # sample is not parseable JSON.
  def pre_validate_sample!
    schema = @step_class.respond_to?(:output_schema) ? @step_class.output_schema : nil
    return unless schema

    errors = validate_sample_against_schema(schema)
    return if errors.empty?

    raise ArgumentError, "sample_response does not satisfy step schema: #{errors.join(", ")}"
  rescue JSON::ParserError
    # Not JSON -- skip pre-validation
  end

  # Hash samples are used directly; anything else is stringified and
  # parsed as JSON before validation.
  def validate_sample_against_schema(schema)
    response_hash = @sample_response.is_a?(Hash) ? @sample_response : JSON.parse(@sample_response.to_s)
    symbolized = Parser.symbolize_keys(response_hash)
    SchemaValidator.validate(symbolized, schema)
  end
end
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# Frozen outcome of evaluating one output: a score clamped to 0.0..1.0,
# a pass flag, a display label and optional free-form details.
class EvaluationResult
  attr_reader :score, :passed, :label, :details

  # @param score [#to_f] converted with +to_f+ and clamped into 0.0..1.0
  # @param passed [Object] truthiness decides pass/fail
  # @param label [String, nil] defaults to "PASS" or "FAIL" based on +passed+
  # @param details [Object, nil] optional explanation shown by +to_s+
  def initialize(score:, passed:, label: nil, details: nil)
    numeric = score.to_f
    @score = numeric.clamp(0.0, 1.0)
    @passed = passed
    @details = details
    @label = label
    @label ||= passed ? "PASS" : "FAIL"
    freeze
  end

  # @return [Boolean] true when +passed+ is falsy
  def failed?
    @passed ? false : true
  end

  # @return [String] label and score, followed by the details when present
  def to_s
    suffix = @details ? " — #{@details}" : ""
    "#{@label} (score: #{@score}#{suffix})"
  end
end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ module Evaluator
7
# Evaluator that passes only when the output equals the expectation
# under +==+.
class Exact
  # @param output [Object] value produced by the step
  # @param expected [Object] value it is compared against
  # @param input [Object, nil] accepted for interface parity; unused
  # @return [EvaluationResult] score 1.0 on equality, otherwise 0.0 with
  #   both values inspected in the details
  def call(output:, expected:, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    return EvaluationResult.new(score: 1.0, passed: true, details: "exact match") if output == expected

    mismatch = "expected #{expected.inspect}, got #{output.inspect}"
    EvaluationResult.new(score: 0.0, passed: false, details: mismatch)
  end
end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ module Evaluator
7
# Evaluator that checks the output Hash contains every key of the
# expected Hash with a matching value. Extra keys in the output are
# ignored. Regexp expectations are matched against the value's +to_s+.
class JsonIncludes
  # @param output [Object] value produced by the step
  # @param expected [Object] Hash of key => expected value or Regexp
  # @param input [Object, nil] accepted for interface parity; unused
  # @return [EvaluationResult] passes when every expected key matches;
  #   otherwise the score is the fraction of expected keys that matched
  #   and the details list each problem joined by "; "
  def call(output:, expected:, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    return type_error(output, expected) unless output.is_a?(Hash) && expected.is_a?(Hash)

    errors = check_keys(output, expected)
    build_result(errors, expected.length)
  end

  private

  # Returns one error message per expected key that is not satisfied.
  def check_keys(output, expected)
    expected.filter_map do |key, expected_value|
      check_single_key(output, key, expected_value)
    end
  end

  # Returns nil when the key is satisfied, otherwise an error message.
  def check_single_key(output, key, expected_value)
    actual = output[key]
    # An explicit nil expectation is met by a key that is present with a
    # nil value; previously this was always reported as a missing key.
    return nil if expected_value.nil? && actual.nil? && output.key?(key)
    return "missing key: #{key}" if actual.nil?

    if expected_value.is_a?(::Regexp)
      mismatch_message(key, expected_value, actual) unless actual.to_s.match?(expected_value)
    elsif actual != expected_value
      mismatch_message(key, expected_value, actual)
    end
  end

  def mismatch_message(key, expected_value, actual)
    "#{key}: expected #{expected_value.inspect}, got #{actual.inspect}"
  end

  def build_result(errors, total)
    if errors.empty?
      return EvaluationResult.new(score: 1.0, passed: true,
                                  details: "all expected keys present and matching")
    end

    # At most one error is produced per expected key, so a non-empty
    # error list implies total >= 1 and the division is safe.
    matched = total - errors.length
    EvaluationResult.new(score: matched.to_f / total, passed: false, details: errors.join("; "))
  end

  def type_error(output, expected)
    EvaluationResult.new(score: 0.0, passed: false,
                         details: "expected Hash, got #{output.class} and #{expected.class}")
  end
end
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ module Evaluator
7
# Evaluator that delegates the verdict to a user-supplied callable and
# translates whatever it returns into an EvaluationResult.
class ProcEvaluator
  # @param callable [#call, #arity, #parameters] proc or lambda taking
  #   the output, and optionally the input as a second argument
  def initialize(callable)
    @callable = callable
  end

  # @param output [Object] passed to the callable as first argument
  # @param expected [Object, nil] accepted for interface parity; unused
  # @param input [Object, nil] passed as second argument when the callable
  #   takes two parameters
  # @return [EvaluationResult] true/false map to 1.0/0.0; a Numeric is
  #   used as the score and passes at 0.5 or above; any other value passes
  #   when truthy
  def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument,Metrics
    verdict = wants_input? ? @callable.call(output, input) : @callable.call(output)
    warn_about_nil if verdict.nil?
    to_result(verdict)
  end

  private

  # True when the callable has arity 2, or a negative arity with at
  # least two declared parameters.
  def wants_input?
    arity = @callable.arity
    return true if arity == 2

    arity.negative? && @callable.parameters.length >= 2
  end

  # A nil verdict is still turned into a failing result; this only warns.
  def warn_about_nil
    warn "[ruby_llm-contract] verify/evaluator proc returned nil. " \
         "This usually means a key mismatch (string vs symbol). " \
         "Output keys are always symbols."
  end

  def to_result(verdict)
    case verdict
    when true then EvaluationResult.new(score: 1.0, passed: true, details: "passed")
    when false then EvaluationResult.new(score: 0.0, passed: false, details: "not passed")
    when Numeric
      EvaluationResult.new(score: verdict, passed: verdict >= 0.5, details: "custom score: #{verdict}")
    else
      truthy = verdict ? true : false
      EvaluationResult.new(score: truthy ? 1.0 : 0.0, passed: truthy, details: "custom: #{verdict}")
    end
  end
end
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ module Evaluator
7
# Evaluator that passes when the output text matches a pattern. For Hash
# outputs the text is the hash's values joined by single spaces; for
# anything else it is +to_s+ of the output.
class Regex
  # @param pattern [Regexp, String] a Regexp is used as-is; anything else
  #   is compiled with +Regexp.new+
  def initialize(pattern)
    @pattern = case pattern
               when ::Regexp then pattern
               else ::Regexp.new(pattern)
               end
  end

  # @param output [Object] value produced by the step
  # @param expected [Object, nil] accepted for interface parity; unused
  # @param input [Object, nil] accepted for interface parity; unused
  # @return [EvaluationResult] score 1.0 on a match, 0.0 otherwise
  def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    haystack = output.is_a?(Hash) ? output.values.join(" ") : output.to_s
    matched = haystack.match?(@pattern)
    verdict = matched ? "matches" : "does not match"

    EvaluationResult.new(score: matched ? 1.0 : 0.0, passed: matched,
                         details: "#{verdict} #{@pattern.inspect}")
  end
end
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# Immutable side-by-side view of eval reports keyed by model name, with
# helpers for choosing a model by score and cost and for rendering a
# plain-text summary.
class ModelComparison
  attr_reader :eval_name, :reports

  # @param eval_name [String] name shown in the printed summary
  # @param reports [Hash{String => Report}] one report per model name
  def initialize(eval_name:, reports:)
    @eval_name = eval_name
    # Freeze a private copy: freezing the argument itself would make the
    # caller's hash immutable as a side effect of constructing this object.
    @reports = reports.dup.freeze # { "model_name" => Report }
    freeze
  end

  # @return [Array] model names in insertion order
  def models
    @reports.keys
  end

  # @return [Numeric, nil] the model's score, or nil for an unknown model
  def score_for(model)
    @reports[model]&.score
  end

  # @return [Numeric, nil] the model's total cost, or nil for an unknown model
  def cost_for(model)
    @reports[model]&.total_cost
  end

  # Cheapest model among those with a non-zero score of at least +min_score+.
  #
  # @param min_score [Numeric] minimum acceptable score
  # @return [Object, nil] model name, or nil when no model qualifies
  def best_for(min_score: 0.0)
    eligible = @reports.select { |_, report| report.score > 0.0 && report.score >= min_score }
    return nil if eligible.empty?

    eligible.min_by { |_, report| report.total_cost }.first
  end

  # @return [Hash] model name => total cost divided by score;
  #   Float::INFINITY for models whose score is not positive
  def cost_per_point
    @reports.transform_values do |report|
      report.score.positive? ? report.total_cost / report.score : Float::INFINITY
    end
  end

  # @return [String] header, separator and one formatted row per model;
  #   cost and latency show "n/a" when unavailable
  def table
    lines = [" Model Score Cost Avg Latency"]
    lines << " #{"-" * 57}"

    @reports.each do |model, report|
      latency = report.avg_latency_ms ? "#{report.avg_latency_ms.round}ms" : "n/a"
      cost = report.total_cost.positive? ? "$#{format("%.4f", report.total_cost)}" : "n/a"
      lines << format(" %-25s %6.2f %10s %12s", model, report.score, cost, latency)
    end

    lines.join("\n")
  end

  # Writes the table plus the +best_for+ picks at min_score 0.0 and 1.0.
  #
  # @param io [#puts] destination stream
  def print_summary(io = $stdout)
    io.puts "#{@eval_name} — model comparison"
    io.puts
    io.puts table
    io.puts

    best = best_for(min_score: 0.0)
    io.puts " Best overall: #{best}" if best

    cheapest_passing = best_for(min_score: 1.0)
    io.puts " Cheapest at 100%: #{cheapest_passing}" if cheapest_passing
  end

  # @return [Hash] model name => summary hash with :score, :total_cost,
  #   :avg_latency_ms, :pass_rate and :passed
  def to_h
    @reports.transform_values do |report|
      {
        score: report.score,
        total_cost: report.total_cost,
        avg_latency_ms: report.avg_latency_ms,
        pass_rate: report.pass_rate,
        passed: report.passed?
      }
    end
  end
end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# Struct-backed stand-in that lets a Pipeline::Result be handled like a
# Step::Result. Introduced to replace the OpenStruct previously used in
# Runner#normalize_pipeline_result.
#
# Members: status, ok_flag, parsed_output, validation_errors, trace.
PipelineResultAdapter = Struct.new(:status, :ok_flag, :parsed_output, :validation_errors, :trace) do
  # Predicate form of the stored +ok_flag+ member.
  #
  # @return [Object] the value held in +ok_flag+
  def ok?
    self[:ok_flag]
  end
end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# Immutable summary of one eval run over a dataset. Results whose
# +step_status+ is :skipped are excluded from score and pass/fail counts
# but still contribute to cost, latency and the printed listing.
class Report
  # Detail strings that carry no information beyond pass/fail and are
  # therefore left out of printed output.
  GENERIC_DETAILS = ["passed", "not passed"].freeze

  attr_reader :dataset_name, :results

  # @param dataset_name [String] name used in the summary line
  # @param results [Array] per-case results; each is expected to respond to
  #   name, score, passed?, failed?, step_status, cost, duration_ms,
  #   label and details
  def initialize(dataset_name:, results:)
    @dataset_name = dataset_name
    # Freeze a private copy: freezing the argument itself would make the
    # caller's array immutable as a side effect of constructing this object.
    @results = results.dup.freeze
    freeze
  end

  # @return [Float] mean score of the non-skipped results, 0.0 when
  #   there are none
  def score
    evaluated = evaluated_results
    return 0.0 if evaluated.empty?

    # Summing from 0.0 keeps the division in floating point even when
    # every individual score is an Integer.
    evaluated.sum(0.0, &:score) / evaluated.length
  end

  # @return [Integer] number of non-skipped results that passed
  def passed
    evaluated_results.count(&:passed?)
  end

  # @return [Integer] number of non-skipped results that failed
  def failed
    evaluated_results.count(&:failed?)
  end

  # @return [Integer] number of results with step_status :skipped
  def skipped
    results.count { |r| r.step_status == :skipped }
  end

  # @return [Array] the non-skipped results that failed
  def failures
    evaluated_results.select(&:failed?)
  end

  # @return [String] "passed/evaluated", e.g. "3/4"
  def pass_rate
    "#{passed}/#{evaluated_results.length}"
  end

  # @return [Numeric] sum of all result costs, treating a nil cost as 0.0
  def total_cost
    results.sum { |r| r.cost || 0.0 }
  end

  # @return [Float, nil] mean of the non-nil durations, nil when no
  #   result has one
  def avg_latency_ms
    latencies = results.filter_map(&:duration_ms)
    return nil if latencies.empty?

    latencies.sum.to_f / latencies.length
  end

  # @return [Boolean] true only when at least one result was evaluated
  #   and every evaluated result passed
  def passed?
    evaluated = evaluated_results
    return false if evaluated.empty?

    evaluated.all?(&:passed?)
  end

  # Iterates over every result, skipped ones included.
  def each(&block)
    results.each(&block)
  end

  # @return [String] one-line summary; the skipped count and total cost
  #   are appended only when positive
  def summary
    parts = ["#{dataset_name}: #{pass_rate} checks passed"]
    parts << "#{skipped} skipped" if skipped.positive?
    parts << format_cost(total_cost) if total_cost.positive?
    parts.join(", ")
  end

  # @return [String] the summary followed by one line per failure
  def to_s
    lines = [summary]
    failures.each do |result|
      lines << format_failure(result)
    end
    lines.join("\n")
  end

  # Writes the summary and one line per result; failed results with
  # informative details get an extra detail line.
  #
  # @param io [#puts] destination stream
  def print_summary(io = $stdout)
    io.puts summary
    io.puts
    results.each do |result|
      icon = result.label
      cost_str = result.cost ? " #{format_cost(result.cost)}" : ""
      latency_str = result.duration_ms ? " #{result.duration_ms}ms" : ""
      io.puts " #{icon} #{result.name}#{cost_str}#{latency_str}"
      io.puts " #{result.details}" if result.failed? && useful_details?(result.details)
    end
  end

  private

  def format_failure(result)
    line = " FAIL #{result.name}"
    line += ": #{result.details}" if useful_details?(result.details)
    line
  end

  # Details are worth printing when present and not one of the generic
  # pass/fail strings.
  def useful_details?(details)
    details && !GENERIC_DETAILS.include?(details)
  end

  def evaluated_results
    results.reject { |r| r.step_status == :skipped }
  end

  def format_cost(cost)
    "$#{format("%.6f", cost)}"
  end
end
113
+ end
114
+ end
115
+ end