ruby_llm-tribunal 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +32 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +442 -0
  5. data/lib/ruby_llm/tribunal/assertions/deterministic.rb +259 -0
  6. data/lib/ruby_llm/tribunal/assertions/embedding.rb +90 -0
  7. data/lib/ruby_llm/tribunal/assertions/judge.rb +152 -0
  8. data/lib/ruby_llm/tribunal/assertions.rb +141 -0
  9. data/lib/ruby_llm/tribunal/configuration.rb +38 -0
  10. data/lib/ruby_llm/tribunal/dataset.rb +118 -0
  11. data/lib/ruby_llm/tribunal/eval_helpers.rb +288 -0
  12. data/lib/ruby_llm/tribunal/judge.rb +166 -0
  13. data/lib/ruby_llm/tribunal/judges/bias.rb +79 -0
  14. data/lib/ruby_llm/tribunal/judges/correctness.rb +68 -0
  15. data/lib/ruby_llm/tribunal/judges/faithful.rb +77 -0
  16. data/lib/ruby_llm/tribunal/judges/hallucination.rb +85 -0
  17. data/lib/ruby_llm/tribunal/judges/harmful.rb +90 -0
  18. data/lib/ruby_llm/tribunal/judges/jailbreak.rb +77 -0
  19. data/lib/ruby_llm/tribunal/judges/pii.rb +118 -0
  20. data/lib/ruby_llm/tribunal/judges/refusal.rb +79 -0
  21. data/lib/ruby_llm/tribunal/judges/relevant.rb +65 -0
  22. data/lib/ruby_llm/tribunal/judges/toxicity.rb +63 -0
  23. data/lib/ruby_llm/tribunal/red_team.rb +306 -0
  24. data/lib/ruby_llm/tribunal/reporter.rb +48 -0
  25. data/lib/ruby_llm/tribunal/reporters/console.rb +120 -0
  26. data/lib/ruby_llm/tribunal/reporters/github.rb +26 -0
  27. data/lib/ruby_llm/tribunal/reporters/html.rb +185 -0
  28. data/lib/ruby_llm/tribunal/reporters/json.rb +31 -0
  29. data/lib/ruby_llm/tribunal/reporters/junit.rb +58 -0
  30. data/lib/ruby_llm/tribunal/reporters/text.rb +120 -0
  31. data/lib/ruby_llm/tribunal/test_case.rb +124 -0
  32. data/lib/ruby_llm/tribunal/version.rb +7 -0
  33. data/lib/ruby_llm/tribunal.rb +130 -0
  34. data/lib/ruby_llm-tribunal.rb +3 -0
  35. data/lib/tasks/tribunal.rake +269 -0
  36. metadata +99 -0
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Tribunal
    module Reporters
      # HTML report for shareable results.
      #
      # Builds a single self-contained page (inline CSS only) from a results
      # hash that carries +:summary+, +:metrics+ and +:cases+ entries.
      class HTML
        # Entity replacements applied by +escape_html+. The single quote is
        # escaped too, so escaped text stays safe inside either quoting style.
        ESCAPES = {
          '&' => '&amp;',
          '<' => '&lt;',
          '>' => '&gt;',
          '"' => '&quot;',
          "'" => '&#39;'
        }.freeze

        # Matches every character that has an entry in ESCAPES.
        ESCAPE_PATTERN = /[&<>"']/

        class << self
          # Renders the complete HTML document.
          #
          # @param results [Hash] reads +:summary+ (Hash), +:metrics+ (Hash or nil)
          #   and +:cases+ (Array or nil)
          # @return [String] the HTML page
          def format(results)
            <<~HTML
              <!DOCTYPE html>
              <html lang="en">
              <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Tribunal Evaluation Report</title>
                <style>
                  * { box-sizing: border-box; margin: 0; padding: 0; }
                  body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; line-height: 1.6; padding: 2rem; background: #f5f5f5; }
                  .container { max-width: 900px; margin: 0 auto; }
                  h1 { color: #333; margin-bottom: 1.5rem; }
                  .summary { background: white; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
                  .summary-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; }
                  .stat { text-align: center; }
                  .stat-value { font-size: 2rem; font-weight: bold; color: #333; }
                  .stat-label { color: #666; font-size: 0.9rem; }
                  .status { padding: 0.5rem 1rem; border-radius: 4px; display: inline-block; font-weight: bold; margin-top: 1rem; }
                  .status.passed { background: #d4edda; color: #155724; }
                  .status.failed { background: #f8d7da; color: #721c24; }
                  .metrics { background: white; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
                  .metrics h2 { margin-bottom: 1rem; color: #333; font-size: 1.2rem; }
                  .metric-row { display: flex; align-items: center; margin-bottom: 0.75rem; }
                  .metric-name { width: 120px; font-weight: 500; }
                  .metric-bar { flex: 1; height: 20px; background: #e9ecef; border-radius: 4px; overflow: hidden; margin: 0 1rem; }
                  .metric-fill { height: 100%; background: #28a745; transition: width 0.3s; }
                  .metric-fill.warning { background: #ffc107; }
                  .metric-fill.danger { background: #dc3545; }
                  .metric-value { width: 100px; text-align: right; font-size: 0.9rem; color: #666; }
                  .failures { background: white; border-radius: 8px; padding: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
                  .failures h2 { margin-bottom: 1rem; color: #333; font-size: 1.2rem; }
                  .failure { background: #fff5f5; border-left: 4px solid #dc3545; padding: 1rem; margin-bottom: 1rem; border-radius: 0 4px 4px 0; }
                  .failure-input { font-weight: 500; color: #333; margin-bottom: 0.5rem; }
                  .failure-reason { color: #666; font-size: 0.9rem; }
                  .failure-reason code { background: #f1f1f1; padding: 0.2rem 0.4rem; border-radius: 3px; }
                  .footer { text-align: center; margin-top: 2rem; color: #666; font-size: 0.85rem; }
                </style>
              </head>
              <body>
                <div class="container">
                  <h1>Tribunal Evaluation Report</h1>
                  #{summary_section(results[:summary])}
                  #{metrics_section(results[:metrics])}
                  #{failures_section(results[:cases])}
                  <div class="footer">Generated by Tribunal</div>
                </div>
              </body>
              </html>
            HTML
          end

          private

          # Renders the totals grid plus the overall PASSED/FAILED badge.
          #
          # @param summary [Hash] reads +:total+, +:passed+, +:failed+, +:pass_rate+,
          #   +:duration_ms+, +:threshold_passed+, +:strict+ and +:threshold+
          # @return [String] HTML fragment
          def summary_section(summary)
            # Only an explicit `false` for :threshold_passed fails the run; a
            # missing (nil) value is treated the same as true.
            passed = summary[:threshold_passed] != false && summary[:failed].zero?
            status_class = passed ? 'passed' : 'failed'
            status_text = passed ? 'PASSED' : 'FAILED'

            threshold_info = if summary[:strict]
                               ' (strict mode)'
                             elsif summary[:threshold]
                               " (threshold: #{(summary[:threshold] * 100).round}%)"
                             else
                               ''
                             end

            <<~HTML
              <div class="summary">
                <div class="summary-grid">
                  <div class="stat">
                    <div class="stat-value">#{summary[:total]}</div>
                    <div class="stat-label">Total Tests</div>
                  </div>
                  <div class="stat">
                    <div class="stat-value">#{summary[:passed]}</div>
                    <div class="stat-label">Passed</div>
                  </div>
                  <div class="stat">
                    <div class="stat-value">#{summary[:failed]}</div>
                    <div class="stat-label">Failed</div>
                  </div>
                  <div class="stat">
                    <div class="stat-value">#{(summary[:pass_rate] * 100).round}%</div>
                    <div class="stat-label">Pass Rate</div>
                  </div>
                  <div class="stat">
                    <div class="stat-value">#{format_duration(summary[:duration_ms])}</div>
                    <div class="stat-label">Duration</div>
                  </div>
                </div>
                <div class="status #{status_class}">#{status_text}#{threshold_info}</div>
              </div>
            HTML
          end

          # Renders one progress row per metric; empty string when there are none.
          #
          # @param metrics [Hash, nil] metric name => hash with +:passed+ and +:total+
          # @return [String] HTML fragment or ''
          def metrics_section(metrics)
            return '' if metrics.nil? || metrics.empty?

            rows = metrics.map { |name, data| format_metric_row(name, data) }.join("\n")

            <<~HTML
              <div class="metrics">
                <h2>Results by Metric</h2>
                #{rows}
              </div>
            HTML
          end

          # Renders a single metric row with a colored fill bar.
          #
          # @param name [#to_s] metric name (HTML-escaped before output)
          # @param data [Hash] reads +:passed+ and +:total+
          # @return [String] HTML fragment
          def format_metric_row(name, data)
            # Guard the division: a metric with zero total renders as 0%.
            rate = data[:total].positive? ? data[:passed].to_f / data[:total] : 0
            percent = (rate * 100).round

            # >= 90% keeps the default (green) fill, >= 70% is a warning, else danger.
            fill_class = if percent >= 90
                           ''
                         elsif percent >= 70
                           'warning'
                         else
                           'danger'
                         end

            <<~HTML
              <div class="metric-row">
                <div class="metric-name">#{escape_html(name.to_s)}</div>
                <div class="metric-bar">
                  <div class="metric-fill #{fill_class}" style="width: #{percent}%"></div>
                </div>
                <div class="metric-value">#{data[:passed]}/#{data[:total]} (#{percent}%)</div>
              </div>
            HTML
          end

          # Renders the list of failed cases; empty string when nothing failed.
          #
          # @param cases [Array<Hash>, nil] case hashes; only those whose +:status+
          #   is +:failed+ are shown. nil is treated as "no cases", matching how
          #   +metrics_section+ treats nil metrics.
          # @return [String] HTML fragment or ''
          def failures_section(cases)
            failures = (cases || []).select { |c| c[:status] == :failed }
            return '' if failures.empty?

            rows = failures.map { |c| format_failure_row(c) }.join("\n")

            <<~HTML
              <div class="failures">
                <h2>Failed Cases</h2>
                #{rows}
              </div>
            HTML
          end

          # Renders one failed case: its input and every (type, reason) pair.
          #
          # @param test_case [Hash] reads +:input+ and +:failures+ (pairs of
          #   type/reason; nil is treated as no recorded reasons)
          # @return [String] HTML fragment
          def format_failure_row(test_case)
            reasons = (test_case[:failures] || []).map do |type, reason|
              "<code>#{escape_html(type.to_s)}</code>: #{escape_html(reason.to_s)}"
            end.join('<br>')

            <<~HTML
              <div class="failure">
                <div class="failure-input">#{escape_html(test_case[:input].to_s)}</div>
                <div class="failure-reason">#{reasons}</div>
              </div>
            HTML
          end

          # Escapes text for inclusion in HTML.
          #
          # A single pass over the string, so the `&` produced by one replacement
          # is never escaped a second time.
          #
          # @param str [#to_s] text to escape
          # @return [String] escaped text
          def escape_html(str)
            str.to_s.gsub(ESCAPE_PATTERN, ESCAPES)
          end

          # Formats a millisecond duration as "<n>ms" below one second,
          # otherwise as seconds rounded to one decimal.
          #
          # @param duration_ms [Numeric] duration in milliseconds
          # @return [String]
          def format_duration(duration_ms)
            return "#{duration_ms}ms" if duration_ms < 1000

            "#{(duration_ms / 1000.0).round(1)}s"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Tribunal
    module Reporters
      # JSON output for CI/machine consumption.
      class JSON
        class << self
          # Serializes the results structure to a JSON string.
          #
          # @param results [Object] results structure (hashes, arrays, symbols, scalars)
          # @return [String] JSON document
          def format(results)
            plain = convert_for_json(results)
            plain.to_json
          end

          private

          # Recursively rewrites a structure so hash keys are strings and
          # symbols become strings; every other value is passed through as-is.
          #
          # @param data [Object] value to convert
          # @return [Object] converted value
          def convert_for_json(data)
            return data.to_s if data.is_a?(Symbol)
            return data.map { |element| convert_for_json(element) } if data.is_a?(Array)
            return data unless data.is_a?(Hash)

            data.each_with_object({}) do |(key, value), converted|
              converted[key.to_s] = convert_for_json(value)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Tribunal
    module Reporters
      # JUnit XML format for CI tools.
      class JUnit
        class << self
          # Renders the results as a JUnit-style XML document with a single
          # "eval" test suite.
          #
          # @param results [Hash] reads +:summary+ (+:total+, +:failed+,
          #   +:duration_ms+) and +:cases+ (Array of case hashes, nil allowed)
          # @return [String] XML document
          def format(results)
            summary = results[:summary]
            test_cases = (results[:cases] || []).map { |c| format_testcase(c) }.join("\n")

            <<~XML
              <?xml version="1.0" encoding="UTF-8"?>
              <testsuites name="Tribunal" tests="#{summary[:total]}" failures="#{summary[:failed]}" time="#{to_seconds(summary[:duration_ms])}">
                <testsuite name="eval" tests="#{summary[:total]}" failures="#{summary[:failed]}">
              #{test_cases}
                </testsuite>
              </testsuites>
            XML
          end

          private

          # Renders one <testcase> element. Any status other than +:passed+
          # is reported with a nested <failure>.
          #
          # @param test_case [Hash] reads +:input+, +:duration_ms+, +:status+
          #   and +:failures+ (type/reason pairs; nil is treated as none)
          # @return [String] XML fragment
          def format_testcase(test_case)
            name = escape_xml(test_case[:input].to_s)
            time = to_seconds(test_case[:duration_ms])

            return %(    <testcase name="#{name}" time="#{time}"/>) if test_case[:status] == :passed

            failure_msg = (test_case[:failures] || [])
                          .map { |type, reason| "#{type}: #{reason}" }
                          .join("\n")
                          .then { |msg| escape_xml(msg) }

            build_failure_xml(name, time, failure_msg)
          end

          # Builds a failing <testcase> element; all arguments must already be
          # XML-escaped.
          #
          # @param name [String] escaped test name
          # @param time [Float] duration in seconds
          # @param failure_msg [String] escaped failure text
          # @return [String] XML fragment without a trailing newline
          def build_failure_xml(name, time, failure_msg)
            [
              %(    <testcase name="#{name}" time="#{time}">),
              %(      <failure message="Assertion failed">#{failure_msg}</failure>),
              '    </testcase>'
            ].join("\n")
          end

          # Converts milliseconds to seconds, treating a missing duration as 0
          # so both the suite and the per-case timings tolerate nil.
          #
          # @param duration_ms [Numeric, nil] duration in milliseconds
          # @return [Float] duration in seconds
          def to_seconds(duration_ms)
            (duration_ms || 0) / 1000.0
          end

          # Escapes the five XML special characters.
          #
          # `&` is replaced first so entities produced by later replacements
          # are not escaped again.
          #
          # @param str [#to_s] text to escape
          # @return [String] escaped text
          def escape_xml(str)
            str.to_s
               .gsub('&', '&amp;')
               .gsub('<', '&lt;')
               .gsub('>', '&gt;')
               .gsub('"', '&quot;')
               .gsub("'", '&apos;')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Tribunal
    module Reporters
      # Plain ASCII text output (no unicode).
      class Text
        # Rule printed under the report title.
        TITLE_RULE = '==================================================================='
        # Rule printed under each section heading and above the final status.
        SECTION_RULE = '-------------------------------------------------------------------'

        class << self
          # Renders the whole report: header, summary, per-metric results,
          # failed cases and the final status line, joined by newlines.
          #
          # @param results [Hash] reads +:summary+, +:metrics+ and +:cases+
          # @return [String] plain-text report
          def format(results)
            summary = results[:summary]
            sections = [
              header,
              summary_section(summary),
              metrics_section(results[:metrics]),
              failures_section(results[:cases]),
              footer(summary)
            ]
            sections.join("\n")
          end

          private

          # @return [String] title block, preceded by a blank line
          def header
            "\nTribunal LLM Evaluation\n#{TITLE_RULE}\n"
          end

          # @param summary [Hash] reads +:total+, +:passed+, +:pass_rate+,
          #   +:failed+ and +:duration_ms+
          # @return [String] summary block
          def summary_section(summary)
            percent = (summary[:pass_rate] * 100).round
            lines = [
              'Summary',
              SECTION_RULE,
              "Total: #{summary[:total]} test cases",
              "Passed: #{summary[:passed]} (#{percent}%)",
              "Failed: #{summary[:failed]}",
              "Duration: #{format_duration(summary[:duration_ms])}"
            ]
            "#{lines.join("\n")}\n"
          end

          # @param metrics [Hash, nil] metric name => hash with +:passed+ and +:total+
          # @return [String] per-metric block, or '' when there are no metrics
          def metrics_section(metrics)
            return '' if metrics.nil? || metrics.empty?

            lines = metrics.map do |name, data|
              # A metric with zero total is shown as 0% rather than dividing by zero.
              rate = data[:total].positive? ? data[:passed].to_f / data[:total] : 0
              gauge = progress_bar(rate, 20)
              " #{pad(name, 14)} #{data[:passed]}/#{data[:total]} passed #{(rate * 100).round}% #{gauge}"
            end

            "Results by Metric\n#{SECTION_RULE}\n#{lines.join("\n")}\n"
          end

          # @param cases [Array<Hash>] case hashes; only +:status+ == +:failed+ are listed
          # @return [String] failed-case block, or '' when nothing failed
          def failures_section(cases)
            failed = cases.select { |c| c[:status] == :failed }
            return '' if failed.empty?

            # Failures are numbered starting at 1.
            entries = failed.each.with_index(1).map { |c, number| format_failure_row(c, number) }

            "Failed Cases\n#{SECTION_RULE}\n#{entries.join("\n")}\n"
          end

          # @param test_case [Hash] reads +:input+ and +:failures+ (type/reason pairs)
          # @param idx [Integer] number shown in front of the case
          # @return [String] one numbered entry with its failure reasons
          def format_failure_row(test_case, idx)
            # Only the first 50 characters of the input are shown.
            shown_input = test_case[:input].to_s[0, 50]
            reason_lines = test_case[:failures].map { |type, reason| " |- #{type}: #{reason}" }

            "#{idx}. \"#{shown_input}\"\n#{reason_lines.join("\n")}\n"
          end

          # @param summary [Hash] reads +:threshold_passed+, +:failed+, +:strict+
          #   and +:threshold+
          # @return [String] closing rule plus PASSED/FAILED status line
          def footer(summary)
            # Only an explicit `false` for :threshold_passed fails the run.
            failed_run = summary[:threshold_passed] == false || !summary[:failed].zero?
            status = failed_run ? 'FAILED' : 'PASSED'

            threshold_info =
              if summary[:strict]
                ' (strict mode)'
              elsif summary[:threshold]
                " (threshold: #{(summary[:threshold] * 100).round}%)"
              else
                ''
              end

            "#{SECTION_RULE}\n#{status}#{threshold_info}\n"
          end

          # @param rate [Numeric] fraction of the bar to fill
          # @param width [Integer] total bar width in characters
          # @return [String] bar made of '#' (filled) and '-' (empty)
          def progress_bar(rate, width)
            filled = (rate * width).round
            ('#' * filled) + ('-' * (width - filled))
          end

          # @param term [#to_s] value to pad
          # @param width [Integer] minimum width
          # @return [String] left-justified, space-padded text
          def pad(term, width)
            text = term.to_s
            text.ljust(width)
          end

          # @param duration_ms [Numeric] duration in milliseconds
          # @return [String] "<n>ms" below one second, else seconds with one decimal
          def format_duration(duration_ms)
            duration_ms < 1000 ? "#{duration_ms}ms" : "#{(duration_ms / 1000.0).round(1)}s"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Tribunal
    # Represents a single evaluation test case.
    #
    # @attr_reader input [String] The user query/prompt (required)
    # @attr_reader actual_output [String, nil] The LLM response to evaluate
    # @attr_reader expected_output [String, nil] Golden/ideal answer for comparison
    # @attr_reader context [Array<String>, nil] Ground truth context for faithfulness checks
    # @attr_reader retrieval_context [Array<String>, nil] Actual retrieved docs from RAG
    # @attr_reader metadata [Hash, nil] Additional info like latency, tokens, cost
    #
    # @example
    #   test_case = TestCase.new(
    #     input: "What's the return policy?",
    #     actual_output: "You can return items within 30 days.",
    #     context: ["Returns accepted within 30 days with receipt."],
    #     expected_output: "Items can be returned within 30 days with a receipt."
    #   )
    class TestCase
      attr_reader :input, :actual_output, :expected_output, :context, :retrieval_context, :metadata

      # Builds a test case from an attribute hash. String keys are accepted
      # and treated like their symbol equivalents.
      #
      # @param attrs [Hash] Test case attributes
      # @option attrs [String] :input The user query/prompt
      # @option attrs [String] :actual_output The LLM response to evaluate
      # @option attrs [String] :expected_output Golden/ideal answer
      # @option attrs [Array<String>, String] :context Ground truth context
      # @option attrs [Array<String>] :retrieval_context Retrieved docs from RAG
      # @option attrs [Hash] :metadata Additional info
      def initialize(attrs = {})
        values = normalize_keys(attrs)

        @input, @actual_output, @expected_output, @metadata =
          values.values_at(:input, :actual_output, :expected_output, :metadata)
        @context = normalize_context(values[:context])
        @retrieval_context = normalize_context(values[:retrieval_context])
      end

      # Returns a copy of this test case with the actual output replaced.
      # Handy when the dataset supplies input/context and the output comes
      # from your LLM.
      #
      # @param output [String] The LLM response
      # @return [TestCase] A new test case with the output set
      def with_output(output)
        copy_with(actual_output: output)
      end

      # Returns a copy of this test case with the retrieval context from
      # your RAG pipeline.
      #
      # @param context [Array<String>, String] Retrieved documents
      # @return [TestCase] A new test case with retrieval context set
      def with_retrieval_context(context)
        copy_with(retrieval_context: context)
      end

      # Returns a copy of this test case with extra metadata (latency,
      # tokens, cost, etc) merged over the existing metadata.
      #
      # @param new_metadata [Hash] Metadata to merge
      # @return [TestCase] A new test case with merged metadata
      def with_metadata(new_metadata)
        copy_with(metadata: (@metadata || {}).merge(new_metadata))
      end

      # Converts the test case to a hash, leaving out attributes that are nil.
      #
      # @return [Hash] Test case as hash
      def to_h
        all_attributes.compact
      end

      private

      # Every attribute, including nil ones, keyed by attribute name.
      def all_attributes
        {
          input: @input,
          actual_output: @actual_output,
          expected_output: @expected_output,
          context: @context,
          retrieval_context: @retrieval_context,
          metadata: @metadata
        }
      end

      # Builds a new TestCase from the current attributes with +overrides+ applied.
      def copy_with(overrides)
        TestCase.new(all_attributes.merge(overrides))
      end

      # Converts String keys to symbols; any other key is kept unchanged.
      def normalize_keys(hash)
        hash.transform_keys { |key| key.is_a?(String) ? key.to_sym : key }
      end

      # Wraps a lone String in an array; nil and any other value pass through.
      def normalize_context(ctx)
        ctx.is_a?(String) ? [ctx] : ctx
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Tribunal
5
# Version string of the ruby_llm-tribunal gem.
+ VERSION = '0.1.0'
6
+ end
7
+ end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ruby_llm'
4
+ require 'json'
5
+ require 'yaml'
6
+
7
+ require_relative 'tribunal/version'
8
+ require_relative 'tribunal/configuration'
9
+ require_relative 'tribunal/test_case'
10
+ require_relative 'tribunal/assertions'
11
+ require_relative 'tribunal/assertions/deterministic'
12
+ require_relative 'tribunal/assertions/judge'
13
+ require_relative 'tribunal/assertions/embedding'
14
+ require_relative 'tribunal/judge'
15
+ require_relative 'tribunal/judges/faithful'
16
+ require_relative 'tribunal/judges/relevant'
17
+ require_relative 'tribunal/judges/hallucination'
18
+ require_relative 'tribunal/judges/correctness'
19
+ require_relative 'tribunal/judges/bias'
20
+ require_relative 'tribunal/judges/toxicity'
21
+ require_relative 'tribunal/judges/harmful'
22
+ require_relative 'tribunal/judges/jailbreak'
23
+ require_relative 'tribunal/judges/pii'
24
+ require_relative 'tribunal/judges/refusal'
25
+ require_relative 'tribunal/dataset'
26
+ require_relative 'tribunal/red_team'
27
+ require_relative 'tribunal/reporter'
28
+ require_relative 'tribunal/reporters/console'
29
+ require_relative 'tribunal/reporters/text'
30
+ require_relative 'tribunal/reporters/json'
31
+ require_relative 'tribunal/reporters/html'
32
+ require_relative 'tribunal/reporters/github'
33
+ require_relative 'tribunal/reporters/junit'
34
+ require_relative 'tribunal/eval_helpers'
35
+
36
module RubyLLM
  # LLM evaluation framework for Ruby.
  #
  # Tribunal provides tools for evaluating LLM outputs,
  # detecting hallucinations, and measuring response quality.
  #
  # @example Quick Start
  #   test_case = RubyLLM::Tribunal.test_case(
  #     input: "What's the return policy?",
  #     actual_output: "Returns within 30 days.",
  #     context: ["Return policy: 30 days with receipt."]
  #   )
  #
  #   assertions = [
  #     [:contains, { value: "30 days" }],
  #     [:faithful, { threshold: 0.8 }]
  #   ]
  #
  #   results = RubyLLM::Tribunal.evaluate(test_case, assertions)
  #
  module Tribunal
    # Error class namespaced under Tribunal (a StandardError subclass).
    class Error < StandardError; end

    class << self
      # Returns the memoized Configuration instance, creating it on first use.
      #
      # @return [Configuration] the shared configuration object
      def configuration
        @configuration ||= Configuration.new
      end

      # Yields the shared configuration to the given block.
      #
      # @yieldparam config [Configuration] the object returned by {configuration}
      # @return [Object] whatever the block returns
      def configure
        yield(configuration)
      end

      # Evaluates a test case against assertions.
      #
      # @param test_case [TestCase] The test case to evaluate
      # @param assertions [Array, Hash] Assertions to run
      # @return [Hash] Results map of assertion_type => result
      #
      # @example
      #   test_case = TestCase.new(
      #     input: "What's the return policy?",
      #     actual_output: "Returns within 30 days.",
      #     context: ["Return policy: 30 days with receipt."]
      #   )
      #
      #   assertions = [
      #     [:contains, { value: "30 days" }],
      #     [:faithful, { threshold: 0.8 }]
      #   ]
      #
      #   Tribunal.evaluate(test_case, assertions)
      #   #=> { contains: [:pass, {...}], faithful: [:pass, {...}] }
      def evaluate(test_case, assertions)
        Assertions.evaluate_all(assertions, test_case)
      end

      # Returns available assertion types based on loaded dependencies.
      #
      # @return [Array<Symbol>] List of available assertion types
      def available_assertions
        Assertions.available
      end

      # Creates a new test case.
      #
      # The attribute hash is optional, mirroring TestCase#initialize, which
      # itself defaults to an empty hash.
      #
      # @param attrs [Hash] Test case attributes (defaults to an empty hash)
      # @return [TestCase] New test case instance
      #
      # @example
      #   Tribunal.test_case(
      #     input: "What's the price?",
      #     actual_output: "The price is $29.99.",
      #     context: ["Product costs $29.99"]
      #   )
      def test_case(attrs = {})
        TestCase.new(attrs)
      end

      # Registers a custom judge.
      #
      # @param judge_class [Class] A class implementing the Judge interface
      def register_judge(judge_class)
        Judge.register(judge_class)
      end

      # Returns all registered judge names.
      #
      # @return [Array<Symbol>] List of judge names
      def judge_names
        Judge.all_judge_names
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'ruby_llm/tribunal'