ruby_llm-tribunal 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +32 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +442 -0
  5. data/lib/ruby_llm/tribunal/assertions/deterministic.rb +259 -0
  6. data/lib/ruby_llm/tribunal/assertions/embedding.rb +90 -0
  7. data/lib/ruby_llm/tribunal/assertions/judge.rb +152 -0
  8. data/lib/ruby_llm/tribunal/assertions.rb +141 -0
  9. data/lib/ruby_llm/tribunal/configuration.rb +38 -0
  10. data/lib/ruby_llm/tribunal/dataset.rb +118 -0
  11. data/lib/ruby_llm/tribunal/eval_helpers.rb +288 -0
  12. data/lib/ruby_llm/tribunal/judge.rb +166 -0
  13. data/lib/ruby_llm/tribunal/judges/bias.rb +79 -0
  14. data/lib/ruby_llm/tribunal/judges/correctness.rb +68 -0
  15. data/lib/ruby_llm/tribunal/judges/faithful.rb +77 -0
  16. data/lib/ruby_llm/tribunal/judges/hallucination.rb +85 -0
  17. data/lib/ruby_llm/tribunal/judges/harmful.rb +90 -0
  18. data/lib/ruby_llm/tribunal/judges/jailbreak.rb +77 -0
  19. data/lib/ruby_llm/tribunal/judges/pii.rb +118 -0
  20. data/lib/ruby_llm/tribunal/judges/refusal.rb +79 -0
  21. data/lib/ruby_llm/tribunal/judges/relevant.rb +65 -0
  22. data/lib/ruby_llm/tribunal/judges/toxicity.rb +63 -0
  23. data/lib/ruby_llm/tribunal/red_team.rb +306 -0
  24. data/lib/ruby_llm/tribunal/reporter.rb +48 -0
  25. data/lib/ruby_llm/tribunal/reporters/console.rb +120 -0
  26. data/lib/ruby_llm/tribunal/reporters/github.rb +26 -0
  27. data/lib/ruby_llm/tribunal/reporters/html.rb +185 -0
  28. data/lib/ruby_llm/tribunal/reporters/json.rb +31 -0
  29. data/lib/ruby_llm/tribunal/reporters/junit.rb +58 -0
  30. data/lib/ruby_llm/tribunal/reporters/text.rb +120 -0
  31. data/lib/ruby_llm/tribunal/test_case.rb +124 -0
  32. data/lib/ruby_llm/tribunal/version.rb +7 -0
  33. data/lib/ruby_llm/tribunal.rb +130 -0
  34. data/lib/ruby_llm-tribunal.rb +3 -0
  35. data/lib/tasks/tribunal.rake +269 -0
  36. metadata +99 -0
data/lib/ruby_llm/tribunal/assertions/deterministic.rb
@@ -0,0 +1,259 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module Tribunal
+     module Assertions
+       # Deterministic assertions that don't require LLM calls.
+       #
+       # Fast, free, and should run before expensive LLM-based checks.
+       module Deterministic
+         class << self
+           # Evaluates a deterministic assertion.
+           #
+           # @param type [Symbol] The assertion type
+           # @param output [String] The output to evaluate
+           # @param opts [Hash] Options for the assertion
+           # @return [Array] [:pass, details] or [:fail, details]
+           #
+           # @example
+           #   evaluate(:contains, "Hello world", value: "world")
+           #   #=> [:pass, { matched: ["world"] }]
+           def evaluate(type, output, opts = {})
+             send(:"evaluate_#{type}", output, opts)
+           end
+
+           private
+
+           def evaluate_contains(output, opts)
+             values = Array(opts[:value] || opts[:values])
+             results = values.map { |v| [v, output.include?(v)] }
+
+             if results.all? { |_, matched| matched }
+               [:pass, { matched: values }]
+             else
+               missing = results.reject { |_, m| m }.map(&:first)
+               [:fail, { missing:, reason: "Output missing: #{missing.inspect}" }]
+             end
+           end
+
+           def evaluate_not_contains(output, opts)
+             values = Array(opts[:value] || opts[:values])
+             found = values.select { |v| output.include?(v) }
+
+             if found.empty?
+               [:pass, { checked: values }]
+             else
+               [:fail, { found:, reason: "Output contains forbidden: #{found.inspect}" }]
+             end
+           end
+
+           def evaluate_contains_any(output, opts)
+             values = Array(opts[:value] || opts[:values])
+             found = values.find { |v| output.include?(v) }
+
+             if found
+               [:pass, { matched: found }]
+             else
+               [:fail, { expected_any: values, reason: "Output contains none of: #{values.inspect}" }]
+             end
+           end
+
+           def evaluate_contains_all(output, opts)
+             values = Array(opts[:value] || opts[:values])
+             results = values.map { |v| [v, output.include?(v)] }
+
+             if results.all? { |_, matched| matched }
+               [:pass, { matched: values }]
+             else
+               missing = results.reject { |_, m| m }.map(&:first)
+               [:fail, { missing:, reason: "Output missing: #{missing.inspect}" }]
+             end
+           end
+
+           def evaluate_regex(output, opts)
+             pattern = opts[:value] || opts[:pattern]
+             regex = pattern.is_a?(Regexp) ? pattern : Regexp.new(pattern)
+
+             match = output.match(regex)
+             if match
+               [:pass, { matched: match[0], pattern: regex.source }]
+             else
+               [:fail, { pattern: regex.source, reason: "Pattern not found: #{regex.source}" }]
+             end
+           end
+
+           def evaluate_is_json(output, _opts)
+             parsed = JSON.parse(output)
+             [:pass, { parsed: }]
+           rescue JSON::ParserError
+             [:fail, { reason: 'Invalid JSON' }]
+           end
+
+           def evaluate_max_tokens(output, opts)
+             max = opts[:value] || opts[:max] || 500
+
+             # Approximate: 1 token ~= 0.75 words ~= 4 chars
+             # Using word count as a reasonable approximation
+             word_count = output.split(/\s+/).length
+             approx_tokens = (word_count / 0.75).ceil
+
+             if approx_tokens <= max
+               [:pass, { approx_tokens:, max: }]
+             else
+               [:fail, {
+                 approx_tokens:,
+                 max:,
+                 reason: "Output ~#{approx_tokens} tokens exceeds max #{max}"
+               }]
+             end
+           end
+
+           def evaluate_latency_ms(_output, opts)
+             max = opts[:value] || opts[:max] || 5000
+             actual = opts[:actual] || opts[:latency]
+
+             if actual.nil?
+               [:fail, { reason: 'No latency value provided in opts[:actual]' }]
+             elsif actual <= max
+               [:pass, { latency_ms: actual, max: }]
+             else
+               [:fail, { latency_ms: actual, max:, reason: "Latency #{actual}ms exceeds max #{max}ms" }]
+             end
+           end
+
+           def evaluate_starts_with(output, opts)
+             prefix = opts[:value]
+
+             if output.start_with?(prefix)
+               [:pass, { prefix: }]
+             else
+               [:fail, { expected: prefix, reason: "Output does not start with: #{prefix}" }]
+             end
+           end
+
+           def evaluate_ends_with(output, opts)
+             suffix = opts[:value]
+
+             if output.end_with?(suffix)
+               [:pass, { suffix: }]
+             else
+               [:fail, { expected: suffix, reason: "Output does not end with: #{suffix}" }]
+             end
+           end
+
+           def evaluate_equals(output, opts)
+             expected = opts[:value]
+
+             if output == expected
+               [:pass, {}]
+             else
+               [:fail, { expected:, actual: output, reason: 'Output does not match expected' }]
+             end
+           end
+
+           def evaluate_min_length(output, opts)
+             min = opts[:value] || opts[:min]
+             length = output.length
+
+             if length >= min
+               [:pass, { length:, min: }]
+             else
+               [:fail, { length:, min:, reason: "Output length #{length} below minimum #{min}" }]
+             end
+           end
+
+           def evaluate_max_length(output, opts)
+             max = opts[:value] || opts[:max]
+             length = output.length
+
+             if length <= max
+               [:pass, { length:, max: }]
+             else
+               [:fail, { length:, max:, reason: "Output length #{length} exceeds maximum #{max}" }]
+             end
+           end
+
+           def evaluate_word_count(output, opts)
+             min = opts[:min] || 0
+             max = opts[:max]
+
+             words = output.split(/\s+/).reject(&:empty?)
+             count = words.length
+
+             if count < min
+               [:fail, { word_count: count, min:, reason: "Word count #{count} below minimum #{min}" }]
+             elsif max && count > max
+               [:fail, { word_count: count, max:, reason: "Word count #{count} exceeds maximum #{max}" }]
+             else
+               [:pass, { word_count: count }]
+             end
+           end
+
+           def evaluate_is_url(output, _opts)
+             url = output.strip
+
+             if url.match?(%r{^https?://[^\s]+$})
+               [:pass, { url: }]
+             else
+               [:fail, { reason: 'Output is not a valid URL' }]
+             end
+           end
+
+           def evaluate_is_email(output, _opts)
+             email = output.strip
+
+             if email.match?(/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/)
+               [:pass, { email: }]
+             else
+               [:fail, { reason: 'Output is not a valid email' }]
+             end
+           end
+
+           def evaluate_levenshtein(output, opts)
+             target = opts[:value]
+             max_distance = opts[:max_distance] || 3
+
+             distance = levenshtein_distance(output, target)
+
+             if distance <= max_distance
+               [:pass, { distance:, max_distance: }]
+             else
+               [:fail, {
+                 distance:,
+                 max_distance:,
+                 reason: "Edit distance #{distance} exceeds max #{max_distance}"
+               }]
+             end
+           end
+
+           # Levenshtein edit distance via row-by-row dynamic programming
+           def levenshtein_distance(s1, s2)
+             s1_chars = s1.chars
+             s2_chars = s2.chars
+
+             return s2_chars.length if s1_chars.empty?
+             return s1_chars.length if s2_chars.empty?
+
+             row = (0..s2_chars.length).to_a
+
+             s1_chars.each_with_index do |c1, i|
+               prev_row = row
+               row = [i + 1]
+
+               s2_chars.each_with_index do |c2, j|
+                 cost = c1 == c2 ? 0 : 1
+                 row << [
+                   row[j] + 1,          # deletion
+                   prev_row[j + 1] + 1, # insertion
+                   prev_row[j] + cost   # substitution
+                 ].min
+               end
+             end
+
+             row.last
+           end
+         end
+       end
+     end
+   end
+ end
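
Usage sketch for the deterministic checks above (module path and result shapes follow the code shown; the token estimate uses the 1 token ≈ 0.75 words heuristic from evaluate_max_tokens):

    checks = RubyLLM::Tribunal::Assertions::Deterministic

    checks.evaluate(:contains, 'Paris is the capital of France.', value: 'Paris')
    #=> [:pass, { matched: ["Paris"] }]

    # 300 words / 0.75 ≈ 400 estimated tokens, over the limit of 100
    checks.evaluate(:max_tokens, 'word ' * 300, value: 100)
    #=> [:fail, { approx_tokens: 400, max: 100, reason: "Output ~400 tokens exceeds max 100" }]

    # 'recieve' -> 'receive' needs two substitutions, within the allowed distance
    checks.evaluate(:levenshtein, 'recieve', value: 'receive', max_distance: 2)
    #=> [:pass, { distance: 2, max_distance: 2 }]
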
data/lib/ruby_llm/tribunal/assertions/embedding.rb
@@ -0,0 +1,90 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module Tribunal
+     module Assertions
+       # Embedding-based semantic similarity assertions.
+       #
+       # Uses sentence embeddings to determine if two texts are semantically similar.
+       module Embedding
+         DEFAULT_THRESHOLD = 0.7
+
+         class << self
+           # Returns list of available embedding assertion types.
+           #
+           # @return [Array<Symbol>]
+           def available
+             [:similar]
+           end
+
+           # Evaluates semantic similarity between actual and expected output.
+           #
+           # @param test_case [TestCase] The test case
+           # @param opts [Hash] Options
+           # @option opts [Float] :threshold Similarity threshold (0.0 to 1.0). Default: 0.7
+           # @option opts [Proc] :similarity_fn Custom similarity function for testing
+           # @return [Array] [:pass, details], [:fail, details], or [:error, message]
+           #
+           # @example
+           #   test_case = TestCase.new(
+           #     actual_output: "The cat is sleeping",
+           #     expected_output: "A feline is resting"
+           #   )
+           #
+           #   Embedding.evaluate(test_case, threshold: 0.8)
+           #   #=> [:pass, { similarity: 0.85, threshold: 0.8 }]
+           def evaluate(test_case, opts = {})
+             if test_case.expected_output.nil?
+               return [:error,
+                       'Similar assertion requires expected_output to be provided']
+             end
+
+             threshold = opts[:threshold] || DEFAULT_THRESHOLD
+             similarity_fn = opts[:similarity_fn] || method(:default_similarity)
+
+             result = similarity_fn.call(test_case.actual_output, test_case.expected_output, opts)
+
+             case result
+             in [:ok, similarity] if similarity >= threshold
+               [:pass, { similarity:, threshold: }]
+             in [:ok, similarity]
+               [:fail, {
+                 similarity:,
+                 threshold:,
+                 reason: "Output is not semantically similar to expected (#{similarity.round(2)} < #{threshold})"
+               }]
+             in [:error, reason]
+               [:error, "Failed to compute similarity: #{reason}"]
+             end
+           end
+
+           private
+
+           def default_similarity(text1, text2, opts)
+             # Use RubyLLM embeddings
+             model = opts[:embedding_model] || 'text-embedding-3-small'
+
+             embedding1 = RubyLLM.embed(text1, model:).vectors.first
+             embedding2 = RubyLLM.embed(text2, model:).vectors.first
+
+             # Compute cosine similarity
+             similarity = cosine_similarity(embedding1, embedding2)
+             [:ok, similarity]
+           rescue StandardError => e
+             [:error, e.message]
+           end
+
+           def cosine_similarity(vec1, vec2)
+             dot_product = vec1.zip(vec2).sum { |a, b| a * b }
+             magnitude1 = Math.sqrt(vec1.sum { |x| x**2 })
+             magnitude2 = Math.sqrt(vec2.sum { |x| x**2 })
+
+             return 0.0 if magnitude1.zero? || magnitude2.zero?
+
+             dot_product / (magnitude1 * magnitude2)
+           end
+         end
+       end
+     end
+   end
+ end
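
A sketch of exercising the :similar assertion offline by injecting :similarity_fn (the [:ok, score] / [:error, reason] tuple mirrors default_similarity above); by default evaluate calls RubyLLM.embed and compares the vectors with cosine similarity:

    tc = RubyLLM::Tribunal::TestCase.new(
      actual_output: 'The cat is sleeping',
      expected_output: 'A feline is resting'
    )

    # Stand-in for the embedding call; returns the same [:ok, score] tuple
    fake_similarity = ->(_actual, _expected, _opts) { [:ok, 0.91] }

    RubyLLM::Tribunal::Assertions::Embedding.evaluate(tc, threshold: 0.8, similarity_fn: fake_similarity)
    #=> [:pass, { similarity: 0.91, threshold: 0.8 }]
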
data/lib/ruby_llm/tribunal/assertions/judge.rb
@@ -0,0 +1,152 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module Tribunal
+     module Assertions
+       # LLM-as-judge assertions for evaluating LLM outputs.
+       #
+       # Uses RubyLLM to get structured verdicts from the judge model.
+       module Judge
+         DEFAULT_MODEL = 'anthropic:claude-3-5-haiku-latest'
+         DEFAULT_THRESHOLD = 0.8
+
+         SYSTEM_PROMPT = <<~PROMPT
+           You are a precise evaluator of LLM outputs. Your task is to assess outputs
+           based on specific criteria and provide structured verdicts.
+
+           Always respond with valid JSON containing:
+           - verdict: "yes", "no", or "partial"
+           - reason: A brief explanation
+           - score: A float from 0.0 to 1.0
+
+           Be objective and consistent in your evaluations.
+         PROMPT
+
+         class << self
+           # Returns list of available judge assertion types.
+           #
+           # @return [Array<Symbol>]
+           def available
+             Tribunal::Judge.all_judge_names
+           end
+
+           # Evaluates a judge assertion against a test case.
+           #
+           # @param type [Symbol] The judge type
+           # @param test_case [TestCase] The test case
+           # @param opts [Hash] Options
+           # @return [Array] [:pass, details], [:fail, details], or [:error, message]
+           def evaluate(type, test_case, opts)
+             judge_class = Tribunal::Judge.find(type)
+             return [:error, "Unknown judge assertion: #{type}"] unless judge_class
+
+             # Validate test case
+             error = judge_class.validate(test_case) if judge_class.respond_to?(:validate)
+             return [:error, error] if error
+
+             run_judge(judge_class, test_case, opts)
+           end
+
+           private
+
+           def run_judge(judge_class, test_case, opts)
+             model = opts[:model] || Tribunal.configuration.default_model || DEFAULT_MODEL
+             threshold = opts[:threshold] || Tribunal.configuration.default_threshold || DEFAULT_THRESHOLD
+             prompt = judge_class.prompt(test_case, opts)
+
+             response = call_llm(model, prompt, opts)
+
+             case response
+             in [:ok, result]
+               # Check if judge has custom evaluation
+               if judge_class.respond_to?(:evaluate_result)
+                 custom_result = judge_class.evaluate_result(result, opts)
+                 return custom_result if custom_result
+               end
+
+               negative_metric = judge_class.respond_to?(:negative_metric?) && judge_class.negative_metric?
+               interpret_response(result, threshold, negative_metric)
+             in [:error, reason]
+               [:error, reason.to_s]
+             end
+           end
+
+           def call_llm(model, prompt, opts)
+             # Allow injecting custom LLM for tests via opts[:llm]
+             return opts[:llm].call(model, build_messages(prompt), opts) if opts[:llm]
+
+             call_ruby_llm(model, prompt, opts)
+           end
+
+           def build_messages(prompt)
+             [
+               { role: 'system', content: SYSTEM_PROMPT },
+               { role: 'user', content: prompt }
+             ]
+           end
+
+           def call_ruby_llm(model, prompt, _opts)
+             # Parse model string (e.g., "anthropic:claude-3-5-haiku-latest")
+             _provider, model_name = parse_model(model)
+
+             chat = RubyLLM.chat(model: model_name)
+
+             # Add system message
+             chat.with_instructions(SYSTEM_PROMPT)
+
+             # Get response
+             response = chat.ask(prompt)
+             content = response.content
+
+             # Parse JSON response
+             parsed = JSON.parse(content)
+             [:ok, parsed]
+           rescue JSON::ParserError => e
+             # Try to extract JSON from response
+             if (match = content&.match(/\{[\s\S]*\}/))
+               begin
+                 parsed = JSON.parse(match[0])
+                 return [:ok, parsed]
+               rescue JSON::ParserError
+                 # Fall through to error
+               end
+             end
+             [:error, "Failed to parse LLM response as JSON: #{e.message}"]
+           rescue StandardError => e
+             [:error, e.message]
+           end
+
+           def parse_model(model_string)
+             if model_string.include?(':')
+               model_string.split(':', 2)
+             else
+               [nil, model_string]
+             end
+           end
+
+           def interpret_response(response, threshold, negative_metric)
+             details = {
+               verdict: response['verdict'],
+               reason: response['reason'],
+               score: response['score']
+             }
+
+             verdict_result(response['verdict'], response['score'], threshold, negative_metric, details)
+           end
+
+           def verdict_result(verdict, score, threshold, negative_metric, details)
+             return [:error, "Unexpected verdict: #{verdict}"] unless %w[yes no partial].include?(verdict)
+
+             passed = case verdict
+                      when 'yes' then !negative_metric
+                      when 'no' then negative_metric
+                      when 'partial' then score.is_a?(Numeric) && score >= threshold
+                      end
+
+             passed ? [:pass, details] : [:fail, details]
+           end
+         end
+       end
+     end
+   end
+ end
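
A sketch of driving a judge assertion with a stubbed LLM via opts[:llm], which must return the same [:ok, parsed_json] tuple that call_ruby_llm produces. :faithful is one of the bundled judges; the input/context attributes on TestCase and the faithful judge's validation rules are assumptions here, not shown in this hunk, and the result assumes the default verdict interpretation (no evaluate_result override):

    tc = RubyLLM::Tribunal::TestCase.new(
      input: 'What is the capital of France?',               # assumed TestCase attribute
      actual_output: 'Paris is the capital of France.',
      context: ['Paris is the capital and largest city of France.'] # assumed attribute
    )

    # Stub in place of a real model call; returns pre-parsed verdict JSON
    stub_llm = lambda do |_model, _messages, _opts|
      [:ok, { 'verdict' => 'yes', 'reason' => 'Grounded in the provided context', 'score' => 1.0 }]
    end

    RubyLLM::Tribunal::Assertions::Judge.evaluate(:faithful, tc, llm: stub_llm)
    #=> [:pass, { verdict: "yes", reason: "Grounded in the provided context", score: 1.0 }]
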
data/lib/ruby_llm/tribunal/assertions.rb
@@ -0,0 +1,141 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module Tribunal
+     # Assertion evaluation engine.
+     #
+     # Routes assertions to the appropriate implementation:
+     # - Deterministic: `contains`, `regex`, `is_json`, etc.
+     # - Judge (requires ruby_llm): `faithful`, `relevant`, etc.
+     # - Embedding (requires neighbor): `similar`
+     module Assertions
+       DETERMINISTIC_ASSERTIONS = %i[
+         contains
+         not_contains
+         contains_any
+         contains_all
+         regex
+         is_json
+         max_tokens
+         latency_ms
+         starts_with
+         ends_with
+         equals
+         min_length
+         max_length
+         word_count
+         is_url
+         is_email
+         levenshtein
+       ].freeze
+
+       EMBEDDING_ASSERTIONS = %i[similar].freeze
+
+       class << self
+         # Evaluates a single assertion against a test case.
+         #
+         # @param assertion_type [Symbol] The type of assertion
+         # @param test_case [TestCase] The test case to evaluate
+         # @param opts [Hash] Options for the assertion
+         # @return [Array] [:pass, details] or [:fail, details]
+         def evaluate(assertion_type, test_case, opts = {})
+           if DETERMINISTIC_ASSERTIONS.include?(assertion_type)
+             Deterministic.evaluate(assertion_type, test_case.actual_output, opts)
+           elsif Tribunal::Judge.builtin_judge?(assertion_type) || Tribunal::Judge.custom_judge?(assertion_type)
+             evaluate_judge(assertion_type, test_case, opts)
+           elsif EMBEDDING_ASSERTIONS.include?(assertion_type)
+             evaluate_embedding(assertion_type, test_case, opts)
+           else
+             [:error, "Unknown assertion type: #{assertion_type}"]
+           end
+         end
+
+         # Evaluates multiple assertions against a test case.
+         #
+         # @param assertions [Array, Hash] Assertions to evaluate
+         # @param test_case [TestCase] The test case to evaluate
+         # @return [Hash] Map of assertion_type => result
+         def evaluate_all(assertions, test_case)
+           normalized = normalize_assertions(assertions)
+           normalized.each_with_object({}) do |(type, opts), results|
+             results[type] = evaluate(type, test_case, opts)
+           end
+         end
+
+         # Checks if all assertions passed.
+         #
+         # @param results [Hash] Results from evaluate_all
+         # @return [Boolean] True if all passed
+         def all_passed?(results)
+           results.all? { |_type, result| result.first == :pass }
+         end
+
+         # Returns list of available assertion types based on loaded dependencies.
+         #
+         # @return [Array<Symbol>] Available assertion types
+         def available
+           base = DETERMINISTIC_ASSERTIONS.dup
+
+           # Add judge assertions
+           base.concat(Tribunal::Judge.all_judge_names)
+
+           # Add embedding assertions if neighbor is available
+           begin
+             require 'neighbor'
+             base.concat(EMBEDDING_ASSERTIONS)
+           rescue LoadError
+             # neighbor not available
+           end
+
+           base
+         end
+
+         private
+
+         def normalize_assertions(assertions)
+           case assertions
+           when Hash
+             assertions.map do |type, opts|
+               opts = { value: opts } unless opts.is_a?(Hash)
+               [type.to_sym, opts]
+             end
+           when Array
+             assertions.map do |item|
+               case item
+               when Symbol, String
+                 [item.to_sym, {}]
+               when Array
+                 type, opts = item
+                 opts = { value: opts } unless opts.is_a?(Hash)
+                 [type.to_sym, opts]
+               else
+                 raise ArgumentError, "Invalid assertion format: #{item.inspect}"
+               end
+             end
+           else
+             raise ArgumentError, 'Assertions must be a Hash or Array'
+           end
+         end
+
+         def evaluate_judge(type, test_case, opts)
+           Assertions::Judge.evaluate(type, test_case, opts)
+         end
+
+         def evaluate_embedding(_type, test_case, opts)
+           begin
+             require 'neighbor'
+           rescue LoadError
+             raise Error, <<~MSG
+               Embedding similarity requires the 'neighbor' gem.
+
+               Add to your Gemfile:
+                 gem 'neighbor', '~> 0.4'
+             MSG
+           end
+
+           Embedding.evaluate(test_case, opts)
+         end
+       end
+     end
+   end
+ end
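
A sketch of the batch API above: hash values that aren't themselves hashes are wrapped as { value: ... } by normalize_assertions, each assertion is routed by evaluate, and all_passed? folds the per-assertion tuples into a single boolean:

    tc = RubyLLM::Tribunal::TestCase.new(actual_output: '{"city":"Paris","country":"France"}')

    results = RubyLLM::Tribunal::Assertions.evaluate_all(
      { is_json: {}, contains: 'Paris', max_length: 200 },
      tc
    )
    #=> { is_json: [:pass, ...], contains: [:pass, ...], max_length: [:pass, ...] }

    RubyLLM::Tribunal::Assertions.all_passed?(results)
    #=> true
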
data/lib/ruby_llm/tribunal/configuration.rb
@@ -0,0 +1,38 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module Tribunal
+     # Configuration for Tribunal.
+     #
+     # @example
+     #   RubyLLM::Tribunal.configure do |config|
+     #     config.default_model = "anthropic:claude-3-5-haiku-latest"
+     #     config.default_threshold = 0.8
+     #     config.verbose = true
+     #   end
+     class Configuration
+       # Default LLM model for judge assertions
+       # @return [String]
+       attr_accessor :default_model
+
+       # Default threshold for judge assertions (0.0-1.0)
+       # @return [Float]
+       attr_accessor :default_threshold
+
+       # Whether to print verbose output
+       # @return [Boolean]
+       attr_accessor :verbose
+
+       # Custom judges to register
+       # @return [Array<Class>]
+       attr_accessor :custom_judges
+
+       def initialize
+         @default_model = 'anthropic:claude-3-5-haiku-latest'
+         @default_threshold = 0.8
+         @verbose = false
+         @custom_judges = []
+       end
+     end
+   end
+ end
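
A sketch of how these defaults are consumed, assuming the configure/configuration pair shown in the class comment: run_judge in assertions/judge.rb falls back to configuration.default_model and configuration.default_threshold whenever a judge assertion is run without per-assertion options:

    RubyLLM::Tribunal.configure do |config|
      config.default_model = 'anthropic:claude-3-5-haiku-latest'
      config.default_threshold = 0.75  # partial verdicts now need score >= 0.75 to pass
      config.verbose = true
    end

    RubyLLM::Tribunal.configuration.default_threshold
    #=> 0.75
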