ruby_llm-tribunal 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +32 -0
- data/LICENSE.txt +21 -0
- data/README.md +442 -0
- data/lib/ruby_llm/tribunal/assertions/deterministic.rb +259 -0
- data/lib/ruby_llm/tribunal/assertions/embedding.rb +90 -0
- data/lib/ruby_llm/tribunal/assertions/judge.rb +152 -0
- data/lib/ruby_llm/tribunal/assertions.rb +141 -0
- data/lib/ruby_llm/tribunal/configuration.rb +38 -0
- data/lib/ruby_llm/tribunal/dataset.rb +118 -0
- data/lib/ruby_llm/tribunal/eval_helpers.rb +288 -0
- data/lib/ruby_llm/tribunal/judge.rb +166 -0
- data/lib/ruby_llm/tribunal/judges/bias.rb +79 -0
- data/lib/ruby_llm/tribunal/judges/correctness.rb +68 -0
- data/lib/ruby_llm/tribunal/judges/faithful.rb +77 -0
- data/lib/ruby_llm/tribunal/judges/hallucination.rb +85 -0
- data/lib/ruby_llm/tribunal/judges/harmful.rb +90 -0
- data/lib/ruby_llm/tribunal/judges/jailbreak.rb +77 -0
- data/lib/ruby_llm/tribunal/judges/pii.rb +118 -0
- data/lib/ruby_llm/tribunal/judges/refusal.rb +79 -0
- data/lib/ruby_llm/tribunal/judges/relevant.rb +65 -0
- data/lib/ruby_llm/tribunal/judges/toxicity.rb +63 -0
- data/lib/ruby_llm/tribunal/red_team.rb +306 -0
- data/lib/ruby_llm/tribunal/reporter.rb +48 -0
- data/lib/ruby_llm/tribunal/reporters/console.rb +120 -0
- data/lib/ruby_llm/tribunal/reporters/github.rb +26 -0
- data/lib/ruby_llm/tribunal/reporters/html.rb +185 -0
- data/lib/ruby_llm/tribunal/reporters/json.rb +31 -0
- data/lib/ruby_llm/tribunal/reporters/junit.rb +58 -0
- data/lib/ruby_llm/tribunal/reporters/text.rb +120 -0
- data/lib/ruby_llm/tribunal/test_case.rb +124 -0
- data/lib/ruby_llm/tribunal/version.rb +7 -0
- data/lib/ruby_llm/tribunal.rb +130 -0
- data/lib/ruby_llm-tribunal.rb +3 -0
- data/lib/tasks/tribunal.rake +269 -0
- metadata +99 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
module Assertions
|
|
6
|
+
# Deterministic assertions that don't require LLM calls.
|
|
7
|
+
#
|
|
8
|
+
# Fast, free, and should run first before expensive LLM-based checks.
|
|
9
|
+
module Deterministic
|
|
10
|
+
class << self
|
|
11
|
+
# Evaluates a deterministic assertion.
|
|
12
|
+
#
|
|
13
|
+
# @param type [Symbol] The assertion type
|
|
14
|
+
# @param output [String] The output to evaluate
|
|
15
|
+
# @param opts [Hash] Options for the assertion
|
|
16
|
+
# @return [Array] [:pass, details] or [:fail, details]
|
|
17
|
+
#
|
|
18
|
+
# @example
|
|
19
|
+
# evaluate(:contains, "Hello world", value: "world")
|
|
20
|
+
# #=> [:pass, { matched: ["world"] }]
|
|
21
|
+
def evaluate(type, output, opts = {})
|
|
22
|
+
send(:"evaluate_#{type}", output, opts)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
private
|
|
26
|
+
|
|
27
|
+
def evaluate_contains(output, opts)
|
|
28
|
+
values = Array(opts[:value] || opts[:values])
|
|
29
|
+
results = values.map { |v| [v, output.include?(v)] }
|
|
30
|
+
|
|
31
|
+
if results.all? { |_, matched| matched }
|
|
32
|
+
[:pass, { matched: values }]
|
|
33
|
+
else
|
|
34
|
+
missing = results.reject { |_, m| m }.map(&:first)
|
|
35
|
+
[:fail, { missing:, reason: "Output missing: #{missing.inspect}" }]
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def evaluate_not_contains(output, opts)
|
|
40
|
+
values = Array(opts[:value] || opts[:values])
|
|
41
|
+
found = values.select { |v| output.include?(v) }
|
|
42
|
+
|
|
43
|
+
if found.empty?
|
|
44
|
+
[:pass, { checked: values }]
|
|
45
|
+
else
|
|
46
|
+
[:fail, { found:, reason: "Output contains forbidden: #{found.inspect}" }]
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def evaluate_contains_any(output, opts)
|
|
51
|
+
values = Array(opts[:value] || opts[:values])
|
|
52
|
+
found = values.find { |v| output.include?(v) }
|
|
53
|
+
|
|
54
|
+
if found
|
|
55
|
+
[:pass, { matched: found }]
|
|
56
|
+
else
|
|
57
|
+
[:fail, { expected_any: values, reason: "Output contains none of: #{values.inspect}" }]
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def evaluate_contains_all(output, opts)
|
|
62
|
+
values = Array(opts[:value] || opts[:values])
|
|
63
|
+
results = values.map { |v| [v, output.include?(v)] }
|
|
64
|
+
|
|
65
|
+
if results.all? { |_, matched| matched }
|
|
66
|
+
[:pass, { matched: values }]
|
|
67
|
+
else
|
|
68
|
+
missing = results.reject { |_, m| m }.map(&:first)
|
|
69
|
+
[:fail, { missing:, reason: "Output missing: #{missing.inspect}" }]
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def evaluate_regex(output, opts)
|
|
74
|
+
pattern = opts[:value] || opts[:pattern]
|
|
75
|
+
regex = pattern.is_a?(Regexp) ? pattern : Regexp.new(pattern)
|
|
76
|
+
|
|
77
|
+
match = output.match(regex)
|
|
78
|
+
if match
|
|
79
|
+
[:pass, { matched: match[0], pattern: regex.source }]
|
|
80
|
+
else
|
|
81
|
+
[:fail, { pattern: regex.source, reason: "Pattern not found: #{regex.source}" }]
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def evaluate_is_json(output, _opts)
|
|
86
|
+
parsed = JSON.parse(output)
|
|
87
|
+
[:pass, { parsed: }]
|
|
88
|
+
rescue JSON::ParserError
|
|
89
|
+
[:fail, { reason: 'Invalid JSON' }]
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def evaluate_max_tokens(output, opts)
|
|
93
|
+
max = opts[:value] || opts[:max] || 500
|
|
94
|
+
|
|
95
|
+
# Approximate: 1 token ~= 0.75 words ~= 4 chars
|
|
96
|
+
# Using word count as a reasonable approximation
|
|
97
|
+
word_count = output.split(/\s+/).length
|
|
98
|
+
approx_tokens = (word_count / 0.75).ceil
|
|
99
|
+
|
|
100
|
+
if approx_tokens <= max
|
|
101
|
+
[:pass, { approx_tokens:, max: }]
|
|
102
|
+
else
|
|
103
|
+
[:fail, {
|
|
104
|
+
approx_tokens:,
|
|
105
|
+
max:,
|
|
106
|
+
reason: "Output ~#{approx_tokens} tokens exceeds max #{max}"
|
|
107
|
+
}]
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def evaluate_latency_ms(_output, opts)
|
|
112
|
+
max = opts[:value] || opts[:max] || 5000
|
|
113
|
+
actual = opts[:actual] || opts[:latency]
|
|
114
|
+
|
|
115
|
+
if actual.nil?
|
|
116
|
+
[:fail, { reason: 'No latency value provided in opts[:actual]' }]
|
|
117
|
+
elsif actual <= max
|
|
118
|
+
[:pass, { latency_ms: actual, max: }]
|
|
119
|
+
else
|
|
120
|
+
[:fail, { latency_ms: actual, max:, reason: "Latency #{actual}ms exceeds max #{max}ms" }]
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
def evaluate_starts_with(output, opts)
|
|
125
|
+
prefix = opts[:value]
|
|
126
|
+
|
|
127
|
+
if output.start_with?(prefix)
|
|
128
|
+
[:pass, { prefix: }]
|
|
129
|
+
else
|
|
130
|
+
[:fail, { expected: prefix, reason: "Output does not start with: #{prefix}" }]
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def evaluate_ends_with(output, opts)
|
|
135
|
+
suffix = opts[:value]
|
|
136
|
+
|
|
137
|
+
if output.end_with?(suffix)
|
|
138
|
+
[:pass, { suffix: }]
|
|
139
|
+
else
|
|
140
|
+
[:fail, { expected: suffix, reason: "Output does not end with: #{suffix}" }]
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
def evaluate_equals(output, opts)
|
|
145
|
+
expected = opts[:value]
|
|
146
|
+
|
|
147
|
+
if output == expected
|
|
148
|
+
[:pass, {}]
|
|
149
|
+
else
|
|
150
|
+
[:fail, { expected:, actual: output, reason: 'Output does not match expected' }]
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
def evaluate_min_length(output, opts)
|
|
155
|
+
min = opts[:value] || opts[:min]
|
|
156
|
+
length = output.length
|
|
157
|
+
|
|
158
|
+
if length >= min
|
|
159
|
+
[:pass, { length:, min: }]
|
|
160
|
+
else
|
|
161
|
+
[:fail, { length:, min:, reason: "Output length #{length} below minimum #{min}" }]
|
|
162
|
+
end
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def evaluate_max_length(output, opts)
|
|
166
|
+
max = opts[:value] || opts[:max]
|
|
167
|
+
length = output.length
|
|
168
|
+
|
|
169
|
+
if length <= max
|
|
170
|
+
[:pass, { length:, max: }]
|
|
171
|
+
else
|
|
172
|
+
[:fail, { length:, max:, reason: "Output length #{length} exceeds maximum #{max}" }]
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
def evaluate_word_count(output, opts)
|
|
177
|
+
min = opts[:min] || 0
|
|
178
|
+
max = opts[:max]
|
|
179
|
+
|
|
180
|
+
words = output.split(/\s+/).reject(&:empty?)
|
|
181
|
+
count = words.length
|
|
182
|
+
|
|
183
|
+
if count < min
|
|
184
|
+
[:fail, { word_count: count, min:, reason: "Word count #{count} below minimum #{min}" }]
|
|
185
|
+
elsif max && count > max
|
|
186
|
+
[:fail, { word_count: count, max:, reason: "Word count #{count} exceeds maximum #{max}" }]
|
|
187
|
+
else
|
|
188
|
+
[:pass, { word_count: count }]
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
def evaluate_is_url(output, _opts)
|
|
193
|
+
url = output.strip
|
|
194
|
+
|
|
195
|
+
if url.match?(%r{^https?://[^\s]+$})
|
|
196
|
+
[:pass, { url: }]
|
|
197
|
+
else
|
|
198
|
+
[:fail, { reason: 'Output is not a valid URL' }]
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
def evaluate_is_email(output, _opts)
|
|
203
|
+
email = output.strip
|
|
204
|
+
|
|
205
|
+
if email.match?(/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/)
|
|
206
|
+
[:pass, { email: }]
|
|
207
|
+
else
|
|
208
|
+
[:fail, { reason: 'Output is not a valid email' }]
|
|
209
|
+
end
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
def evaluate_levenshtein(output, opts)
|
|
213
|
+
target = opts[:value]
|
|
214
|
+
max_distance = opts[:max_distance] || 3
|
|
215
|
+
|
|
216
|
+
distance = levenshtein_distance(output, target)
|
|
217
|
+
|
|
218
|
+
if distance <= max_distance
|
|
219
|
+
[:pass, { distance:, max_distance: }]
|
|
220
|
+
else
|
|
221
|
+
[:fail, {
|
|
222
|
+
distance:,
|
|
223
|
+
max_distance:,
|
|
224
|
+
reason: "Edit distance #{distance} exceeds max #{max_distance}"
|
|
225
|
+
}]
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
# Levenshtein distance algorithm
|
|
230
|
+
def levenshtein_distance(s1, s2)
|
|
231
|
+
s1_chars = s1.chars
|
|
232
|
+
s2_chars = s2.chars
|
|
233
|
+
|
|
234
|
+
return s2_chars.length if s1_chars.empty?
|
|
235
|
+
return s1_chars.length if s2_chars.empty?
|
|
236
|
+
|
|
237
|
+
row = (0..s2_chars.length).to_a
|
|
238
|
+
|
|
239
|
+
s1_chars.each_with_index do |c1, i|
|
|
240
|
+
prev_row = row
|
|
241
|
+
row = [i + 1]
|
|
242
|
+
|
|
243
|
+
s2_chars.each_with_index do |c2, j|
|
|
244
|
+
cost = c1 == c2 ? 0 : 1
|
|
245
|
+
row << [
|
|
246
|
+
row[j] + 1, # deletion
|
|
247
|
+
prev_row[j + 1] + 1, # insertion
|
|
248
|
+
prev_row[j] + cost # substitution
|
|
249
|
+
].min
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
row.last
|
|
254
|
+
end
|
|
255
|
+
end
|
|
256
|
+
end
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
module Assertions
|
|
6
|
+
# Embedding-based semantic similarity assertions.
|
|
7
|
+
#
|
|
8
|
+
# Uses sentence embeddings to determine if two texts are semantically similar.
|
|
9
|
+
module Embedding
|
|
10
|
+
DEFAULT_THRESHOLD = 0.7
|
|
11
|
+
|
|
12
|
+
class << self
|
|
13
|
+
# Returns list of available embedding assertion types.
|
|
14
|
+
#
|
|
15
|
+
# @return [Array<Symbol>]
|
|
16
|
+
def available
|
|
17
|
+
[:similar]
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Evaluates semantic similarity between actual and expected output.
|
|
21
|
+
#
|
|
22
|
+
# @param test_case [TestCase] The test case
|
|
23
|
+
# @param opts [Hash] Options
|
|
24
|
+
# @option opts [Float] :threshold Similarity threshold (0.0 to 1.0). Default: 0.7
|
|
25
|
+
# @option opts [Proc] :similarity_fn Custom similarity function for testing
|
|
26
|
+
# @return [Array] [:pass, details], [:fail, details], or [:error, message]
|
|
27
|
+
#
|
|
28
|
+
# @example
|
|
29
|
+
# test_case = TestCase.new(
|
|
30
|
+
# actual_output: "The cat is sleeping",
|
|
31
|
+
# expected_output: "A feline is resting"
|
|
32
|
+
# )
|
|
33
|
+
#
|
|
34
|
+
# Embedding.evaluate(test_case, threshold: 0.8)
|
|
35
|
+
# #=> [:pass, { similarity: 0.85, threshold: 0.8 }]
|
|
36
|
+
def evaluate(test_case, opts = {})
|
|
37
|
+
if test_case.expected_output.nil?
|
|
38
|
+
return [:error,
|
|
39
|
+
'Similar assertion requires expected_output to be provided']
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
threshold = opts[:threshold] || DEFAULT_THRESHOLD
|
|
43
|
+
similarity_fn = opts[:similarity_fn] || method(:default_similarity)
|
|
44
|
+
|
|
45
|
+
result = similarity_fn.call(test_case.actual_output, test_case.expected_output, opts)
|
|
46
|
+
|
|
47
|
+
case result
|
|
48
|
+
in [:ok, similarity] if similarity >= threshold
|
|
49
|
+
[:pass, { similarity:, threshold: }]
|
|
50
|
+
in [:ok, similarity]
|
|
51
|
+
[:fail, {
|
|
52
|
+
similarity:,
|
|
53
|
+
threshold:,
|
|
54
|
+
reason: "Output is not semantically similar to expected (#{similarity.round(2)} < #{threshold})"
|
|
55
|
+
}]
|
|
56
|
+
in [:error, reason]
|
|
57
|
+
[:error, "Failed to compute similarity: #{reason}"]
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
private
|
|
62
|
+
|
|
63
|
+
def default_similarity(text1, text2, opts)
|
|
64
|
+
# Use RubyLLM embeddings
|
|
65
|
+
model = opts[:embedding_model] || 'text-embedding-3-small'
|
|
66
|
+
|
|
67
|
+
embedding1 = RubyLLM.embed(text1, model:).vectors.first
|
|
68
|
+
embedding2 = RubyLLM.embed(text2, model:).vectors.first
|
|
69
|
+
|
|
70
|
+
# Compute cosine similarity
|
|
71
|
+
similarity = cosine_similarity(embedding1, embedding2)
|
|
72
|
+
[:ok, similarity]
|
|
73
|
+
rescue StandardError => e
|
|
74
|
+
[:error, e.message]
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def cosine_similarity(vec1, vec2)
|
|
78
|
+
dot_product = vec1.zip(vec2).sum { |a, b| a * b }
|
|
79
|
+
magnitude1 = Math.sqrt(vec1.sum { |x| x**2 })
|
|
80
|
+
magnitude2 = Math.sqrt(vec2.sum { |x| x**2 })
|
|
81
|
+
|
|
82
|
+
return 0.0 if magnitude1.zero? || magnitude2.zero?
|
|
83
|
+
|
|
84
|
+
dot_product / (magnitude1 * magnitude2)
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
module Assertions
|
|
6
|
+
# LLM-as-judge assertions for evaluating LLM outputs.
|
|
7
|
+
#
|
|
8
|
+
# Uses RubyLLM to get structured verdicts from the judge model.
|
|
9
|
+
module Judge
|
|
10
|
+
DEFAULT_MODEL = 'anthropic:claude-3-5-haiku-latest'
|
|
11
|
+
DEFAULT_THRESHOLD = 0.8
|
|
12
|
+
|
|
13
|
+
SYSTEM_PROMPT = <<~PROMPT
|
|
14
|
+
You are a precise evaluator of LLM outputs. Your task is to assess outputs
|
|
15
|
+
based on specific criteria and provide structured verdicts.
|
|
16
|
+
|
|
17
|
+
Always respond with valid JSON containing:
|
|
18
|
+
- verdict: "yes", "no", or "partial"
|
|
19
|
+
- reason: A brief explanation
|
|
20
|
+
- score: A float from 0.0 to 1.0
|
|
21
|
+
|
|
22
|
+
Be objective and consistent in your evaluations.
|
|
23
|
+
PROMPT
|
|
24
|
+
|
|
25
|
+
class << self
|
|
26
|
+
# Returns list of available judge assertion types.
|
|
27
|
+
#
|
|
28
|
+
# @return [Array<Symbol>]
|
|
29
|
+
def available
|
|
30
|
+
Tribunal::Judge.all_judge_names
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Evaluates a judge assertion against a test case.
|
|
34
|
+
#
|
|
35
|
+
# @param type [Symbol] The judge type
|
|
36
|
+
# @param test_case [TestCase] The test case
|
|
37
|
+
# @param opts [Hash] Options
|
|
38
|
+
# @return [Array] [:pass, details], [:fail, details], or [:error, message]
|
|
39
|
+
def evaluate(type, test_case, opts)
|
|
40
|
+
judge_class = Tribunal::Judge.find(type)
|
|
41
|
+
return [:error, "Unknown judge assertion: #{type}"] unless judge_class
|
|
42
|
+
|
|
43
|
+
# Validate test case
|
|
44
|
+
error = judge_class.validate(test_case) if judge_class.respond_to?(:validate)
|
|
45
|
+
return [:error, error] if error
|
|
46
|
+
|
|
47
|
+
run_judge(judge_class, test_case, opts)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
private
|
|
51
|
+
|
|
52
|
+
def run_judge(judge_class, test_case, opts)
|
|
53
|
+
model = opts[:model] || Tribunal.configuration.default_model || DEFAULT_MODEL
|
|
54
|
+
threshold = opts[:threshold] || Tribunal.configuration.default_threshold || DEFAULT_THRESHOLD
|
|
55
|
+
prompt = judge_class.prompt(test_case, opts)
|
|
56
|
+
|
|
57
|
+
response = call_llm(model, prompt, opts)
|
|
58
|
+
|
|
59
|
+
case response
|
|
60
|
+
in [:ok, result]
|
|
61
|
+
# Check if judge has custom evaluation
|
|
62
|
+
if judge_class.respond_to?(:evaluate_result)
|
|
63
|
+
custom_result = judge_class.evaluate_result(result, opts)
|
|
64
|
+
return custom_result if custom_result
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
negative_metric = judge_class.respond_to?(:negative_metric?) && judge_class.negative_metric?
|
|
68
|
+
interpret_response(result, threshold, negative_metric)
|
|
69
|
+
in [:error, reason]
|
|
70
|
+
[:error, reason.to_s]
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def call_llm(model, prompt, opts)
|
|
75
|
+
# Allow injecting custom LLM for tests via opts[:llm]
|
|
76
|
+
return opts[:llm].call(model, build_messages(prompt), opts) if opts[:llm]
|
|
77
|
+
|
|
78
|
+
call_ruby_llm(model, prompt, opts)
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def build_messages(prompt)
|
|
82
|
+
[
|
|
83
|
+
{ role: 'system', content: SYSTEM_PROMPT },
|
|
84
|
+
{ role: 'user', content: prompt }
|
|
85
|
+
]
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
def call_ruby_llm(model, prompt, _opts)
|
|
89
|
+
# Parse model string (e.g., "anthropic:claude-3-5-haiku-latest")
|
|
90
|
+
_provider, model_name = parse_model(model)
|
|
91
|
+
|
|
92
|
+
chat = RubyLLM.chat(model: model_name)
|
|
93
|
+
|
|
94
|
+
# Add system message
|
|
95
|
+
chat.with_instructions(SYSTEM_PROMPT)
|
|
96
|
+
|
|
97
|
+
# Get response
|
|
98
|
+
response = chat.ask(prompt)
|
|
99
|
+
content = response.content
|
|
100
|
+
|
|
101
|
+
# Parse JSON response
|
|
102
|
+
parsed = JSON.parse(content)
|
|
103
|
+
[:ok, parsed]
|
|
104
|
+
rescue JSON::ParserError => e
|
|
105
|
+
# Try to extract JSON from response
|
|
106
|
+
if (match = content&.match(/\{[\s\S]*\}/))
|
|
107
|
+
begin
|
|
108
|
+
parsed = JSON.parse(match[0])
|
|
109
|
+
return [:ok, parsed]
|
|
110
|
+
rescue JSON::ParserError
|
|
111
|
+
# Fall through to error
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
[:error, "Failed to parse LLM response as JSON: #{e.message}"]
|
|
115
|
+
rescue StandardError => e
|
|
116
|
+
[:error, e.message]
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def parse_model(model_string)
|
|
120
|
+
if model_string.include?(':')
|
|
121
|
+
model_string.split(':', 2)
|
|
122
|
+
else
|
|
123
|
+
[nil, model_string]
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
def interpret_response(response, threshold, negative_metric)
|
|
128
|
+
details = {
|
|
129
|
+
verdict: response['verdict'],
|
|
130
|
+
reason: response['reason'],
|
|
131
|
+
score: response['score']
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
verdict_result(response['verdict'], response['score'], threshold, negative_metric, details)
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def verdict_result(verdict, score, threshold, negative_metric, details)
|
|
138
|
+
return [:error, "Unexpected verdict: #{verdict}"] unless %w[yes no partial].include?(verdict)
|
|
139
|
+
|
|
140
|
+
passed = case verdict
|
|
141
|
+
when 'yes' then !negative_metric
|
|
142
|
+
when 'no' then negative_metric
|
|
143
|
+
when 'partial' then score.is_a?(Numeric) && score >= threshold
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
passed ? [:pass, details] : [:fail, details]
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
end
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
# Assertion evaluation engine.
|
|
6
|
+
#
|
|
7
|
+
# Routes assertions to the appropriate implementation:
|
|
8
|
+
# - Deterministic: `contains`, `regex`, `is_json`, etc.
|
|
9
|
+
# - Judge (requires ruby_llm): `faithful`, `relevant`, etc.
|
|
10
|
+
# - Embedding (requires neighbor): `similar`
|
|
11
|
+
module Assertions
|
|
12
|
+
DETERMINISTIC_ASSERTIONS = %i[
|
|
13
|
+
contains
|
|
14
|
+
not_contains
|
|
15
|
+
contains_any
|
|
16
|
+
contains_all
|
|
17
|
+
regex
|
|
18
|
+
is_json
|
|
19
|
+
max_tokens
|
|
20
|
+
latency_ms
|
|
21
|
+
starts_with
|
|
22
|
+
ends_with
|
|
23
|
+
equals
|
|
24
|
+
min_length
|
|
25
|
+
max_length
|
|
26
|
+
word_count
|
|
27
|
+
is_url
|
|
28
|
+
is_email
|
|
29
|
+
levenshtein
|
|
30
|
+
].freeze
|
|
31
|
+
|
|
32
|
+
EMBEDDING_ASSERTIONS = %i[similar].freeze
|
|
33
|
+
|
|
34
|
+
class << self
|
|
35
|
+
# Evaluates a single assertion against a test case.
|
|
36
|
+
#
|
|
37
|
+
# @param assertion_type [Symbol] The type of assertion
|
|
38
|
+
# @param test_case [TestCase] The test case to evaluate
|
|
39
|
+
# @param opts [Hash] Options for the assertion
|
|
40
|
+
# @return [Array] [:pass, details] or [:fail, details]
|
|
41
|
+
def evaluate(assertion_type, test_case, opts = {})
|
|
42
|
+
if DETERMINISTIC_ASSERTIONS.include?(assertion_type)
|
|
43
|
+
Deterministic.evaluate(assertion_type, test_case.actual_output, opts)
|
|
44
|
+
elsif Tribunal::Judge.builtin_judge?(assertion_type) || Tribunal::Judge.custom_judge?(assertion_type)
|
|
45
|
+
evaluate_judge(assertion_type, test_case, opts)
|
|
46
|
+
elsif EMBEDDING_ASSERTIONS.include?(assertion_type)
|
|
47
|
+
evaluate_embedding(assertion_type, test_case, opts)
|
|
48
|
+
else
|
|
49
|
+
[:error, "Unknown assertion type: #{assertion_type}"]
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Evaluates multiple assertions against a test case.
|
|
54
|
+
#
|
|
55
|
+
# @param assertions [Array, Hash] Assertions to evaluate
|
|
56
|
+
# @param test_case [TestCase] The test case to evaluate
|
|
57
|
+
# @return [Hash] Map of assertion_type => result
|
|
58
|
+
def evaluate_all(assertions, test_case)
|
|
59
|
+
normalized = normalize_assertions(assertions)
|
|
60
|
+
normalized.each_with_object({}) do |(type, opts), results|
|
|
61
|
+
results[type] = evaluate(type, test_case, opts)
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
# Checks if all assertions passed.
|
|
66
|
+
#
|
|
67
|
+
# @param results [Hash] Results from evaluate_all
|
|
68
|
+
# @return [Boolean] True if all passed
|
|
69
|
+
def all_passed?(results)
|
|
70
|
+
results.all? { |_type, result| result.first == :pass }
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# Returns list of available assertion types based on loaded dependencies.
|
|
74
|
+
#
|
|
75
|
+
# @return [Array<Symbol>] Available assertion types
|
|
76
|
+
def available
|
|
77
|
+
base = DETERMINISTIC_ASSERTIONS.dup
|
|
78
|
+
|
|
79
|
+
# Add judge assertions
|
|
80
|
+
base.concat(Tribunal::Judge.all_judge_names)
|
|
81
|
+
|
|
82
|
+
# Add embedding assertions if neighbor is available
|
|
83
|
+
begin
|
|
84
|
+
require 'neighbor'
|
|
85
|
+
base.concat(EMBEDDING_ASSERTIONS)
|
|
86
|
+
rescue LoadError
|
|
87
|
+
# neighbor not available
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
base
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
private
|
|
94
|
+
|
|
95
|
+
def normalize_assertions(assertions)
|
|
96
|
+
case assertions
|
|
97
|
+
when Hash
|
|
98
|
+
assertions.map do |type, opts|
|
|
99
|
+
opts = { value: opts } unless opts.is_a?(Hash)
|
|
100
|
+
[type.to_sym, opts]
|
|
101
|
+
end
|
|
102
|
+
when Array
|
|
103
|
+
assertions.map do |item|
|
|
104
|
+
case item
|
|
105
|
+
when Symbol, String
|
|
106
|
+
[item.to_sym, {}]
|
|
107
|
+
when Array
|
|
108
|
+
type, opts = item
|
|
109
|
+
opts = { value: opts } unless opts.is_a?(Hash)
|
|
110
|
+
[type.to_sym, opts]
|
|
111
|
+
else
|
|
112
|
+
raise ArgumentError, "Invalid assertion format: #{item.inspect}"
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
else
|
|
116
|
+
raise ArgumentError, 'Assertions must be a Hash or Array'
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def evaluate_judge(type, test_case, opts)
|
|
121
|
+
Assertions::Judge.evaluate(type, test_case, opts)
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
def evaluate_embedding(_type, test_case, opts)
|
|
125
|
+
begin
|
|
126
|
+
require 'neighbor'
|
|
127
|
+
rescue LoadError
|
|
128
|
+
raise Error, <<~MSG
|
|
129
|
+
Embedding similarity requires the 'neighbor' gem.
|
|
130
|
+
|
|
131
|
+
Add to your Gemfile:
|
|
132
|
+
gem 'neighbor', '~> 0.4'
|
|
133
|
+
MSG
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
Embedding.evaluate(test_case, opts)
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
# Configuration for Tribunal.
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# RubyLLM::Tribunal.configure do |config|
|
|
9
|
+
# config.default_model = "anthropic:claude-3-5-haiku-latest"
|
|
10
|
+
# config.default_threshold = 0.8
|
|
11
|
+
# config.verbose = true
|
|
12
|
+
# end
|
|
13
|
+
class Configuration
|
|
14
|
+
# Default LLM model for judge assertions
|
|
15
|
+
# @return [String]
|
|
16
|
+
attr_accessor :default_model
|
|
17
|
+
|
|
18
|
+
# Default threshold for judge assertions (0.0-1.0)
|
|
19
|
+
# @return [Float]
|
|
20
|
+
attr_accessor :default_threshold
|
|
21
|
+
|
|
22
|
+
# Whether to print verbose output
|
|
23
|
+
# @return [Boolean]
|
|
24
|
+
attr_accessor :verbose
|
|
25
|
+
|
|
26
|
+
# Custom judges to register
|
|
27
|
+
# @return [Array<Class>]
|
|
28
|
+
attr_accessor :custom_judges
|
|
29
|
+
|
|
30
|
+
def initialize
|
|
31
|
+
@default_model = 'anthropic:claude-3-5-haiku-latest'
|
|
32
|
+
@default_threshold = 0.8
|
|
33
|
+
@verbose = false
|
|
34
|
+
@custom_judges = []
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|