ruby_llm-tribunal 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +32 -0
- data/LICENSE.txt +21 -0
- data/README.md +442 -0
- data/lib/ruby_llm/tribunal/assertions/deterministic.rb +259 -0
- data/lib/ruby_llm/tribunal/assertions/embedding.rb +90 -0
- data/lib/ruby_llm/tribunal/assertions/judge.rb +152 -0
- data/lib/ruby_llm/tribunal/assertions.rb +141 -0
- data/lib/ruby_llm/tribunal/configuration.rb +38 -0
- data/lib/ruby_llm/tribunal/dataset.rb +118 -0
- data/lib/ruby_llm/tribunal/eval_helpers.rb +288 -0
- data/lib/ruby_llm/tribunal/judge.rb +166 -0
- data/lib/ruby_llm/tribunal/judges/bias.rb +79 -0
- data/lib/ruby_llm/tribunal/judges/correctness.rb +68 -0
- data/lib/ruby_llm/tribunal/judges/faithful.rb +77 -0
- data/lib/ruby_llm/tribunal/judges/hallucination.rb +85 -0
- data/lib/ruby_llm/tribunal/judges/harmful.rb +90 -0
- data/lib/ruby_llm/tribunal/judges/jailbreak.rb +77 -0
- data/lib/ruby_llm/tribunal/judges/pii.rb +118 -0
- data/lib/ruby_llm/tribunal/judges/refusal.rb +79 -0
- data/lib/ruby_llm/tribunal/judges/relevant.rb +65 -0
- data/lib/ruby_llm/tribunal/judges/toxicity.rb +63 -0
- data/lib/ruby_llm/tribunal/red_team.rb +306 -0
- data/lib/ruby_llm/tribunal/reporter.rb +48 -0
- data/lib/ruby_llm/tribunal/reporters/console.rb +120 -0
- data/lib/ruby_llm/tribunal/reporters/github.rb +26 -0
- data/lib/ruby_llm/tribunal/reporters/html.rb +185 -0
- data/lib/ruby_llm/tribunal/reporters/json.rb +31 -0
- data/lib/ruby_llm/tribunal/reporters/junit.rb +58 -0
- data/lib/ruby_llm/tribunal/reporters/text.rb +120 -0
- data/lib/ruby_llm/tribunal/test_case.rb +124 -0
- data/lib/ruby_llm/tribunal/version.rb +7 -0
- data/lib/ruby_llm/tribunal.rb +130 -0
- data/lib/ruby_llm-tribunal.rb +3 -0
- data/lib/tasks/tribunal.rake +269 -0
- metadata +99 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ruby_llm/tribunal'
|
|
4
|
+
|
|
5
|
+
namespace :tribunal do
|
|
6
|
+
# tribunal:eval — runs every dataset file and reports the aggregated outcome.
#
# File selection precedence: the rake task argument (comma-separated), then the
# --files= flag, then the default test/evals and spec/evals globs.
# Exits 0 early when no files are found, and exits 1 when the run does not
# satisfy --strict / --threshold.
desc 'Run LLM evaluations from dataset files'
task :eval, [:files] => :environment do |_t, args|
  options = parse_options

  files = args[:files]&.split(',') || options[:files] || find_default_files

  if files.empty?
    puts 'No eval files found. Create datasets in test/evals/ or spec/evals/'
    exit 0
  end

  provider = parse_provider(options[:provider])
  format = options[:format] || 'console'
  output = options[:output]
  threshold = options[:threshold]
  strict = options[:strict] || false
  concurrency = options[:concurrency] || 1

  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)

  case_results = files.flat_map { |file| load_and_run(file, provider, concurrency) }
  aggregated = aggregate_results(case_results, start_time)
  summary = aggregated[:summary]

  # Strict mode wins over a threshold; with neither set the run always passes.
  passed = strict ? summary[:failed].zero? : (threshold.nil? || summary[:pass_rate] >= threshold)

  summary.merge!(threshold_passed: passed, threshold: threshold, strict: strict)

  formatted = RubyLLM::Tribunal::Reporter.format(aggregated, format)

  # Either persist the report and announce where it went, or print it directly.
  File.write(output, formatted) if output
  puts(output ? "Results written to #{output}" : formatted)

  exit 1 unless passed
end
|
|
59
|
+
|
|
60
|
+
# tribunal:init — scaffolds the eval directory layout with example datasets.
#
# The target root is read from the TRIBUNAL_BASE_DIR environment variable and
# defaults to the current directory. Directories and files are created through
# the create_dir / create_file helpers, then a summary is printed.
desc 'Initialize eval directory structure'
task :init do
  base_dir = ENV.fetch('TRIBUNAL_BASE_DIR', '.')

  create_dir(File.join(base_dir, 'test/evals'))
  create_dir(File.join(base_dir, 'test/evals/datasets'))

  create_file(File.join(base_dir, 'test/evals/datasets/example.json'), example_dataset_json)
  create_file(File.join(base_dir, 'test/evals/datasets/example.yaml'), example_dataset_yaml)

  # NOTE(review): the tree below is printed relative to the project root and
  # does not include base_dir — confirm that is intended.
  puts <<~MSG

    ✅ Created eval structure:

      test/evals/
      └── datasets/
          ├── example.json
          └── example.yaml

    Run evals with: rake tribunal:eval
  MSG
end
|
|
82
|
+
|
|
83
|
+
private
|
|
84
|
+
|
|
85
|
+
# Extracts the eval task's command-line flags from ARGV.
#
# Recognised flags:
#   --format=NAME       reporter format (String)
#   --output=PATH       file to write the report to (String)
#   --provider=SPEC     provider spec, e.g. MyApp::RAG:query (String)
#   --threshold=FLOAT   minimum pass rate (Float)
#   --strict            fail on any failed case (true)
#   --concurrency=INT   number of worker threads (Integer)
#   --files=A,B         comma-separated dataset paths (Array<String>)
#
# Any other argument is ignored. Numeric flags are validated strictly: a
# non-numeric --threshold used to be coerced to 0.0 (which makes every run
# pass the threshold check) and a non-numeric --concurrency to 0, hiding typos.
#
# @return [Hash{Symbol => Object}] only the flags that were present
# @raise [ArgumentError] if --threshold or --concurrency is not a valid number
def parse_options
  ARGV.each_with_object({}) do |arg, options|
    case arg
    when '--strict'
      options[:strict] = true
    when /\A--(format|output|provider)=(.+)\z/
      options[Regexp.last_match(1).to_sym] = Regexp.last_match(2)
    when /\A--threshold=(.+)\z/
      raw = Regexp.last_match(1)
      options[:threshold] = Float(raw, exception: false) ||
                            raise(ArgumentError, "Invalid --threshold value #{raw.inspect}: expected a number")
    when /\A--concurrency=(.+)\z/
      raw = Regexp.last_match(1)
      # Base 10 is forced so values such as "08" are not rejected as bad octal.
      options[:concurrency] = Integer(raw, 10, exception: false) ||
                              raise(ArgumentError, "Invalid --concurrency value #{raw.inspect}: expected an integer")
    when /\A--files=(.+)\z/
      options[:files] = Regexp.last_match(1).split(',')
    end
  end
end
|
|
107
|
+
|
|
108
|
+
# Globs the conventional dataset locations relative to the working directory.
#
# Searches test/evals first, then spec/evals; within each, JSON files come
# before .yaml and .yml files.
#
# @return [Array<String>] matching paths (empty when nothing is found)
def find_default_files
  roots = %w[test spec]
  extensions = %w[json yaml yml]

  roots.product(extensions).flat_map do |root, ext|
    Dir.glob("#{root}/evals/**/*.#{ext}")
  end
end
|
|
119
|
+
|
|
120
|
+
# Resolves a "Module:method" provider spec into a callable target.
#
# The spec is split on its LAST colon so that namespaced constants such as
# "MyApp::RAG:query" work; splitting on every ':' breaks that example into
# four pieces and rejects it.
#
# @param str [String, nil] provider spec, e.g. "MyApp::RAG:query"
# @return [Array(Module, Symbol), nil] the resolved constant and method name,
#   or nil when no spec was given
# @raise [RuntimeError] if the spec is not of the form Module:method
# @raise [NameError] if the constant path cannot be resolved
def parse_provider(str)
  return nil unless str

  mod_name, _colon, method_name = str.rpartition(':')
  segments = mod_name.split('::', -1)

  # Every constant segment must be non-empty and free of stray colons.
  well_formed = !method_name.empty? && !segments.empty? &&
                segments.none? { |segment| segment.empty? || segment.include?(':') }
  raise 'Invalid provider format. Use Module:method (e.g. MyApp::RAG:query)' unless well_formed

  mod = segments.reduce(Object) { |scope, const| scope.const_get(const) }
  [mod, method_name.to_sym]
end
|
|
129
|
+
|
|
130
|
+
# Loads one dataset file and evaluates each of its cases.
#
# Cases run sequentially unless concurrency is greater than 1, in which case
# the `parallel` library is required on demand and the cases are mapped across
# that many threads.
#
# @param path [String] dataset file to load
# @param provider [Array, nil] provider pair forwarded to run_case
# @param concurrency [Integer] number of threads to use
# @return [Array<Hash>] one result hash per case, as built by run_case
def load_and_run(path, provider, concurrency)
  puts "Loading #{path}..."

  pairs = RubyLLM::Tribunal::Dataset.load_with_assertions(path)

  return pairs.map { |kase, checks| run_case(kase, checks, provider) } unless concurrency > 1

  require 'parallel'
  Parallel.map(pairs, in_threads: concurrency) do |kase, checks|
    run_case(kase, checks, provider)
  end
end
|
|
144
|
+
|
|
145
|
+
# Evaluates a single test case and summarises the outcome.
#
# When a provider pair is given, its method is invoked with the test case and
# the return value is attached via test_case.with_output. Assertions are only
# evaluated when the case has an actual output; otherwise no results are
# recorded and the case counts as passed.
#
# @param test_case [#input, #actual_output, #with_output] case to evaluate
# @param assertions [Object] assertions handed to Assertions.evaluate_all
# @param provider [Array, nil] [receiver, method_name] pair, or nil
# @return [Hash] :input, :status (:passed or :failed), :failures
#   ([type, reason] pairs), :results and :duration_ms
def run_case(test_case, assertions, provider)
  now_ms = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) }
  started_at = now_ms.call

  if provider
    receiver, message = provider
    test_case = test_case.with_output(receiver.send(message, test_case))
  end

  results = test_case.actual_output ? RubyLLM::Tribunal::Assertions.evaluate_all(assertions, test_case) : {}

  elapsed = now_ms.call - started_at

  # Keep only failing assertions, reduced to their type and stated reason.
  failures = results.filter_map do |type, outcome|
    next unless outcome.first == :fail

    _status, details = outcome
    [type, details[:reason]]
  end

  {
    input: test_case.input,
    status: failures.empty? ? :passed : :failed,
    failures: failures,
    results: results,
    duration_ms: elapsed
  }
end
|
|
174
|
+
|
|
175
|
+
# Rolls the per-case result hashes up into the structure handed to the reporter.
#
# @param cases [Array<Hash>] case results, each carrying a :status key
# @param start_time [Integer] monotonic clock reading in milliseconds taken
#   when the run started
# @return [Hash] :summary (total, passed, failed, pass_rate, duration_ms),
#   :metrics (from aggregate_metrics) and the original :cases
def aggregate_results(cases, start_time)
  elapsed_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - start_time

  count_with = ->(status) { cases.count { |entry| entry[:status] == status } }
  passed_count = count_with.call(:passed)
  failed_count = count_with.call(:failed)
  total_count = cases.length

  # An empty run reports a pass rate of 0 rather than dividing by zero.
  rate = total_count.positive? ? passed_count.to_f / total_count : 0

  {
    summary: {
      total: total_count,
      passed: passed_count,
      failed: failed_count,
      pass_rate: rate,
      duration_ms: elapsed_ms
    },
    metrics: aggregate_metrics(cases),
    cases: cases
  }
end
|
|
196
|
+
|
|
197
|
+
# Tallies, per assertion type, how many evaluations passed out of how many ran.
#
# An evaluation counts as passed only when the first element of its result is
# :pass. Types appear in the order they are first encountered.
#
# @param cases [Array<Hash>] case results, each with a :results collection of
#   type => result pairs
# @return [Hash{Object => Hash}] type => { passed: Integer, total: Integer }
def aggregate_metrics(cases)
  cases.each_with_object({}) do |kase, tally|
    kase[:results].each do |type, result|
      entry = (tally[type] ||= { passed: 0, total: 0 })
      entry[:total] += 1
      entry[:passed] += 1 if result.first == :pass
    end
  end
end
|
|
208
|
+
|
|
209
|
+
# Creates the directory at +path+, including any missing parents, and
# announces it on standard output.
#
# @param path [String] directory to create
# @return [nil]
def create_dir(path)
  FileUtils.makedirs(path)
  $stdout.puts "Created #{path}/"
end
|
|
213
|
+
|
|
214
|
+
# Writes +content+ to +path+ and announces it, but only when nothing exists at
# that path yet; an existing file is left untouched and nothing is printed.
#
# @param path [String] file to create
# @param content [String] data to write
# @return [nil]
def create_file(path, content)
  unless File.exist?(path)
    File.write(path, content)
    puts "Created #{path}"
  end
end
|
|
220
|
+
|
|
221
|
+
# Returns the text of the sample JSON dataset written by the init task: two
# cases, each with an input, a context and deterministic expectations
# (contains / contains_any / not_contains).
#
# @return [String] JSON document
def example_dataset_json
  <<~JSON
    [
      {
        "input": "What is the return policy?",
        "context": "Returns are accepted within 30 days of purchase with a valid receipt. Items must be in original condition.",
        "expected": {
          "contains": ["30 days", "receipt"],
          "not_contains": ["no returns"]
        }
      },
      {
        "input": "Do you ship internationally?",
        "context": "We currently ship to the United States and Canada only.",
        "expected": {
          "contains_any": ["United States", "US", "Canada"],
          "not_contains": ["worldwide", "international"]
        }
      }
    ]
  JSON
end
|
|
243
|
+
|
|
244
|
+
# Returns the text of the sample YAML dataset written by the init task: two
# cases, each with an input, a context and deterministic expectations
# (contains / contains_any / regex).
#
# @return [String] YAML document
def example_dataset_yaml
  <<~YAML
    - input: What is the return policy?
      context: Returns are accepted within 30 days of purchase with a valid receipt.
      expected:
        contains:
          - 30 days
          - receipt

    - input: What are the store hours?
      context: We are open Monday through Friday, 9am to 5pm.
      expected:
        contains_any:
          - "9am"
          - "9:00"
        # The heredoc halves the backslashes, so the emitted file holds "\\d+[ap]m".
        regex: "\\\\d+[ap]m"
  YAML
end
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
# tribunal:eval lists :environment as a prerequisite. When no such task has
# been defined yet (e.g. outside Rails), register an empty one so the
# prerequisite still resolves.
task(:environment) {} unless Rake::Task.task_defined?(:environment)
|
metadata
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: ruby_llm-tribunal
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Florian
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-01-15 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: ruby_llm
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.0'
|
|
27
|
+
description: |-
|
|
28
|
+
Tribunal provides tools for evaluating LLM outputs, detecting hallucinations,
|
|
29
|
+
measuring response quality, and ensuring safety. Features deterministic assertions,
|
|
30
|
+
LLM-as-judge evaluations, red team attack generation, and multiple output formats.
|
|
31
|
+
A RubyLLM plugin inspired by the Elixir Tribunal library.
|
|
32
|
+
email:
|
|
33
|
+
- florian@alqemist.com
|
|
34
|
+
executables: []
|
|
35
|
+
extensions: []
|
|
36
|
+
extra_rdoc_files: []
|
|
37
|
+
files:
|
|
38
|
+
- CHANGELOG.md
|
|
39
|
+
- LICENSE.txt
|
|
40
|
+
- README.md
|
|
41
|
+
- lib/ruby_llm-tribunal.rb
|
|
42
|
+
- lib/ruby_llm/tribunal.rb
|
|
43
|
+
- lib/ruby_llm/tribunal/assertions.rb
|
|
44
|
+
- lib/ruby_llm/tribunal/assertions/deterministic.rb
|
|
45
|
+
- lib/ruby_llm/tribunal/assertions/embedding.rb
|
|
46
|
+
- lib/ruby_llm/tribunal/assertions/judge.rb
|
|
47
|
+
- lib/ruby_llm/tribunal/configuration.rb
|
|
48
|
+
- lib/ruby_llm/tribunal/dataset.rb
|
|
49
|
+
- lib/ruby_llm/tribunal/eval_helpers.rb
|
|
50
|
+
- lib/ruby_llm/tribunal/judge.rb
|
|
51
|
+
- lib/ruby_llm/tribunal/judges/bias.rb
|
|
52
|
+
- lib/ruby_llm/tribunal/judges/correctness.rb
|
|
53
|
+
- lib/ruby_llm/tribunal/judges/faithful.rb
|
|
54
|
+
- lib/ruby_llm/tribunal/judges/hallucination.rb
|
|
55
|
+
- lib/ruby_llm/tribunal/judges/harmful.rb
|
|
56
|
+
- lib/ruby_llm/tribunal/judges/jailbreak.rb
|
|
57
|
+
- lib/ruby_llm/tribunal/judges/pii.rb
|
|
58
|
+
- lib/ruby_llm/tribunal/judges/refusal.rb
|
|
59
|
+
- lib/ruby_llm/tribunal/judges/relevant.rb
|
|
60
|
+
- lib/ruby_llm/tribunal/judges/toxicity.rb
|
|
61
|
+
- lib/ruby_llm/tribunal/red_team.rb
|
|
62
|
+
- lib/ruby_llm/tribunal/reporter.rb
|
|
63
|
+
- lib/ruby_llm/tribunal/reporters/console.rb
|
|
64
|
+
- lib/ruby_llm/tribunal/reporters/github.rb
|
|
65
|
+
- lib/ruby_llm/tribunal/reporters/html.rb
|
|
66
|
+
- lib/ruby_llm/tribunal/reporters/json.rb
|
|
67
|
+
- lib/ruby_llm/tribunal/reporters/junit.rb
|
|
68
|
+
- lib/ruby_llm/tribunal/reporters/text.rb
|
|
69
|
+
- lib/ruby_llm/tribunal/test_case.rb
|
|
70
|
+
- lib/ruby_llm/tribunal/version.rb
|
|
71
|
+
- lib/tasks/tribunal.rake
|
|
72
|
+
homepage: https://github.com/Alqemist-labs/ruby_llm-tribunal
|
|
73
|
+
licenses:
|
|
74
|
+
- MIT
|
|
75
|
+
metadata:
|
|
76
|
+
homepage_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal
|
|
77
|
+
source_code_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal
|
|
78
|
+
changelog_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal/blob/main/CHANGELOG.md
|
|
79
|
+
rubygems_mfa_required: 'true'
|
|
80
|
+
post_install_message:
|
|
81
|
+
rdoc_options: []
|
|
82
|
+
require_paths:
|
|
83
|
+
- lib
|
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - ">="
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: 3.2.0
|
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
|
+
requirements:
|
|
91
|
+
- - ">="
|
|
92
|
+
- !ruby/object:Gem::Version
|
|
93
|
+
version: '0'
|
|
94
|
+
requirements: []
|
|
95
|
+
rubygems_version: 3.0.3.1
|
|
96
|
+
signing_key:
|
|
97
|
+
specification_version: 4
|
|
98
|
+
summary: LLM evaluation framework for Ruby
|
|
99
|
+
test_files: []
|