ruby_llm-tribunal 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +32 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +442 -0
  5. data/lib/ruby_llm/tribunal/assertions/deterministic.rb +259 -0
  6. data/lib/ruby_llm/tribunal/assertions/embedding.rb +90 -0
  7. data/lib/ruby_llm/tribunal/assertions/judge.rb +152 -0
  8. data/lib/ruby_llm/tribunal/assertions.rb +141 -0
  9. data/lib/ruby_llm/tribunal/configuration.rb +38 -0
  10. data/lib/ruby_llm/tribunal/dataset.rb +118 -0
  11. data/lib/ruby_llm/tribunal/eval_helpers.rb +288 -0
  12. data/lib/ruby_llm/tribunal/judge.rb +166 -0
  13. data/lib/ruby_llm/tribunal/judges/bias.rb +79 -0
  14. data/lib/ruby_llm/tribunal/judges/correctness.rb +68 -0
  15. data/lib/ruby_llm/tribunal/judges/faithful.rb +77 -0
  16. data/lib/ruby_llm/tribunal/judges/hallucination.rb +85 -0
  17. data/lib/ruby_llm/tribunal/judges/harmful.rb +90 -0
  18. data/lib/ruby_llm/tribunal/judges/jailbreak.rb +77 -0
  19. data/lib/ruby_llm/tribunal/judges/pii.rb +118 -0
  20. data/lib/ruby_llm/tribunal/judges/refusal.rb +79 -0
  21. data/lib/ruby_llm/tribunal/judges/relevant.rb +65 -0
  22. data/lib/ruby_llm/tribunal/judges/toxicity.rb +63 -0
  23. data/lib/ruby_llm/tribunal/red_team.rb +306 -0
  24. data/lib/ruby_llm/tribunal/reporter.rb +48 -0
  25. data/lib/ruby_llm/tribunal/reporters/console.rb +120 -0
  26. data/lib/ruby_llm/tribunal/reporters/github.rb +26 -0
  27. data/lib/ruby_llm/tribunal/reporters/html.rb +185 -0
  28. data/lib/ruby_llm/tribunal/reporters/json.rb +31 -0
  29. data/lib/ruby_llm/tribunal/reporters/junit.rb +58 -0
  30. data/lib/ruby_llm/tribunal/reporters/text.rb +120 -0
  31. data/lib/ruby_llm/tribunal/test_case.rb +124 -0
  32. data/lib/ruby_llm/tribunal/version.rb +7 -0
  33. data/lib/ruby_llm/tribunal.rb +130 -0
  34. data/lib/ruby_llm-tribunal.rb +3 -0
  35. data/lib/tasks/tribunal.rake +269 -0
  36. metadata +99 -0
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ruby_llm/tribunal'
4
+
5
+ namespace :tribunal do
6
# tribunal:eval — runs every selected dataset through the assertions, renders
# the aggregated report in the requested format, and exits with status 1 when
# the --strict / --threshold gate is not met.
desc 'Run LLM evaluations from dataset files'
task :eval, [:files] => :environment do |_t, args|
  cli = parse_options

  # Dataset precedence: rake task argument, then --files=, then the default globs.
  files = args[:files]&.split(',') || cli[:files] || find_default_files

  if files.empty?
    puts 'No eval files found. Create datasets in test/evals/ or spec/evals/'
    exit 0
  end

  provider = parse_provider(cli[:provider])
  format = cli[:format] || 'console'
  output = cli[:output]
  threshold = cli[:threshold]
  strict = cli[:strict] || false
  concurrency = cli[:concurrency] || 1

  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)

  report = aggregate_results(
    files.flat_map { |file| load_and_run(file, provider, concurrency) },
    started_at
  )
  summary = report[:summary]

  # --strict wins over --threshold; with neither flag the run always passes.
  passed = if strict
             summary[:failed].zero?
           else
             !threshold || summary[:pass_rate] >= threshold
           end

  # Record the gate outcome in the summary handed to the reporter.
  summary.merge!(threshold_passed: passed, threshold: threshold, strict: strict)

  formatted = RubyLLM::Tribunal::Reporter.format(report, format)

  if output
    File.write(output, formatted)
    puts "Results written to #{output}"
  else
    puts formatted
  end

  exit 1 unless passed
end
59
+
60
# tribunal:init — scaffolds the eval directory layout with two example datasets.
#
# The target root is read from the TRIBUNAL_BASE_DIR environment variable and
# defaults to the current directory. Example files go through create_file,
# which leaves a file that already exists untouched.
#
# NOTE(review): the summary printed at the end always shows "test/evals/",
# without the base_dir prefix.
+ desc 'Initialize eval directory structure'
61
+ task :init do
62
+ base_dir = ENV.fetch('TRIBUNAL_BASE_DIR', '.')
63
+
64
+ create_dir(File.join(base_dir, 'test/evals'))
65
+ create_dir(File.join(base_dir, 'test/evals/datasets'))
66
+
67
+ create_file(File.join(base_dir, 'test/evals/datasets/example.json'), example_dataset_json)
68
+ create_file(File.join(base_dir, 'test/evals/datasets/example.yaml'), example_dataset_yaml)
69
+
70
+ puts <<~MSG
71
+
72
+ ✅ Created eval structure:
73
+
74
+ test/evals/
75
+ └── datasets/
76
+ ├── example.json
77
+ └── example.yaml
78
+
79
+ Run evals with: rake tribunal:eval
80
+ MSG
81
+ end
82
+
83
+ private
84
+
85
# Collects the `--key=value` style flags understood by tribunal:eval from ARGV.
#
# Arguments that match no known flag (task names, other rake switches) are
# ignored. Numeric flags are parsed strictly: `to_f` / `to_i` would turn a
# typo such as `--threshold=abc` into 0, which silently disables the pass-rate
# gate, so malformed numbers are rejected instead.
#
# @return [Hash{Symbol => Object}] any of :format, :output, :provider (String),
#   :threshold (Float), :strict (true), :concurrency (Integer), :files (Array<String>)
# @raise [ArgumentError] if --threshold or --concurrency is not a valid number
def parse_options
  ARGV.each_with_object({}) do |arg, options|
    case arg
    when /^--format=(.+)$/
      options[:format] = Regexp.last_match(1)
    when /^--output=(.+)$/
      options[:output] = Regexp.last_match(1)
    when /^--provider=(.+)$/
      options[:provider] = Regexp.last_match(1)
    when /^--threshold=(.+)$/
      raw = Regexp.last_match(1)
      options[:threshold] = Float(raw, exception: false) ||
                            raise(ArgumentError, "Invalid --threshold value #{raw.inspect}: expected a number (e.g. 0.8)")
    when '--strict'
      options[:strict] = true
    when /^--concurrency=(.+)$/
      raw = Regexp.last_match(1)
      # Base 10 is forced so inputs like "0x10" are not read as hexadecimal.
      options[:concurrency] = Integer(raw, 10, exception: false) ||
                              raise(ArgumentError, "Invalid --concurrency value #{raw.inspect}: expected an integer (e.g. 4)")
    when /^--files=(.+)$/
      options[:files] = Regexp.last_match(1).split(',')
    end
  end
end
107
+
108
# Finds dataset files in the conventional locations, relative to the current
# working directory: test/evals first, then spec/evals, and within each
# directory .json, then .yaml, then .yml.
#
# @return [Array<String>] matching paths (empty when nothing is found)
def find_default_files
  %w[test spec].product(%w[json yaml yml]).flat_map do |root, extension|
    Dir.glob("#{root}/evals/**/*.#{extension}")
  end
end
119
+
120
# Resolves a "Module:method" provider string into a [receiver, method] pair.
#
# The method name is whatever follows the single colon at the end; the part
# before it may be a namespaced constant path. A plain `split(':')` also breaks
# the "::" separators apart, so "MyApp::RAG:query" would be rejected — hence
# the explicit pattern below.
#
# @param str [String, nil] provider spec, e.g. "MyApp::RAG:query"
# @return [Array(Module, Symbol), nil] the resolved constant and the method
#   name; nil when str is nil
# @raise [RuntimeError] if str is not of the form Module:method
# @raise [NameError] if the constant path cannot be resolved
def parse_provider(str)
  return nil unless str

  # <mod>: one or more "::"-joined segments; <meth>: the text after the final single colon.
  match = /\A(?<mod>[^:]+(?:::[^:]+)*):(?<meth>[^:]+)\z/.match(str)
  raise 'Invalid provider format. Use Module:method (e.g. MyApp::RAG:query)' unless match

  mod = match[:mod].split('::').reduce(Object) { |scope, const_name| scope.const_get(const_name) }
  [mod, match[:meth].to_sym]
end
129
+
130
# Loads one dataset file and evaluates every case in it.
#
# With concurrency > 1 the cases run in threads. The `parallel` gem is used
# when it can be loaded; it is not a declared dependency of this gem, so when
# it is missing the work falls back to a small stdlib thread pool instead of
# failing with a LoadError. Result order matches the dataset order either way.
#
# @param path [String] dataset file to load
# @param provider [Array(Object, Symbol), nil] target passed through to run_case
# @param concurrency [Integer] number of threads; values <= 1 run sequentially
# @return [Array<Hash>] one run_case result per dataset entry
def load_and_run(path, provider, concurrency)
  puts "Loading #{path}..."

  cases = RubyLLM::Tribunal::Dataset.load_with_assertions(path)

  if concurrency > 1
    if parallel_gem_available?
      Parallel.map(cases, in_threads: concurrency) do |test_case, assertions|
        run_case(test_case, assertions, provider)
      end
    else
      map_in_threads(cases, concurrency) do |test_case, assertions|
        run_case(test_case, assertions, provider)
      end
    end
  else
    cases.map { |test_case, assertions| run_case(test_case, assertions, provider) }
  end
end

# Reports whether the optional `parallel` gem can be required.
def parallel_gem_available?
  require 'parallel'
  true
rescue LoadError
  false
end

# Stdlib stand-in for Parallel.map(items, in_threads:): maps the block over
# items using at most thread_count threads and returns results in input order.
def map_in_threads(items, thread_count, &block)
  queue = Queue.new
  items.each_with_index { |item, index| queue << [index, item] }
  queue.close # pop returns nil once the closed queue is drained, ending each worker

  workers = Array.new([thread_count, queue.size].min) do
    Thread.new do
      finished = []
      while (job = queue.pop)
        index, item = job
        finished << [index, block.call(item)]
      end
      finished
    end
  end

  # Thread#value joins each worker and re-raises anything its block raised.
  workers.flat_map(&:value).sort_by(&:first).map(&:last)
end
144
+
145
# Evaluates a single test case and times it.
#
# When a provider is given, it is called with the test case and its return
# value becomes the case's output (via with_output). Assertions are evaluated
# only when the case has an actual_output.
#
# NOTE(review): a case that ends up without any output gets an empty result
# set and is therefore reported as :passed — confirm that this is intended.
#
# @param test_case [#input, #actual_output, #with_output] case to evaluate
# @param assertions [Object] assertion set handed to Assertions.evaluate_all
# @param provider [Array(Object, Symbol), nil] [receiver, method name] or nil
# @return [Hash] :input, :status (:passed / :failed), :failures
#   ([type, reason] pairs), :results, :duration_ms
def run_case(test_case, assertions, provider)
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)

  if provider
    receiver, method_name = provider
    test_case = test_case.with_output(receiver.send(method_name, test_case))
  end

  results = {}
  results = RubyLLM::Tribunal::Assertions.evaluate_all(assertions, test_case) if test_case.actual_output

  elapsed_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - started_at

  # Keep only the failing assertions, reduced to [type, reason].
  failures = results.each_with_object([]) do |(type, (status, details)), failed|
    failed << [type, details[:reason]] if status == :fail
  end

  {
    input: test_case.input,
    status: failures.empty? ? :passed : :failed,
    failures: failures,
    results: results,
    duration_ms: elapsed_ms
  }
end
174
+
175
# Builds the report structure from all per-case results.
#
# @param cases [Array<Hash>] run_case results (each has a :status)
# @param start_time [Integer] monotonic clock reading in milliseconds taken
#   when the run started
# @return [Hash] :summary (total, passed, failed, pass_rate, duration_ms),
#   :metrics (from aggregate_metrics) and the :cases themselves; pass_rate is
#   0 when there are no cases
def aggregate_results(cases, start_time)
  elapsed_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - start_time

  status_counts = cases.map { |kase| kase[:status] }.tally
  total = cases.length
  passed = status_counts.fetch(:passed, 0)
  failed = status_counts.fetch(:failed, 0)
  pass_rate = total.positive? ? passed.to_f / total : 0

  metrics = aggregate_metrics(cases)

  summary = {
    total: total,
    passed: passed,
    failed: failed,
    pass_rate: pass_rate,
    duration_ms: elapsed_ms
  }

  { summary: summary, metrics: metrics, cases: cases }
end
196
+
197
# Tallies, per assertion type, how many evaluations passed out of how many ran
# across all cases.
#
# @param cases [Array<Hash>] run_case results; each :results maps an assertion
#   type to a result whose first element is the status (:pass counts as passed)
# @return [Hash{Object => Hash}] type => { passed: Integer, total: Integer },
#   keyed in first-seen order
def aggregate_metrics(cases)
  tallies = {}

  cases.each do |kase|
    kase[:results].each do |type, result|
      entry = (tallies[type] ||= { passed: 0, total: 0 })
      entry[:total] += 1
      entry[:passed] += 1 if result.first == :pass
    end
  end

  tallies
end
208
+
209
# Creates the directory (including missing parents) and prints a confirmation.
# The message is printed whether or not the directory already existed.
#
# @param path [String] directory to create
# @return [nil]
def create_dir(path)
  FileUtils.makedirs(path)
  $stdout.puts("Created #{path}/")
end
213
+
214
# Writes content to path and prints a confirmation, unless something already
# exists at path — in that case nothing is written and nothing is printed.
#
# @param path [String] file to create
# @param content [String] text to write
# @return [nil]
def create_file(path, content)
  unless File.exist?(path)
    File.write(path, content)
    puts "Created #{path}"
  end
end
220
+
221
# Returns the sample JSON dataset that tribunal:init writes to example.json.
#
# The document is an array of two cases. Each case has an "input", a "context"
# and an "expected" object whose keys name assertions: "contains" and
# "not_contains" in the first case, "contains_any" and "not_contains" in the
# second.
#
# @return [String] JSON text
+ def example_dataset_json
222
+ <<~JSON
223
+ [
224
+ {
225
+ "input": "What is the return policy?",
226
+ "context": "Returns are accepted within 30 days of purchase with a valid receipt. Items must be in original condition.",
227
+ "expected": {
228
+ "contains": ["30 days", "receipt"],
229
+ "not_contains": ["no returns"]
230
+ }
231
+ },
232
+ {
233
+ "input": "Do you ship internationally?",
234
+ "context": "We currently ship to the United States and Canada only.",
235
+ "expected": {
236
+ "contains_any": ["United States", "US", "Canada"],
237
+ "not_contains": ["worldwide", "international"]
238
+ }
239
+ }
240
+ ]
241
+ JSON
242
+ end
243
+
244
# Returns the sample YAML dataset that tribunal:init writes to example.yaml.
#
# The document is a list of two cases with "input", "context" and "expected"
# keys; the second case combines "contains_any" with a "regex" assertion.
#
# NOTE(review): as written here the regex has four backslashes. The heredoc
# processes escapes, so the generated file should receive "\\d+[ap]m", which a
# YAML double-quoted string reads as \d+[ap]m — confirm against the generated
# file.
#
# @return [String] YAML text
+ def example_dataset_yaml
245
+ <<~YAML
246
+ - input: What is the return policy?
247
+ context: Returns are accepted within 30 days of purchase with a valid receipt.
248
+ expected:
249
+ contains:
250
+ - 30 days
251
+ - receipt
252
+
253
+ - input: What are the store hours?
254
+ context: We are open Monday through Friday, 9am to 5pm.
255
+ expected:
256
+ contains_any:
257
+ - "9am"
258
+ - "9:00"
259
+ regex: "\\\\d+[ap]m"
260
+ YAML
261
+ end
262
+ end
263
+
264
# tribunal:eval lists :environment as a prerequisite. Projects that do not
# define that task themselves (non-Rails apps) get an empty stand-in here so
# the prerequisite still resolves.
task(:environment) {} unless Rake::Task.task_defined?(:environment)
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby_llm-tribunal
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Florian
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2026-01-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ruby_llm
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ description: |-
28
+ Tribunal provides tools for evaluating LLM outputs, detecting hallucinations,
29
+ measuring response quality, and ensuring safety. Features deterministic assertions,
30
+ LLM-as-judge evaluations, red team attack generation, and multiple output formats.
31
+ A RubyLLM plugin inspired by the Elixir Tribunal library.
32
+ email:
33
+ - florian@alqemist.com
34
+ executables: []
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - CHANGELOG.md
39
+ - LICENSE.txt
40
+ - README.md
41
+ - lib/ruby_llm-tribunal.rb
42
+ - lib/ruby_llm/tribunal.rb
43
+ - lib/ruby_llm/tribunal/assertions.rb
44
+ - lib/ruby_llm/tribunal/assertions/deterministic.rb
45
+ - lib/ruby_llm/tribunal/assertions/embedding.rb
46
+ - lib/ruby_llm/tribunal/assertions/judge.rb
47
+ - lib/ruby_llm/tribunal/configuration.rb
48
+ - lib/ruby_llm/tribunal/dataset.rb
49
+ - lib/ruby_llm/tribunal/eval_helpers.rb
50
+ - lib/ruby_llm/tribunal/judge.rb
51
+ - lib/ruby_llm/tribunal/judges/bias.rb
52
+ - lib/ruby_llm/tribunal/judges/correctness.rb
53
+ - lib/ruby_llm/tribunal/judges/faithful.rb
54
+ - lib/ruby_llm/tribunal/judges/hallucination.rb
55
+ - lib/ruby_llm/tribunal/judges/harmful.rb
56
+ - lib/ruby_llm/tribunal/judges/jailbreak.rb
57
+ - lib/ruby_llm/tribunal/judges/pii.rb
58
+ - lib/ruby_llm/tribunal/judges/refusal.rb
59
+ - lib/ruby_llm/tribunal/judges/relevant.rb
60
+ - lib/ruby_llm/tribunal/judges/toxicity.rb
61
+ - lib/ruby_llm/tribunal/red_team.rb
62
+ - lib/ruby_llm/tribunal/reporter.rb
63
+ - lib/ruby_llm/tribunal/reporters/console.rb
64
+ - lib/ruby_llm/tribunal/reporters/github.rb
65
+ - lib/ruby_llm/tribunal/reporters/html.rb
66
+ - lib/ruby_llm/tribunal/reporters/json.rb
67
+ - lib/ruby_llm/tribunal/reporters/junit.rb
68
+ - lib/ruby_llm/tribunal/reporters/text.rb
69
+ - lib/ruby_llm/tribunal/test_case.rb
70
+ - lib/ruby_llm/tribunal/version.rb
71
+ - lib/tasks/tribunal.rake
72
+ homepage: https://github.com/Alqemist-labs/ruby_llm-tribunal
73
+ licenses:
74
+ - MIT
75
+ metadata:
76
+ homepage_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal
77
+ source_code_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal
78
+ changelog_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal/blob/main/CHANGELOG.md
79
+ rubygems_mfa_required: 'true'
80
+ post_install_message:
81
+ rdoc_options: []
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: 3.2.0
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ requirements: []
95
+ rubygems_version: 3.0.3.1
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: LLM evaluation framework for Ruby
99
+ test_files: []