ruby_llm-tribunal 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +32 -0
- data/LICENSE.txt +21 -0
- data/README.md +442 -0
- data/lib/ruby_llm/tribunal/assertions/deterministic.rb +259 -0
- data/lib/ruby_llm/tribunal/assertions/embedding.rb +90 -0
- data/lib/ruby_llm/tribunal/assertions/judge.rb +152 -0
- data/lib/ruby_llm/tribunal/assertions.rb +141 -0
- data/lib/ruby_llm/tribunal/configuration.rb +38 -0
- data/lib/ruby_llm/tribunal/dataset.rb +118 -0
- data/lib/ruby_llm/tribunal/eval_helpers.rb +288 -0
- data/lib/ruby_llm/tribunal/judge.rb +166 -0
- data/lib/ruby_llm/tribunal/judges/bias.rb +79 -0
- data/lib/ruby_llm/tribunal/judges/correctness.rb +68 -0
- data/lib/ruby_llm/tribunal/judges/faithful.rb +77 -0
- data/lib/ruby_llm/tribunal/judges/hallucination.rb +85 -0
- data/lib/ruby_llm/tribunal/judges/harmful.rb +90 -0
- data/lib/ruby_llm/tribunal/judges/jailbreak.rb +77 -0
- data/lib/ruby_llm/tribunal/judges/pii.rb +118 -0
- data/lib/ruby_llm/tribunal/judges/refusal.rb +79 -0
- data/lib/ruby_llm/tribunal/judges/relevant.rb +65 -0
- data/lib/ruby_llm/tribunal/judges/toxicity.rb +63 -0
- data/lib/ruby_llm/tribunal/red_team.rb +306 -0
- data/lib/ruby_llm/tribunal/reporter.rb +48 -0
- data/lib/ruby_llm/tribunal/reporters/console.rb +120 -0
- data/lib/ruby_llm/tribunal/reporters/github.rb +26 -0
- data/lib/ruby_llm/tribunal/reporters/html.rb +185 -0
- data/lib/ruby_llm/tribunal/reporters/json.rb +31 -0
- data/lib/ruby_llm/tribunal/reporters/junit.rb +58 -0
- data/lib/ruby_llm/tribunal/reporters/text.rb +120 -0
- data/lib/ruby_llm/tribunal/test_case.rb +124 -0
- data/lib/ruby_llm/tribunal/version.rb +7 -0
- data/lib/ruby_llm/tribunal.rb +130 -0
- data/lib/ruby_llm-tribunal.rb +3 -0
- data/lib/tasks/tribunal.rake +269 -0
- metadata +99 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
module Reporters
|
|
6
|
+
# HTML report for shareable results.
|
|
7
|
+
class HTML
|
|
8
|
+
class << self
|
|
9
|
+
# Renders the complete, self-contained HTML report page.
#
# The three dynamic sections are built first and then dropped into a
# static page skeleton that carries its own inline stylesheet.
#
# @param results [Hash] evaluation results; the :summary, :metrics and
#   :cases entries are read and passed to their section builders
# @return [String] the HTML document
def format(results)
  summary_html = summary_section(results[:summary])
  metrics_html = metrics_section(results[:metrics])
  failures_html = failures_section(results[:cases])

  <<~HTML
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Tribunal Evaluation Report</title>
    <style>
    * { box-sizing: border-box; margin: 0; padding: 0; }
    body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; line-height: 1.6; padding: 2rem; background: #f5f5f5; }
    .container { max-width: 900px; margin: 0 auto; }
    h1 { color: #333; margin-bottom: 1.5rem; }
    .summary { background: white; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
    .summary-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; }
    .stat { text-align: center; }
    .stat-value { font-size: 2rem; font-weight: bold; color: #333; }
    .stat-label { color: #666; font-size: 0.9rem; }
    .status { padding: 0.5rem 1rem; border-radius: 4px; display: inline-block; font-weight: bold; margin-top: 1rem; }
    .status.passed { background: #d4edda; color: #155724; }
    .status.failed { background: #f8d7da; color: #721c24; }
    .metrics { background: white; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
    .metrics h2 { margin-bottom: 1rem; color: #333; font-size: 1.2rem; }
    .metric-row { display: flex; align-items: center; margin-bottom: 0.75rem; }
    .metric-name { width: 120px; font-weight: 500; }
    .metric-bar { flex: 1; height: 20px; background: #e9ecef; border-radius: 4px; overflow: hidden; margin: 0 1rem; }
    .metric-fill { height: 100%; background: #28a745; transition: width 0.3s; }
    .metric-fill.warning { background: #ffc107; }
    .metric-fill.danger { background: #dc3545; }
    .metric-value { width: 100px; text-align: right; font-size: 0.9rem; color: #666; }
    .failures { background: white; border-radius: 8px; padding: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
    .failures h2 { margin-bottom: 1rem; color: #333; font-size: 1.2rem; }
    .failure { background: #fff5f5; border-left: 4px solid #dc3545; padding: 1rem; margin-bottom: 1rem; border-radius: 0 4px 4px 0; }
    .failure-input { font-weight: 500; color: #333; margin-bottom: 0.5rem; }
    .failure-reason { color: #666; font-size: 0.9rem; }
    .failure-reason code { background: #f1f1f1; padding: 0.2rem 0.4rem; border-radius: 3px; }
    .footer { text-align: center; margin-top: 2rem; color: #666; font-size: 0.85rem; }
    </style>
    </head>
    <body>
    <div class="container">
    <h1>Tribunal Evaluation Report</h1>
    #{summary_html}
    #{metrics_html}
    #{failures_html}
    <div class="footer">Generated by Tribunal</div>
    </div>
    </body>
    </html>
  HTML
end
|
|
60
|
+
|
|
61
|
+
private
|
|
62
|
+
|
|
63
|
+
# Builds the summary card: five headline counters followed by the
# overall PASSED/FAILED badge.
#
# @param summary [Hash] reads :total, :passed, :failed, :pass_rate and
#   :duration_ms, plus the optional :threshold_passed, :strict and
#   :threshold entries
# @return [String] HTML fragment
def summary_section(summary)
  # The run counts as failed when :threshold_passed is explicitly false
  # or when at least one case failed.
  # NOTE(review): a configured threshold therefore never rescues a run
  # that has any failed case - confirm this is intended.
  failed_run = summary[:threshold_passed] == false || !summary[:failed].zero?
  status_class, status_text = failed_run ? %w[failed FAILED] : %w[passed PASSED]

  # Strict mode takes precedence over a numeric threshold in the badge text.
  mode_note =
    if summary[:strict]
      ' (strict mode)'
    elsif summary[:threshold]
      " (threshold: #{(summary[:threshold] * 100).round}%)"
    else
      ''
    end

  stats = [
    [summary[:total], 'Total Tests'],
    [summary[:passed], 'Passed'],
    [summary[:failed], 'Failed'],
    ["#{(summary[:pass_rate] * 100).round}%", 'Pass Rate'],
    [format_duration(summary[:duration_ms]), 'Duration']
  ]

  stat_cells = stats.map do |value, label|
    %(<div class="stat">\n<div class="stat-value">#{value}</div>\n<div class="stat-label">#{label}</div>\n</div>)
  end

  <<~HTML
    <div class="summary">
    <div class="summary-grid">
    #{stat_cells.join("\n")}
    </div>
    <div class="status #{status_class}">#{status_text}#{mode_note}</div>
    </div>
  HTML
end
|
|
104
|
+
|
|
105
|
+
# Builds the "Results by Metric" card, one bar row per metric.
#
# @param metrics [Hash, nil] metric name => per-metric data, passed
#   through to format_metric_row
# @return [String] HTML fragment, or '' when there are no metrics
def metrics_section(metrics)
  return '' if metrics.nil? || metrics.empty?

  row_html = metrics.map { |name, data| format_metric_row(name, data) }

  [
    '<div class="metrics">',
    '<h2>Results by Metric</h2>',
    row_html.join("\n"),
    '</div>'
  ].join("\n") + "\n"
end
|
|
117
|
+
|
|
118
|
+
# Renders one metric as a labelled progress bar.
#
# The bar is green at 90% and above, yellow ('warning') from 70% to 89%
# and red ('danger') below 70%. A metric with a total of zero is shown
# as 0%.
#
# @param name [#to_s] metric name (HTML-escaped before output)
# @param data [Hash] reads :passed and :total
# @return [String] HTML fragment
def format_metric_row(name, data)
  passed_count = data[:passed]
  total_count = data[:total]
  percent = total_count.positive? ? (passed_count.to_f / total_count * 100).round : 0

  fill_class =
    if percent < 70
      'danger'
    elsif percent < 90
      'warning'
    else
      ''
    end

  [
    '<div class="metric-row">',
    %(<div class="metric-name">#{escape_html(name.to_s)}</div>),
    '<div class="metric-bar">',
    %(<div class="metric-fill #{fill_class}" style="width: #{percent}%"></div>),
    '</div>',
    %(<div class="metric-value">#{passed_count}/#{total_count} (#{percent}%)</div>),
    '</div>'
  ].join("\n") + "\n"
end
|
|
140
|
+
|
|
141
|
+
# Builds the "Failed Cases" card listing every case whose :status is :failed.
#
# A nil case list is treated like an empty one, matching how
# metrics_section treats nil metrics, so a partial results hash does not
# abort the whole report.
#
# @param cases [Array<Hash>, nil] evaluated cases; each reads :status
# @return [String] HTML fragment, or '' when nothing failed
def failures_section(cases)
  failed_cases = (cases || []).select { |c| c[:status] == :failed }
  return '' if failed_cases.empty?

  rows = failed_cases.map { |c| format_failure_row(c) }.join("\n")

  <<~HTML
    <div class="failures">
    <h2>Failed Cases</h2>
    #{rows}
    </div>
  HTML
end
|
|
154
|
+
|
|
155
|
+
# Renders a single failed case: its input followed by one line per
# failed assertion. All case-derived text is passed through escape_html.
#
# @param test_case [Hash] reads :input and :failures (an enumerable of
#   assertion type / reason pairs)
# @return [String] HTML fragment
def format_failure_row(test_case)
  reason_html = test_case[:failures]
                .map { |type, reason| "<code>#{escape_html(type.to_s)}</code>: #{escape_html(reason.to_s)}" }
                .join('<br>')
  input_html = escape_html(test_case[:input].to_s)

  [
    '<div class="failure">',
    %(<div class="failure-input">#{input_html}</div>),
    %(<div class="failure-reason">#{reason_html}</div>),
    '</div>'
  ].join("\n") + "\n"
end
|
|
167
|
+
|
|
168
|
+
# Escapes text for safe inclusion in HTML element content or attribute
# values.
#
# Each special character is replaced by its entity reference in a single
# pass, so an ampersand produced by one replacement is never escaped a
# second time. Single quotes are escaped as well so the result is also
# safe inside single-quoted attributes.
#
# @param str [#to_s] raw text (for example a prompt or a judge's reason)
# @return [String] the text with & < > " ' replaced by entities
def escape_html(str)
  str.to_s.gsub(
    /[&<>"']/,
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
    "'" => '&#39;'
  )
end
|
|
175
|
+
|
|
176
|
+
# Formats a millisecond duration for display.
#
# Values under one second are shown verbatim with an "ms" suffix;
# anything longer is converted to seconds rounded to one decimal place.
#
# @param duration_ms [Numeric] elapsed time in milliseconds
# @return [String] e.g. "250ms" or "1.3s"
def format_duration(duration_ms)
  if duration_ms < 1000
    "#{duration_ms}ms"
  else
    seconds = (duration_ms / 1000.0).round(1)
    "#{seconds}s"
  end
end
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
module Reporters
|
|
6
|
+
# JSON output for CI/machine consumption.
|
|
7
|
+
class JSON
|
|
8
|
+
class << self
|
|
9
|
+
# Serializes the results as a JSON document.
#
# @param results [Hash] evaluation results; symbols are converted to
#   strings by convert_for_json before encoding
# @return [String] JSON text
def format(results)
  serializable = convert_for_json(results)
  serializable.to_json
end
|
|
12
|
+
|
|
13
|
+
private
|
|
14
|
+
|
|
15
|
+
# Recursively rewrites a results structure so it contains no symbols.
#
# Hash keys are stringified and hash values are converted recursively,
# arrays are converted element by element, symbols become strings, and
# every other value is returned unchanged.
#
# @param data [Object] any value from the results structure
# @return [Object] the converted value
def convert_for_json(data)
  if data.is_a?(Hash)
    data.to_h { |key, value| [key.to_s, convert_for_json(value)] }
  elsif data.is_a?(Array)
    data.map { |element| convert_for_json(element) }
  elsif data.is_a?(Symbol)
    data.to_s
  else
    data
  end
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
module Reporters
|
|
6
|
+
# JUnit XML format for CI tools.
|
|
7
|
+
class JUnit
|
|
8
|
+
class << self
|
|
9
|
+
# Renders the results as a JUnit XML document with a single "eval"
# test suite.
#
# @param results [Hash] reads :cases (one <testcase> each) and :summary
#   (:total, :failed, :duration_ms)
# @return [String] XML text
def format(results)
  testcase_xml = results[:cases].map { |tc| format_testcase(tc) }.join("\n")

  summary = results[:summary]
  total = summary[:total]
  failed = summary[:failed]
  # JUnit reports time in seconds; the summary stores milliseconds.
  seconds = summary[:duration_ms] / 1000.0

  [
    '<?xml version="1.0" encoding="UTF-8"?>',
    %(<testsuites name="Tribunal" tests="#{total}" failures="#{failed}" time="#{seconds}">),
    %(<testsuite name="eval" tests="#{total}" failures="#{failed}">),
    testcase_xml,
    '</testsuite>',
    '</testsuites>'
  ].join("\n") + "\n"
end
|
|
21
|
+
|
|
22
|
+
private
|
|
23
|
+
|
|
24
|
+
# Renders one evaluated case as a JUnit <testcase> element.
#
# A :passed case becomes a self-closing element. Every other status is
# reported as a failure whose body lists "type: reason" lines. A case
# with no :failures entry is reported with an empty failure body
# instead of raising, so one odd case cannot abort the whole report.
#
# @param test_case [Hash] reads :input, :status, :duration_ms and
#   :failures (an enumerable of type / reason pairs, or nil)
# @return [String] XML fragment
def format_testcase(test_case)
  name = escape_xml(test_case[:input].to_s)
  # Per-case duration is optional; a missing value is reported as 0 seconds.
  time = (test_case[:duration_ms] || 0) / 1000.0

  return %( <testcase name="#{name}" time="#{time}"/>) if test_case[:status] == :passed

  # Array() turns nil into [] and a Hash into its [type, reason] pairs.
  failure_msg = Array(test_case[:failures])
                .map { |type, reason| "#{type}: #{reason}" }
                .join("\n")

  build_failure_xml(name, time, escape_xml(failure_msg))
end
|
|
37
|
+
|
|
38
|
+
# Assembles the <testcase> element for a failed case.
#
# All three arguments are interpolated as-is, so the caller must pass
# already-escaped text. The result has no trailing newline.
#
# @param name [String] escaped test name
# @param time [Numeric] duration in seconds
# @param failure_msg [String] escaped failure details
# @return [String] XML fragment
def build_failure_xml(name, time, failure_msg)
  opening = %(<testcase name="#{name}" time="#{time}">)
  failure = %(<failure message="Assertion failed">#{failure_msg}</failure>)

  [opening, failure, '</testcase>'].join("\n")
end
|
|
45
|
+
|
|
46
|
+
# Escapes text for safe inclusion in XML element content or attribute
# values.
#
# The five characters with predefined XML entities are replaced in a
# single pass, so an ampersand produced by one replacement is never
# escaped a second time.
#
# @param str [#to_s] raw text
# @return [String] the text with & < > " ' replaced by entity references
def escape_xml(str)
  str.to_s.gsub(
    /[&<>"']/,
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
    "'" => '&apos;'
  )
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
module Reporters
|
|
6
|
+
# Plain ASCII text output (no unicode).
|
|
7
|
+
class Text
|
|
8
|
+
class << self
|
|
9
|
+
# Renders the plain-text report by stacking its sections in order:
# header, summary, per-metric results, failed cases, footer.
#
# @param results [Hash] reads :summary, :metrics and :cases
# @return [String] the report, sections joined by newlines
def format(results)
  summary = results[:summary]

  sections = [
    header,
    summary_section(summary),
    metrics_section(results[:metrics]),
    failures_section(results[:cases]),
    footer(summary)
  ]

  sections.join("\n")
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
# Report banner: a blank line, the title, and a rule of "=" characters.
#
# @return [String] banner text ending in a newline
def header
  title = 'Tribunal LLM Evaluation'
  rule = '==================================================================='

  "\n#{title}\n#{rule}\n"
end
|
|
28
|
+
|
|
29
|
+
# Builds the "Summary" block with the headline counters.
#
# @param summary [Hash] reads :total, :passed, :pass_rate, :failed and
#   :duration_ms
# @return [String] text block ending in a newline
def summary_section(summary)
  pass_percent = (summary[:pass_rate] * 100).round

  lines = [
    'Summary',
    '-------------------------------------------------------------------',
    "Total: #{summary[:total]} test cases",
    "Passed: #{summary[:passed]} (#{pass_percent}%)",
    "Failed: #{summary[:failed]}",
    "Duration: #{format_duration(summary[:duration_ms])}"
  ]

  "#{lines.join("\n")}\n"
end
|
|
39
|
+
|
|
40
|
+
# Builds the "Results by Metric" block: one line per metric with its
# pass count, pass percentage and a 20-character progress bar.
#
# @param metrics [Hash, nil] metric name => hash with :passed and :total
# @return [String] text block, or '' when there are no metrics
def metrics_section(metrics)
  return '' if metrics.nil? || metrics.empty?

  metric_lines = metrics.map do |name, data|
    passed = data[:passed]
    total = data[:total]
    # A metric that never ran is reported as 0% rather than dividing by zero.
    rate = total.positive? ? passed.to_f / total : 0
    bar = progress_bar(rate, 20)

    " #{pad(name, 14)} #{passed}/#{total} passed #{(rate * 100).round}% #{bar}"
  end

  [
    'Results by Metric',
    '-------------------------------------------------------------------',
    metric_lines.join("\n")
  ].join("\n") + "\n"
end
|
|
55
|
+
|
|
56
|
+
# Builds the "Failed Cases" block, numbering each failed case from 1.
#
# A nil case list is treated like an empty one, matching how
# metrics_section treats nil metrics, so a partial results hash does not
# abort the whole report.
#
# @param cases [Array<Hash>, nil] evaluated cases; each reads :status
# @return [String] text block, or '' when nothing failed
def failures_section(cases)
  failed_cases = (cases || []).select { |c| c[:status] == :failed }
  return '' if failed_cases.empty?

  rows = failed_cases
         .each
         .with_index(1)
         .map { |failed_case, number| format_failure_row(failed_case, number) }
         .join("\n")

  <<~FAILURES
    Failed Cases
    -------------------------------------------------------------------
    #{rows}
  FAILURES
end
|
|
70
|
+
|
|
71
|
+
# Renders one failed case: a numbered line quoting the first 50
# characters of the input, followed by one " |- type: reason" line per
# failed assertion.
#
# @param test_case [Hash] reads :input and :failures (an enumerable of
#   type / reason pairs)
# @param idx [Integer] 1-based position shown before the input
# @return [String] text block ending in a newline
def format_failure_row(test_case, idx)
  snippet = test_case[:input].to_s.slice(0, 50)
  reason_lines = test_case[:failures].map { |type, reason| " |- #{type}: #{reason}" }

  %(#{idx}. "#{snippet}"\n#{reason_lines.join("\n")}\n)
end
|
|
82
|
+
|
|
83
|
+
# Builds the closing block: a rule followed by the overall verdict.
#
# @param summary [Hash] reads :failed plus the optional
#   :threshold_passed, :strict and :threshold entries
# @return [String] text block ending in a newline
def footer(summary)
  # The run counts as failed when :threshold_passed is explicitly false
  # or when at least one case failed.
  failed_run = summary[:threshold_passed] == false || !summary[:failed].zero?
  verdict = failed_run ? 'FAILED' : 'PASSED'

  # Strict mode takes precedence over a numeric threshold in the suffix.
  mode_note =
    if summary[:strict]
      ' (strict mode)'
    elsif summary[:threshold]
      " (threshold: #{(summary[:threshold] * 100).round}%)"
    else
      ''
    end

  rule = '-------------------------------------------------------------------'
  "#{rule}\n#{verdict}#{mode_note}\n"
end
|
|
100
|
+
|
|
101
|
+
# Draws an ASCII progress bar of exactly +width+ characters, using "#"
# for the filled part and "-" for the remainder.
#
# The filled length is clamped to 0..width so a rate outside 0..1
# cannot produce a negative segment length, which String#* rejects with
# an ArgumentError.
#
# @param rate [Numeric] fraction complete, nominally between 0 and 1
# @param width [Integer] total bar length in characters
# @return [String] the bar
def progress_bar(rate, width)
  filled = (rate * width).round.clamp(0, width)
  empty = width - filled

  "#{'#' * filled}#{'-' * empty}"
end
|
|
106
|
+
|
|
107
|
+
# Left-justifies a label in a column of the given width. Labels longer
# than the column are returned untruncated.
#
# @param term [#to_s] label to pad
# @param width [Integer] column width in characters
# @return [String] the label followed by padding spaces
def pad(term, width)
  label = term.to_s
  label.ljust(width)
end
|
|
110
|
+
|
|
111
|
+
# Formats a millisecond duration for display.
#
# Values under one second are shown verbatim with an "ms" suffix;
# anything longer is converted to seconds rounded to one decimal place.
#
# @param duration_ms [Numeric] elapsed time in milliseconds
# @return [String] e.g. "250ms" or "1.3s"
def format_duration(duration_ms)
  if duration_ms < 1000
    "#{duration_ms}ms"
  else
    seconds = (duration_ms / 1000.0).round(1)
    "#{seconds}s"
  end
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Tribunal
|
|
5
|
+
# Represents a single evaluation test case.
|
|
6
|
+
#
|
|
7
|
+
# @attr_reader input [String] The user query/prompt (required)
|
|
8
|
+
# @attr_reader actual_output [String, nil] The LLM response to evaluate
|
|
9
|
+
# @attr_reader expected_output [String, nil] Golden/ideal answer for comparison
|
|
10
|
+
# @attr_reader context [Array<String>, nil] Ground truth context for faithfulness checks
|
|
11
|
+
# @attr_reader retrieval_context [Array<String>, nil] Actual retrieved docs from RAG
|
|
12
|
+
# @attr_reader metadata [Hash, nil] Additional info like latency, tokens, cost
|
|
13
|
+
#
|
|
14
|
+
# @example
|
|
15
|
+
# test_case = TestCase.new(
|
|
16
|
+
# input: "What's the return policy?",
|
|
17
|
+
# actual_output: "You can return items within 30 days.",
|
|
18
|
+
# context: ["Returns accepted within 30 days with receipt."],
|
|
19
|
+
# expected_output: "Items can be returned within 30 days with a receipt."
|
|
20
|
+
# )
|
|
21
|
+
class TestCase
  attr_reader :input, :actual_output, :expected_output, :context, :retrieval_context, :metadata

  # Creates a new test case from a hash.
  #
  # String keys are accepted and converted to symbols. A bare String
  # given for :context or :retrieval_context is wrapped in an array.
  #
  # @param attrs [Hash, nil] Test case attributes (nil is treated as empty)
  # @option attrs [String] :input The user query/prompt
  # @option attrs [String] :actual_output The LLM response to evaluate
  # @option attrs [String] :expected_output Golden/ideal answer
  # @option attrs [Array<String>, String] :context Ground truth context
  # @option attrs [Array<String>, String] :retrieval_context Retrieved docs from RAG
  # @option attrs [Hash] :metadata Additional info
  def initialize(attrs = {})
    attrs = normalize_keys(attrs || {})

    @input = attrs[:input]
    @actual_output = attrs[:actual_output]
    @expected_output = attrs[:expected_output]
    @context = normalize_context(attrs[:context])
    @retrieval_context = normalize_context(attrs[:retrieval_context])
    @metadata = attrs[:metadata]
  end

  # Returns a copy of this test case with the actual output replaced.
  # Useful when the dataset provides input/context but output comes from your LLM.
  #
  # @param output [String] The LLM response
  # @return [TestCase] A new test case with the output set
  def with_output(output)
    copy_with(actual_output: output)
  end

  # Returns a copy of this test case with the retrieval context replaced.
  #
  # @param context [Array<String>, String] Retrieved documents
  # @return [TestCase] A new test case with retrieval context set
  def with_retrieval_context(context)
    # The constructor wraps a bare String in an array.
    copy_with(retrieval_context: context)
  end

  # Returns a copy of this test case with extra metadata merged in
  # (latency, tokens, cost, etc). New keys win over existing ones.
  #
  # @param new_metadata [Hash, nil] Metadata to merge (nil merges nothing)
  # @return [TestCase] A new test case with merged metadata
  def with_metadata(new_metadata)
    copy_with(metadata: (@metadata || {}).merge(new_metadata || {}))
  end

  # Converts the test case to a hash, omitting attributes that are nil.
  #
  # @return [Hash] Test case as hash
  def to_h
    attributes.compact
  end

  private

  # Every attribute keyed by name, including those that are nil.
  def attributes
    {
      input: @input,
      actual_output: @actual_output,
      expected_output: @expected_output,
      context: @context,
      retrieval_context: @retrieval_context,
      metadata: @metadata
    }
  end

  # Builds a new instance of the receiver's own class with the given
  # attributes replaced, so subclass instances stay subclass instances.
  def copy_with(overrides)
    self.class.new(attributes.merge(overrides))
  end

  # Converts String keys to symbols and leaves every other key untouched.
  def normalize_keys(hash)
    hash.transform_keys { |key| key.is_a?(String) ? key.to_sym : key }
  end

  # Wraps a bare String in an array; nil and everything else pass through.
  def normalize_context(ctx)
    ctx.is_a?(String) ? [ctx] : ctx
  end
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ruby_llm'
|
|
4
|
+
require 'json'
|
|
5
|
+
require 'yaml'
|
|
6
|
+
|
|
7
|
+
require_relative 'tribunal/version'
|
|
8
|
+
require_relative 'tribunal/configuration'
|
|
9
|
+
require_relative 'tribunal/test_case'
|
|
10
|
+
require_relative 'tribunal/assertions'
|
|
11
|
+
require_relative 'tribunal/assertions/deterministic'
|
|
12
|
+
require_relative 'tribunal/assertions/judge'
|
|
13
|
+
require_relative 'tribunal/assertions/embedding'
|
|
14
|
+
require_relative 'tribunal/judge'
|
|
15
|
+
require_relative 'tribunal/judges/faithful'
|
|
16
|
+
require_relative 'tribunal/judges/relevant'
|
|
17
|
+
require_relative 'tribunal/judges/hallucination'
|
|
18
|
+
require_relative 'tribunal/judges/correctness'
|
|
19
|
+
require_relative 'tribunal/judges/bias'
|
|
20
|
+
require_relative 'tribunal/judges/toxicity'
|
|
21
|
+
require_relative 'tribunal/judges/harmful'
|
|
22
|
+
require_relative 'tribunal/judges/jailbreak'
|
|
23
|
+
require_relative 'tribunal/judges/pii'
|
|
24
|
+
require_relative 'tribunal/judges/refusal'
|
|
25
|
+
require_relative 'tribunal/dataset'
|
|
26
|
+
require_relative 'tribunal/red_team'
|
|
27
|
+
require_relative 'tribunal/reporter'
|
|
28
|
+
require_relative 'tribunal/reporters/console'
|
|
29
|
+
require_relative 'tribunal/reporters/text'
|
|
30
|
+
require_relative 'tribunal/reporters/json'
|
|
31
|
+
require_relative 'tribunal/reporters/html'
|
|
32
|
+
require_relative 'tribunal/reporters/github'
|
|
33
|
+
require_relative 'tribunal/reporters/junit'
|
|
34
|
+
require_relative 'tribunal/eval_helpers'
|
|
35
|
+
|
|
36
|
+
module RubyLLM
|
|
37
|
+
# LLM evaluation framework for Ruby.
|
|
38
|
+
#
|
|
39
|
+
# Tribunal provides tools for evaluating LLM outputs,
|
|
40
|
+
# detecting hallucinations, and measuring response quality.
|
|
41
|
+
#
|
|
42
|
+
# @example Quick Start
|
|
43
|
+
# test_case = RubyLLM::Tribunal.test_case(
|
|
44
|
+
# input: "What's the return policy?",
|
|
45
|
+
# actual_output: "Returns within 30 days.",
|
|
46
|
+
# context: ["Return policy: 30 days with receipt."]
|
|
47
|
+
# )
|
|
48
|
+
#
|
|
49
|
+
# assertions = [
|
|
50
|
+
# [:contains, { value: "30 days" }],
|
|
51
|
+
# [:faithful, { threshold: 0.8 }]
|
|
52
|
+
# ]
|
|
53
|
+
#
|
|
54
|
+
# results = RubyLLM::Tribunal.evaluate(test_case, assertions)
|
|
55
|
+
#
|
|
56
|
+
module Tribunal
  # Error class namespaced under Tribunal.
  class Error < StandardError; end

  # Returns the configuration object, creating it on first use.
  #
  # @return [Configuration] the memoized configuration instance
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yields the configuration object to the given block.
  #
  # @yieldparam config [Configuration] the memoized configuration
  # @return [Object] whatever the block returns
  def self.configure
    yield(configuration)
  end

  # Evaluates a test case against assertions by delegating to
  # Assertions.evaluate_all.
  #
  # @param test_case [TestCase] The test case to evaluate
  # @param assertions [Array, Hash] Assertions to run
  # @return [Hash] Results map of assertion_type => result
  #
  # @example
  #   test_case = TestCase.new(
  #     input: "What's the return policy?",
  #     actual_output: "Returns within 30 days.",
  #     context: ["Return policy: 30 days with receipt."]
  #   )
  #
  #   assertions = [
  #     [:contains, { value: "30 days" }],
  #     [:faithful, { threshold: 0.8 }]
  #   ]
  #
  #   Tribunal.evaluate(test_case, assertions)
  #   #=> { contains: [:pass, {...}], faithful: [:pass, {...}] }
  def self.evaluate(test_case, assertions)
    Assertions.evaluate_all(assertions, test_case)
  end

  # Lists the assertion types reported by Assertions.available.
  #
  # @return [Array<Symbol>] List of available assertion types
  def self.available_assertions
    Assertions.available
  end

  # Creates a new test case.
  #
  # @param attrs [Hash] Test case attributes
  # @return [TestCase] New test case instance
  #
  # @example
  #   Tribunal.test_case(
  #     input: "What's the price?",
  #     actual_output: "The price is $29.99.",
  #     context: ["Product costs $29.99"]
  #   )
  def self.test_case(attrs)
    TestCase.new(attrs)
  end

  # Registers a custom judge by delegating to Judge.register.
  #
  # @param judge_class [Class] A class implementing the Judge interface
  def self.register_judge(judge_class)
    Judge.register(judge_class)
  end

  # Lists the judge names reported by Judge.all_judge_names.
  #
  # @return [Array<Symbol>] List of judge names
  def self.judge_names
    Judge.all_judge_names
  end
end
|
|
130
|
+
end
|