raif 1.2.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/jobs/raif/conversation_entry_job.rb +1 -1
- data/app/models/raif/agents/re_act_step.rb +1 -2
- data/app/models/raif/concerns/has_llm.rb +1 -1
- data/app/models/raif/concerns/task_run_args.rb +62 -0
- data/app/models/raif/conversation.rb +5 -1
- data/app/models/raif/conversation_entry.rb +6 -8
- data/app/models/raif/llm.rb +1 -1
- data/app/models/raif/llms/open_router.rb +3 -1
- data/app/models/raif/task.rb +22 -9
- data/app/views/raif/conversation_entries/_form.html.erb +1 -1
- data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
- data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
- data/config/locales/en.yml +8 -0
- data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
- data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
- data/exe/raif +7 -0
- data/lib/generators/raif/agent/agent_generator.rb +22 -7
- data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
- data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
- data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
- data/lib/generators/raif/base_generator.rb +19 -0
- data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
- data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
- data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
- data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
- data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
- data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
- data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
- data/lib/generators/raif/install/install_generator.rb +15 -0
- data/lib/generators/raif/install/templates/initializer.rb +11 -0
- data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -5
- data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -78
- data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +1 -1
- data/lib/generators/raif/task/task_generator.rb +22 -3
- data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
- data/lib/generators/raif/task/templates/task.rb.tt +55 -59
- data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
- data/lib/raif/cli/base.rb +39 -0
- data/lib/raif/cli/evals.rb +47 -0
- data/lib/raif/cli/evals_setup.rb +27 -0
- data/lib/raif/cli.rb +67 -0
- data/lib/raif/configuration.rb +20 -6
- data/lib/raif/evals/eval.rb +30 -0
- data/lib/raif/evals/eval_set.rb +111 -0
- data/lib/raif/evals/eval_sets/expectations.rb +53 -0
- data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
- data/lib/raif/evals/expectation_result.rb +39 -0
- data/lib/raif/evals/llm_judge.rb +32 -0
- data/lib/raif/evals/llm_judges/binary.rb +94 -0
- data/lib/raif/evals/llm_judges/comparative.rb +89 -0
- data/lib/raif/evals/llm_judges/scored.rb +63 -0
- data/lib/raif/evals/llm_judges/summarization.rb +166 -0
- data/lib/raif/evals/run.rb +201 -0
- data/lib/raif/evals/scoring_rubric.rb +174 -0
- data/lib/raif/evals.rb +26 -0
- data/lib/raif/llm_registry.rb +33 -0
- data/lib/raif/migration_checker.rb +3 -3
- data/lib/raif/utils/colors.rb +23 -0
- data/lib/raif/utils.rb +1 -0
- data/lib/raif/version.rb +1 -1
- data/lib/raif.rb +4 -0
- data/spec/support/current_temperature_test_tool.rb +34 -0
- data/spec/support/test_conversation.rb +1 -1
- metadata +35 -3
data/lib/raif/evals/llm_judges/summarization.rb
ADDED
@@ -0,0 +1,166 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    module LlmJudges
+      class Summarization < Raif::Evals::LlmJudge
+        task_run_arg :original_content # the original content to evaluate the summary against
+        task_run_arg :summary # the summary to evaluate against the original content
+
+        json_response_schema do
+          object :coverage do
+            string :justification, description: "Justification for the score"
+            number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+          end
+
+          object :accuracy do
+            string :justification, description: "Justification for the score"
+            number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+          end
+
+          object :clarity do
+            string :justification, description: "Justification for the score"
+            number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+          end
+
+          object :conciseness do
+            string :justification, description: "Justification for the score"
+            number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+          end
+
+          object :overall do
+            string :justification, description: "Justification for the score"
+            number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+          end
+        end
+
+        def build_system_prompt
+          <<~PROMPT.strip
+            You are an impartial expert judge of summary quality. You'll be provided a original piece of content and its summary. Your job is to evaluate the summary against the original content based on the following criteria, and assign a score from 1 to 5 for each (5 = excellent, 1 = very poor):
+
+            **Coverage (Relevance & Completeness):** Does the summary capture all the important points of the original content?
+            - 5 = Excellent Coverage - Nearly all key points and essential details from the content are present in the summary, with no major omissions.
+            - 4 = Good Coverage - Most important points are included, but a minor detail or two might be missing.
+            - 3 = Fair Coverage - Some main points appear, but the summary misses or glosses over other important information.
+            - 2 = Poor Coverage - Many critical points from the content are missing; the summary is incomplete.
+            - 1 = Very Poor - The summary fails to include most of the content's main points (highly incomplete).
+
+            **Accuracy (Faithfulness to the Source):** Is the summary factually correct and free of hallucinations or misrepresentations of the content?
+            - 5 = Fully Accurate - All statements in the summary are correct and directly supported by the content. No errors or invented information.
+            - 4 = Mostly Accurate - The summary is generally accurate with perhaps one minor error or slight ambiguity, but no significant falsehoods.
+            - 3 = Some Inaccuracies - Contains a few errors or unsupported claims from the content, but overall captures the gist correctly.
+            - 2 = Mostly Inaccurate - Multiple statements in the summary are incorrect or not supported by the content.
+            - 1 = Completely Inaccurate - The summary seriously distorts or contradicts the content; many claims are false or not in the source.
+
+            **Clarity and Coherence:** Is the summary well-written and easy to understand? (Consider organization, flow, and whether it would make sense to a reader.)
+            - 5 = Very Clear & Coherent - The summary is logically organized, flows well, and would be easily understood by the target reader. No confusion or ambiguity.
+            - 4 = Mostly Clear - Readable and mostly well-structured, though a sentence or transition could be smoother.
+            - 3 = Somewhat Clear - The summary makes sense overall but might be disjointed or awkward in places, requiring effort to follow.
+            - 2 = Generally Unclear - Lacks coherence or has poor phrasing that makes it hard to follow the ideas.
+            - 1 = Very Poor Clarity - The summary is very confusing or poorly structured, making it hard to understand.
+
+            **Conciseness:** Is the summary succinct while still informative? (It should omit unnecessary detail but not at the expense of coverage.)
+            - 5 = Highly Concise - The summary is brief yet covers all important information (no fluff or redundancy).
+            - 4 = Concise - Generally to-the-point, with only minor redundancy or superfluous content.
+            - 3 = Moderately Concise - Some excess detail or repetition that could be trimmed, but not egregious.
+            - 2 = Verbose - Contains a lot of unnecessary detail or repeats points, making it longer than needed.
+            - 1 = Excessively Verbose - The summary is overly long or wordy, with much content that doesn't add value.
+          PROMPT
+        end
+
+        def build_prompt
+          <<~PROMPT.strip
+            # Instructions
+            Below is an original piece of content and its summary. Evaluate the summary against the original content based on our 4 criteria. For each, you should provide:
+            - A brief justification (1-3 sentences) noting any relevant observations (e.g. what was missing, incorrect, unclear, or well-done).
+            - A score from 1 to 5 (5 = excellent, 1 = very poor).
+
+            Finally, provide an **overall evaluation** of the summary, consisting of a brief justification (1-3 sentences) and a score from 1 to 5 (5 = excellent, 1 = very poor).
+
+            # Output Format
+            Format your output as a JSON object with the following keys:
+            {
+              "coverage": {
+                "justification": "...",
+                "score": 1-5
+              },
+              "accuracy": {
+                "justification": "...",
+                "score": 1-5
+              },
+              "clarity": {
+                "justification": "...",
+                "score": 1-5
+              },
+              "conciseness": {
+                "justification": "...",
+                "score": 1-5
+              },
+              "overall": {
+                "justification": "...",
+                "score": 1-5
+              }
+            }
+            #{additional_context_prompt}
+            # Original Article/Document
+            #{original_content}
+
+            # Summary to Evaluate
+            #{summary}
+          PROMPT
+        end
+
+        def overall_score
+          parsed_response["overall"]["score"] if completed?
+        end
+
+        def overall_justification
+          parsed_response["overall"]["justification"] if completed?
+        end
+
+        def coverage_score
+          parsed_response["coverage"]["score"] if completed?
+        end
+
+        def coverage_justification
+          parsed_response["coverage"]["justification"] if completed?
+        end
+
+        def accuracy_score
+          parsed_response["accuracy"]["score"] if completed?
+        end
+
+        def accuracy_justification
+          parsed_response["accuracy"]["justification"] if completed?
+        end
+
+        def clarity_score
+          parsed_response["clarity"]["score"] if completed?
+        end
+
+        def clarity_justification
+          parsed_response["clarity"]["justification"] if completed?
+        end
+
+        def conciseness_score
+          parsed_response["conciseness"]["score"] if completed?
+        end
+
+        def conciseness_justification
+          parsed_response["conciseness"]["justification"] if completed?
+        end
+
+        private
+
+        def additional_context_prompt
+          return if additional_context.blank?
+
+          <<~PROMPT
+            \n# Additional context:
+            #{additional_context}
+          PROMPT
+        end
+      end
+    end
+  end
+end
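The new judge exposes per-criterion readers (overall_score, coverage_justification, and so on) that only return values once the underlying task has completed. A minimal usage sketch, assuming the judge is invoked like other Raif tasks with its task_run_args passed as keyword arguments (article_text and candidate_summary are placeholder variables):

    # Hedged sketch, not part of the diff: run the summarization judge and read its scores.
    judge = Raif::Evals::LlmJudges::Summarization.run(
      original_content: article_text,  # assumption: task_run_args are accepted as keyword args
      summary: candidate_summary
    )

    if judge.completed?
      judge.overall_score           # Integer 1-5, from the "overall" object of the JSON response
      judge.coverage_justification  # free-text rationale for the coverage score
    end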
data/lib/raif/evals/run.rb
ADDED
@@ -0,0 +1,201 @@
+# frozen_string_literal: true
+
+require "fileutils"
+require "json"
+
+module Raif
+  module Evals
+    class Run
+      attr_reader :eval_sets, :results, :output
+
+      def initialize(file_paths: nil, output: $stdout)
+        @output = output
+        @results = {}
+
+        @eval_sets = if file_paths&.any?
+          load_eval_sets_from_files(file_paths)
+        else
+          discover_eval_sets
+        end
+      end
+
+      def execute
+        # Load setup file if it exists
+        setup_file = Rails.root.join("raif_evals", "setup.rb")
+        if File.exist?(setup_file)
+          require setup_file
+        else
+          output.puts Raif::Utils::Colors.red("\n\nNo setup file found. To set up Raif evals, run:\n")
+          output.puts Raif::Utils::Colors.red("bundle exec raif evals:setup\n")
+          exit 1
+        end
+
+        output.puts "\nStarting Raif Eval Run"
+        output.puts ""
+        output.puts "Raif.config.default_llm_model_key: #{Raif.config.default_llm_model_key}"
+        output.puts ""
+        output.puts "=" * 50
+
+        @eval_sets.each do |eval_set_entry|
+          eval_set_class, file_path, line_number = if eval_set_entry.is_a?(Hash)
+            [eval_set_entry[:class], eval_set_entry[:file_path], eval_set_entry[:line_number]]
+          else
+            [eval_set_entry, nil, nil]
+          end
+
+          if line_number
+            # Running specific eval by line number
+            output.puts "\nRunning #{eval_set_class.name} at line #{line_number}"
+            output.puts "-" * 50
+
+            eval_results = run_eval_at_line(eval_set_class, file_path, line_number)
+          else
+            # Running all evals in the set
+            output.puts "\nRunning #{eval_set_class.name}"
+            output.puts "-" * 50
+
+            eval_results = eval_set_class.run(output: output)
+          end
+
+          @results[eval_set_class.name] = eval_results.map(&:to_h)
+          passed_count = eval_results.count(&:passed?)
+          total_count = eval_results.count
+
+          output.puts "-" * 50
+          output.puts "#{eval_set_class.name}: #{passed_count}/#{total_count} evals passed"
+        end
+
+        export_results
+        print_summary
+      end
+
+      private
+
+      def load_eval_sets_from_files(file_paths)
+        eval_sets = []
+
+        file_paths.each do |f|
+          file_path = f[:file_path]
+          line_number = f[:line_number]
+
+          # Convert relative path to absolute
+          absolute_path = File.expand_path(file_path)
+
+          unless File.exist?(absolute_path)
+            output.puts Raif::Utils::Colors.red("Error: File not found: #{file_path}")
+            exit 1
+          end
+
+          subclasses_before = Raif::Evals::EvalSet.subclasses
+
+          require absolute_path
+
+          loaded_eval_sets = Raif::Evals::EvalSet.subclasses - subclasses_before
+          eval_set_class = loaded_eval_sets.first
+
+          eval_set_entry = { class: eval_set_class, file_path: absolute_path }
+          eval_set_entry[:line_number] = line_number if line_number
+
+          eval_sets << eval_set_entry
+        end
+
+        eval_sets
+      end
+
+      def run_eval_at_line(eval_set_class, file_path, line_number)
+        target_eval = eval_set_class.evals.find{|e| e[:definition_line_number] == line_number }
+
+        if target_eval.nil?
+          output.puts Raif::Utils::Colors.red("Error: No eval block found at line #{line_number}")
+          return []
+        end
+
+        instance = eval_set_class.new(output: output)
+        [instance.run_eval(target_eval)]
+      end
+
+      def discover_eval_sets
+        eval_sets_dir = Rails.root.join("raif_evals", "eval_sets")
+        return [] unless eval_sets_dir.exist?
+
+        Dir.glob(eval_sets_dir.join("**", "*_eval_set.rb")).map do |file|
+          relative_path = Pathname.new(file).relative_path_from(Rails.root)
+          require Rails.root.join(relative_path)
+
+          # Extract the path components after raif_evals/eval_sets
+          path_from_eval_sets = Pathname.new(file).relative_path_from(eval_sets_dir)
+          path_parts = path_from_eval_sets.dirname.to_s.split("/")
+
+          # Remove "." if it's the only element (meaning file is in eval_sets root)
+          path_parts = [] if path_parts == ["."]
+
+          # Build the full class name
+          class_name = File.basename(file, ".rb").camelize
+          namespace_parts = ["Raif", "Evals"] + path_parts.map(&:camelize)
+          full_class_name = (namespace_parts + [class_name]).join("::")
+
+          full_class_name.constantize
+        end.select { |klass| klass < Raif::Evals::EvalSet }
+      end
+
+      def export_results
+        results_dir = Rails.root.join("raif_evals", "results")
+        FileUtils.mkdir_p(results_dir)
+
+        timestamp = Time.current.strftime("%Y%m%d_%H%M%S")
+        filename = results_dir.join("eval_run_#{timestamp}.json")
+
+        File.write(filename, JSON.pretty_generate({
+          run_at: Time.current.iso8601,
+          results: @results,
+          summary: summary_data
+        }))
+
+        output.puts "\nResults exported to: #{filename}"
+      end
+
+      def summary_data
+        total_eval_sets = @results.count
+        total_evals = @results.values.sum(&:count)
+        passed_evals = @results.values.sum { |evals| evals.count { |e| e[:passed] } }
+
+        total_expectations = @results.values.sum do |evals|
+          evals.sum { |e| e[:expectation_results].count }
+        end
+
+        passed_expectations = @results.values.sum do |evals|
+          evals.sum { |e| e[:expectation_results].count { |r| r[:status] == :passed } }
+        end
+
+        {
+          total_eval_sets: total_eval_sets,
+          total_evals: total_evals,
+          passed_evals: passed_evals,
+          total_expectations: total_expectations,
+          passed_expectations: passed_expectations
+        }
+      end
+
+      def print_summary
+        data = summary_data
+
+        output.puts ""
+        output.puts "\n" + "=" * 50
+        output.puts "SUMMARY"
+        output.puts "=" * 50
+        output.puts "Eval Sets: #{data[:total_eval_sets]}"
+        output.puts ""
+        output.puts "Evals:"
+        output.puts " #{data[:total_evals]} total"
+        output.puts Raif::Utils::Colors.green(" #{data[:passed_evals]} passed")
+        output.puts Raif::Utils::Colors.red(" #{data[:total_evals] - data[:passed_evals]} failed")
+        output.puts ""
+        output.puts "Expectations:"
+        output.puts " #{data[:total_expectations]} total"
+        output.puts Raif::Utils::Colors.green(" #{data[:passed_expectations]} passed")
+        output.puts Raif::Utils::Colors.red(" #{data[:total_expectations] - data[:passed_expectations]} failed")
+        output.puts ""
+      end
+    end
+  end
+end
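Raif::Evals::Run can also be driven directly from Ruby; the new `raif evals` CLI (data/lib/raif/cli/evals.rb) presumably wraps this class. A minimal sketch, assuming a Rails app that has run the evals setup generator (the eval set path below is a hypothetical example):

    # Hedged sketch, not part of the diff: run a single eval set file, optionally
    # targeting the eval defined at a specific line via :line_number.
    run = Raif::Evals::Run.new(
      file_paths: [
        { file_path: "raif_evals/eval_sets/tasks/document_summary_eval_set.rb", line_number: 12 }
      ]
    )
    run.execute  # prints progress and writes raif_evals/results/eval_run_<timestamp>.json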
data/lib/raif/evals/scoring_rubric.rb
ADDED
@@ -0,0 +1,174 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    # ScoringRubric provides a standardized way to define evaluation criteria with
+    # multiple scoring levels. Each level can define either a score range or a single
+    # score value, along with descriptive text explaining what qualifies for that score.
+    #
+    # @example Creating a custom rubric
+    #   rubric = ScoringRubric.new(
+    #     name: :technical_accuracy,
+    #     description: "Evaluates technical correctness and precision",
+    #     levels: [
+    #       { score_range: (9..10), description: "Technically perfect with no errors" },
+    #       { score_range: (7..8), description: "Mostly correct with minor technical issues" },
+    #       { score_range: (5..6), description: "Generally correct but some technical problems" },
+    #       { score_range: (3..4), description: "Significant technical errors present" },
+    #       { score_range: (0..2), description: "Technically incorrect or misleading" }
+    #     ]
+    #   )
+    #
+    # @example Integer scoring levels
+    #   rubric = ScoringRubric.new(
+    #     name: :technical_accuracy,
+    #     description: "Evaluates technical correctness and precision",
+    #     levels: [
+    #       { score: 5, description: "Technically perfect with no errors" },
+    #       { score: 4, description: "Mostly correct with minor technical issues" },
+    #       { score: 3, description: "Generally correct but some technical problems" },
+    #       { score: 2, description: "Significant technical errors present" },
+    #       { score: 1, description: "Mostly incorrect or misleading" },
+    #       { score: 0, description: "Completely incorrect or misleading" }
+    #     ]
+    #   )
+    #
+    # @example Using built-in rubrics
+    #   accuracy_rubric = ScoringRubric.accuracy
+    #   helpfulness_rubric = ScoringRubric.helpfulness
+    #   clarity_rubric = ScoringRubric.clarity
+    #
+    class ScoringRubric
+      # @return [Symbol] The rubric's identifier name
+      attr_reader :name
+      # @return [String] Human-readable description of what this rubric evaluates
+      attr_reader :description
+      # @return [Array<Hash>] Array of scoring level definitions
+      attr_reader :levels
+
+      # Creates a new ScoringRubric with the specified criteria.
+      #
+      # @param name [Symbol] Identifier for this rubric (e.g., :accuracy, :helpfulness)
+      # @param description [String] Human-readable description of what this rubric evaluates
+      # @param levels [Array<Hash>] Array of scoring level definitions. Each level must contain
+      #   either :score (Integer) or :score_range (Range), plus :description (String)
+      def initialize(name:, description:, levels:)
+        @name = name
+        @description = description
+        @levels = levels
+      end
+
+      # Converts the rubric into a formatted string suitable for LLM prompts.
+      #
+      # The output includes the rubric description followed by a detailed breakdown
+      # of all scoring levels with their criteria.
+      #
+      # @return [String] Formatted rubric text ready for inclusion in prompts
+      #
+      # @example Output format
+      #   "Evaluates factual correctness and precision
+      #
+      #   Scoring levels:
+      #   - 9-10: Completely accurate with no errors
+      #   - 7-8: Mostly accurate with minor imprecisions
+      #   - 5-6: Generally accurate but some notable errors"
+      #
+      # @raise [ArgumentError] If a level doesn't contain :score or :score_range
+      def to_prompt
+        prompt = "#{description}\n\nScoring levels:\n"
+
+        levels.each do |level|
+          if level.key?(:score)
+            score = level[:score]
+            prompt += "- #{score}: #{level[:description]}\n"
+          else
+            range = level[:score_range]
+            min, max = case range
+            when Range
+              [range.begin, range.exclude_end? ? range.end - 1 : range.end]
+            else
+              raise ArgumentError, "level must include :score or :score_range (Range)"
+            end
+            prompt += "- #{min}-#{max}: #{level[:description]}\n"
+          end
+        end
+
+        prompt.strip
+      end
+
+      class << self
+        # Creates a rubric for evaluating factual accuracy and correctness.
+        #
+        # This rubric focuses on whether information is factually correct,
+        # precise, and free from errors or misconceptions.
+        #
+        # @return [ScoringRubric] Pre-configured accuracy rubric (1-5 scale)
+        #
+        # @example
+        #   rubric = ScoringRubric.accuracy
+        #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+        def accuracy
+          new(
+            name: :accuracy,
+            description: "Evaluates factual correctness and precision",
+            levels: [
+              { score: 5, description: "Completely accurate with no errors" },
+              { score: 4, description: "Mostly accurate with minor imprecisions" },
+              { score: 3, description: "Generally accurate but some notable errors" },
+              { score: 2, description: "Significant inaccuracies present" },
+              { score: 1, description: "Mostly or entirely inaccurate" }
+            ]
+          )
+        end
+
+        # Creates a rubric for evaluating how well content addresses user needs.
+        #
+        # This rubric assesses whether the response is useful, relevant, and
+        # effectively helps the user accomplish their goals.
+        #
+        # @return [ScoringRubric] Pre-configured helpfulness rubric (1-5 scale)
+        #
+        # @example
+        #   rubric = ScoringRubric.helpfulness
+        #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+        def helpfulness
+          new(
+            name: :helpfulness,
+            description: "Evaluates how well the response addresses user needs",
+            levels: [
+              { score: 5, description: "Extremely helpful, fully addresses the need" },
+              { score: 4, description: "Very helpful with good coverage" },
+              { score: 3, description: "Moderately helpful but missing some aspects" },
+              { score: 2, description: "Somewhat helpful but significant gaps" },
+              { score: 1, description: "Not helpful or misleading" }
+            ]
+          )
+        end
+
+        # Creates a rubric for evaluating clarity and comprehensibility.
+        #
+        # This rubric focuses on how easy content is to understand, whether
+        # it's well-organized, and if the language is appropriate for the audience.
+        #
+        # @return [ScoringRubric] Pre-configured clarity rubric (1-5 scale)
+        #
+        # @example
+        #   rubric = ScoringRubric.clarity
+        #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+        def clarity
+          new(
+            name: :clarity,
+            description: "Evaluates clarity and comprehensibility",
+            levels: [
+              { score: 5, description: "Crystal clear and easy to understand" },
+              { score: 4, description: "Clear with minor ambiguities" },
+              { score: 3, description: "Generally clear but some confusion" },
+              { score: 2, description: "Unclear in significant ways" },
+              { score: 1, description: "Very unclear or incomprehensible" }
+            ]
+          )
+        end
+      end
+    end
+  end
+end
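One behavior worth noting in to_prompt: an exclusive Range is rendered with its end reduced by one, so (4...8) prints as "4-7". A small illustrative sketch (the :depth rubric is hypothetical, not part of the gem):

    rubric = Raif::Evals::ScoringRubric.new(
      name: :depth,
      description: "Evaluates depth of analysis",
      levels: [
        { score_range: (8..10), description: "Thorough, well-supported analysis" },
        { score_range: (4...8), description: "Adequate but uneven coverage" },  # exclusive end renders as "4-7"
        { score_range: (0..3), description: "Superficial treatment" }
      ]
    )

    puts rubric.to_prompt
    # Evaluates depth of analysis
    #
    # Scoring levels:
    # - 8-10: Thorough, well-supported analysis
    # - 4-7: Adequate but uneven coverage
    # - 0-3: Superficial treatment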
data/lib/raif/evals.rb
ADDED
@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+
+require "raif/evals/expectation_result"
+require "raif/evals/eval"
+require "raif/evals/eval_set"
+require "raif/evals/run"
+require "raif/evals/llm_judge"
+require "raif/evals/llm_judges/binary"
+require "raif/evals/llm_judges/comparative"
+require "raif/evals/llm_judges/scored"
+require "raif/evals/llm_judges/summarization"
+require "raif/evals/scoring_rubric"
+
+module Raif
+  module Evals
+    # Namespace modules for organizing eval sets
+    module Tasks
+    end
+
+    module Conversations
+    end
+
+    module Agents
+    end
+  end
+end
data/lib/raif/llm_registry.rb
CHANGED
@@ -113,6 +113,27 @@ module Raif
         output_token_cost: 4.4 / 1_000_000,
         model_provider_settings: { supports_temperature: false },
       },
+      {
+        key: :open_ai_gpt_5,
+        api_name: "gpt-5",
+        input_token_cost: 1.25 / 1_000_000,
+        output_token_cost: 10.0 / 1_000_000,
+        model_provider_settings: { supports_temperature: false },
+      },
+      {
+        key: :open_ai_gpt_5_mini,
+        api_name: "gpt-5-mini",
+        input_token_cost: 0.25 / 1_000_000,
+        output_token_cost: 2.0 / 1_000_000,
+        model_provider_settings: { supports_temperature: false },
+      },
+      {
+        key: :open_ai_gpt_5_nano,
+        api_name: "gpt-5-nano",
+        input_token_cost: 0.05 / 1_000_000,
+        output_token_cost: 0.4 / 1_000_000,
+        model_provider_settings: { supports_temperature: false },
+      }
     ]

     open_ai_responses_models = open_ai_models.dup.map.with_index do |model, _index|
@@ -321,6 +342,18 @@
         input_token_cost: 0.27 / 1_000_000,
         output_token_cost: 1.1 / 1_000_000,
       },
+      {
+        key: :open_router_open_ai_gpt_oss_120b,
+        api_name: "gpt-oss-120b",
+        input_token_cost: 0.15 / 1_000_000,
+        output_token_cost: 0.6 / 1_000_000,
+      },
+      {
+        key: :open_router_open_ai_gpt_oss_20b,
+        api_name: "gpt-oss-20b",
+        input_token_cost: 0.05 / 1_000_000,
+        output_token_cost: 0.2 / 1_000_000,
+      }
     ]
   }
 end
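The new registry entries are opt-in. A sketch of selecting one as the default model, assuming the usual Raif initializer block (the initializer path is the host app's choice):

    # Hedged sketch, not part of the diff (config/initializers/raif.rb is an assumption):
    Raif.configure do |config|
      config.default_llm_model_key = :open_ai_gpt_5_mini
    end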
data/lib/raif/migration_checker.rb
CHANGED
@@ -53,8 +53,7 @@ module Raif
     end

     def build_warning_message(uninstalled_migration_names)
-      <<~WARNING
-        \e[33m
+      msg = <<~WARNING
        ⚠️ RAIF MIGRATION WARNING ⚠️

        The following Raif migrations have not been run in your application:
@@ -66,8 +65,9 @@ module Raif
        rails raif:install:migrations
        rails db:migrate

-        \e[0m
      WARNING
+
+      Raif::Utils::Colors.yellow(msg)
     end
   end
 end
data/lib/raif/utils/colors.rb
ADDED
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Raif
+  module Utils
+    module Colors
+      def self.green(text)
+        "\e[32m#{text}\e[0m"
+      end
+
+      def self.red(text)
+        "\e[31m#{text}\e[0m"
+      end
+
+      def self.yellow(text)
+        "\e[33m#{text}\e[0m"
+      end
+
+      def self.blue(text)
+        "\e[34m#{text}\e[0m"
+      end
+    end
+  end
+end
data/lib/raif/utils.rb
CHANGED
data/lib/raif/version.rb
CHANGED