raif 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +4 -4
  2. data/app/jobs/raif/conversation_entry_job.rb +1 -1
  3. data/app/models/raif/agents/re_act_step.rb +1 -2
  4. data/app/models/raif/concerns/has_llm.rb +1 -1
  5. data/app/models/raif/concerns/task_run_args.rb +62 -0
  6. data/app/models/raif/conversation.rb +5 -1
  7. data/app/models/raif/conversation_entry.rb +6 -8
  8. data/app/models/raif/llm.rb +1 -1
  9. data/app/models/raif/llms/open_router.rb +3 -1
  10. data/app/models/raif/task.rb +22 -9
  11. data/app/views/raif/conversation_entries/_form.html.erb +1 -1
  12. data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
  13. data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
  14. data/config/locales/en.yml +8 -0
  15. data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
  16. data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
  17. data/exe/raif +7 -0
  18. data/lib/generators/raif/agent/agent_generator.rb +22 -7
  19. data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
  20. data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
  21. data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
  22. data/lib/generators/raif/base_generator.rb +19 -0
  23. data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
  24. data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
  25. data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
  26. data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
  27. data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
  28. data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
  29. data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
  30. data/lib/generators/raif/install/install_generator.rb +15 -0
  31. data/lib/generators/raif/install/templates/initializer.rb +11 -0
  32. data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -5
  33. data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -78
  34. data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +1 -1
  35. data/lib/generators/raif/task/task_generator.rb +22 -3
  36. data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
  37. data/lib/generators/raif/task/templates/task.rb.tt +55 -59
  38. data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
  39. data/lib/raif/cli/base.rb +39 -0
  40. data/lib/raif/cli/evals.rb +47 -0
  41. data/lib/raif/cli/evals_setup.rb +27 -0
  42. data/lib/raif/cli.rb +67 -0
  43. data/lib/raif/configuration.rb +20 -6
  44. data/lib/raif/evals/eval.rb +30 -0
  45. data/lib/raif/evals/eval_set.rb +111 -0
  46. data/lib/raif/evals/eval_sets/expectations.rb +53 -0
  47. data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
  48. data/lib/raif/evals/expectation_result.rb +39 -0
  49. data/lib/raif/evals/llm_judge.rb +32 -0
  50. data/lib/raif/evals/llm_judges/binary.rb +94 -0
  51. data/lib/raif/evals/llm_judges/comparative.rb +89 -0
  52. data/lib/raif/evals/llm_judges/scored.rb +63 -0
  53. data/lib/raif/evals/llm_judges/summarization.rb +166 -0
  54. data/lib/raif/evals/run.rb +201 -0
  55. data/lib/raif/evals/scoring_rubric.rb +174 -0
  56. data/lib/raif/evals.rb +26 -0
  57. data/lib/raif/llm_registry.rb +33 -0
  58. data/lib/raif/migration_checker.rb +3 -3
  59. data/lib/raif/utils/colors.rb +23 -0
  60. data/lib/raif/utils.rb +1 -0
  61. data/lib/raif/version.rb +1 -1
  62. data/lib/raif.rb +4 -0
  63. data/spec/support/current_temperature_test_tool.rb +34 -0
  64. data/spec/support/test_conversation.rb +1 -1
  65. metadata +35 -3
data/lib/raif/evals/llm_judges/summarization.rb ADDED
@@ -0,0 +1,166 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Summarization < Raif::Evals::LlmJudge
+         task_run_arg :original_content # the original content to evaluate the summary against
+         task_run_arg :summary # the summary to evaluate against the original content
+
+         json_response_schema do
+           object :coverage do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :accuracy do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :clarity do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :conciseness do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :overall do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an impartial expert judge of summary quality. You'll be provided an original piece of content and its summary. Your job is to evaluate the summary against the original content based on the following criteria, and assign a score from 1 to 5 for each (5 = excellent, 1 = very poor):
+
+             **Coverage (Relevance & Completeness):** Does the summary capture all the important points of the original content?
+             - 5 = Excellent Coverage - Nearly all key points and essential details from the content are present in the summary, with no major omissions.
+             - 4 = Good Coverage - Most important points are included, but a minor detail or two might be missing.
+             - 3 = Fair Coverage - Some main points appear, but the summary misses or glosses over other important information.
+             - 2 = Poor Coverage - Many critical points from the content are missing; the summary is incomplete.
+             - 1 = Very Poor - The summary fails to include most of the content's main points (highly incomplete).
+
+             **Accuracy (Faithfulness to the Source):** Is the summary factually correct and free of hallucinations or misrepresentations of the content?
+             - 5 = Fully Accurate - All statements in the summary are correct and directly supported by the content. No errors or invented information.
+             - 4 = Mostly Accurate - The summary is generally accurate with perhaps one minor error or slight ambiguity, but no significant falsehoods.
+             - 3 = Some Inaccuracies - Contains a few errors or unsupported claims from the content, but overall captures the gist correctly.
+             - 2 = Mostly Inaccurate - Multiple statements in the summary are incorrect or not supported by the content.
+             - 1 = Completely Inaccurate - The summary seriously distorts or contradicts the content; many claims are false or not in the source.
+
+             **Clarity and Coherence:** Is the summary well-written and easy to understand? (Consider organization, flow, and whether it would make sense to a reader.)
+             - 5 = Very Clear & Coherent - The summary is logically organized, flows well, and would be easily understood by the target reader. No confusion or ambiguity.
+             - 4 = Mostly Clear - Readable and mostly well-structured, though a sentence or transition could be smoother.
+             - 3 = Somewhat Clear - The summary makes sense overall but might be disjointed or awkward in places, requiring effort to follow.
+             - 2 = Generally Unclear - Lacks coherence or has poor phrasing that makes it hard to follow the ideas.
+             - 1 = Very Poor Clarity - The summary is very confusing or poorly structured, making it hard to understand.
+
+             **Conciseness:** Is the summary succinct while still informative? (It should omit unnecessary detail but not at the expense of coverage.)
+             - 5 = Highly Concise - The summary is brief yet covers all important information (no fluff or redundancy).
+             - 4 = Concise - Generally to-the-point, with only minor redundancy or superfluous content.
+             - 3 = Moderately Concise - Some excess detail or repetition that could be trimmed, but not egregious.
+             - 2 = Verbose - Contains a lot of unnecessary detail or repeats points, making it longer than needed.
+             - 1 = Excessively Verbose - The summary is overly long or wordy, with much content that doesn't add value.
+           PROMPT
+         end
+
+         def build_prompt
+           <<~PROMPT.strip
+             # Instructions
+             Below is an original piece of content and its summary. Evaluate the summary against the original content based on our 4 criteria. For each, you should provide:
+             - A brief justification (1-3 sentences) noting any relevant observations (e.g. what was missing, incorrect, unclear, or well-done).
+             - A score from 1 to 5 (5 = excellent, 1 = very poor).
+
+             Finally, provide an **overall evaluation** of the summary, consisting of a brief justification (1-3 sentences) and a score from 1 to 5 (5 = excellent, 1 = very poor).
+
+             # Output Format
+             Format your output as a JSON object with the following keys:
+             {
+               "coverage": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "accuracy": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "clarity": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "conciseness": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "overall": {
+                 "justification": "...",
+                 "score": 1-5
+               }
+             }
+             #{additional_context_prompt}
+             # Original Article/Document
+             #{original_content}
+
+             # Summary to Evaluate
+             #{summary}
+           PROMPT
+         end
+
+         def overall_score
+           parsed_response["overall"]["score"] if completed?
+         end
+
+         def overall_justification
+           parsed_response["overall"]["justification"] if completed?
+         end
+
+         def coverage_score
+           parsed_response["coverage"]["score"] if completed?
+         end
+
+         def coverage_justification
+           parsed_response["coverage"]["justification"] if completed?
+         end
+
+         def accuracy_score
+           parsed_response["accuracy"]["score"] if completed?
+         end
+
+         def accuracy_justification
+           parsed_response["accuracy"]["justification"] if completed?
+         end
+
+         def clarity_score
+           parsed_response["clarity"]["score"] if completed?
+         end
+
+         def clarity_justification
+           parsed_response["clarity"]["justification"] if completed?
+         end
+
+         def conciseness_score
+           parsed_response["conciseness"]["score"] if completed?
+         end
+
+         def conciseness_justification
+           parsed_response["conciseness"]["justification"] if completed?
+         end
+
+         private
+
+         def additional_context_prompt
+           return if additional_context.blank?
+
+           <<~PROMPT
+             \n# Additional context:
+             #{additional_context}
+           PROMPT
+         end
+       end
+     end
+   end
+ end
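For orientation, a minimal usage sketch (not part of the diff): this assumes the judge is run through Raif's standard Task.run interface, with the task_run_args above passed as keyword arguments; article_text and candidate_summary are hypothetical variables.

  judge = Raif::Evals::LlmJudges::Summarization.run(
    original_content: article_text,  # hypothetical: the source document
    summary: candidate_summary       # hypothetical: the summary under test
  )

  if judge.completed?
    judge.overall_score          # => 1..5, from the "overall" key of the JSON response
    judge.coverage_justification # => the judge's written rationale for the coverage score
  end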
data/lib/raif/evals/run.rb ADDED
@@ -0,0 +1,201 @@
+ # frozen_string_literal: true
+
+ require "fileutils"
+ require "json"
+
+ module Raif
+   module Evals
+     class Run
+       attr_reader :eval_sets, :results, :output
+
+       def initialize(file_paths: nil, output: $stdout)
+         @output = output
+         @results = {}
+
+         @eval_sets = if file_paths&.any?
+           load_eval_sets_from_files(file_paths)
+         else
+           discover_eval_sets
+         end
+       end
+
+       def execute
+         # Load setup file if it exists
+         setup_file = Rails.root.join("raif_evals", "setup.rb")
+         if File.exist?(setup_file)
+           require setup_file
+         else
+           output.puts Raif::Utils::Colors.red("\n\nNo setup file found. To set up Raif evals, run:\n")
+           output.puts Raif::Utils::Colors.red("bundle exec raif evals:setup\n")
+           exit 1
+         end
+
+         output.puts "\nStarting Raif Eval Run"
+         output.puts ""
+         output.puts "Raif.config.default_llm_model_key: #{Raif.config.default_llm_model_key}"
+         output.puts ""
+         output.puts "=" * 50
+
+         @eval_sets.each do |eval_set_entry|
+           eval_set_class, file_path, line_number = if eval_set_entry.is_a?(Hash)
+             [eval_set_entry[:class], eval_set_entry[:file_path], eval_set_entry[:line_number]]
+           else
+             [eval_set_entry, nil, nil]
+           end
+
+           if line_number
+             # Running specific eval by line number
+             output.puts "\nRunning #{eval_set_class.name} at line #{line_number}"
+             output.puts "-" * 50
+
+             eval_results = run_eval_at_line(eval_set_class, file_path, line_number)
+           else
+             # Running all evals in the set
+             output.puts "\nRunning #{eval_set_class.name}"
+             output.puts "-" * 50
+
+             eval_results = eval_set_class.run(output: output)
+           end
+
+           @results[eval_set_class.name] = eval_results.map(&:to_h)
+           passed_count = eval_results.count(&:passed?)
+           total_count = eval_results.count
+
+           output.puts "-" * 50
+           output.puts "#{eval_set_class.name}: #{passed_count}/#{total_count} evals passed"
+         end
+
+         export_results
+         print_summary
+       end
+
+       private
+
+       def load_eval_sets_from_files(file_paths)
+         eval_sets = []
+
+         file_paths.each do |f|
+           file_path = f[:file_path]
+           line_number = f[:line_number]
+
+           # Convert relative path to absolute
+           absolute_path = File.expand_path(file_path)
+
+           unless File.exist?(absolute_path)
+             output.puts Raif::Utils::Colors.red("Error: File not found: #{file_path}")
+             exit 1
+           end
+
+           subclasses_before = Raif::Evals::EvalSet.subclasses
+
+           require absolute_path
+
+           loaded_eval_sets = Raif::Evals::EvalSet.subclasses - subclasses_before
+           eval_set_class = loaded_eval_sets.first
+
+           eval_set_entry = { class: eval_set_class, file_path: absolute_path }
+           eval_set_entry[:line_number] = line_number if line_number
+
+           eval_sets << eval_set_entry
+         end
+
+         eval_sets
+       end
+
+       def run_eval_at_line(eval_set_class, file_path, line_number)
+         target_eval = eval_set_class.evals.find { |e| e[:definition_line_number] == line_number }
+
+         if target_eval.nil?
+           output.puts Raif::Utils::Colors.red("Error: No eval block found at line #{line_number}")
+           return []
+         end
+
+         instance = eval_set_class.new(output: output)
+         [instance.run_eval(target_eval)]
+       end
+
+       def discover_eval_sets
+         eval_sets_dir = Rails.root.join("raif_evals", "eval_sets")
+         return [] unless eval_sets_dir.exist?
+
+         Dir.glob(eval_sets_dir.join("**", "*_eval_set.rb")).map do |file|
+           relative_path = Pathname.new(file).relative_path_from(Rails.root)
+           require Rails.root.join(relative_path)
+
+           # Extract the path components after raif_evals/eval_sets
+           path_from_eval_sets = Pathname.new(file).relative_path_from(eval_sets_dir)
+           path_parts = path_from_eval_sets.dirname.to_s.split("/")
+
+           # Remove "." if it's the only element (meaning file is in eval_sets root)
+           path_parts = [] if path_parts == ["."]
+
+           # Build the full class name
+           class_name = File.basename(file, ".rb").camelize
+           namespace_parts = ["Raif", "Evals"] + path_parts.map(&:camelize)
+           full_class_name = (namespace_parts + [class_name]).join("::")
+
+           full_class_name.constantize
+         end.select { |klass| klass < Raif::Evals::EvalSet }
+       end
+
+       def export_results
+         results_dir = Rails.root.join("raif_evals", "results")
+         FileUtils.mkdir_p(results_dir)
+
+         timestamp = Time.current.strftime("%Y%m%d_%H%M%S")
+         filename = results_dir.join("eval_run_#{timestamp}.json")
+
+         File.write(filename, JSON.pretty_generate({
+           run_at: Time.current.iso8601,
+           results: @results,
+           summary: summary_data
+         }))
+
+         output.puts "\nResults exported to: #{filename}"
+       end
+
+       def summary_data
+         total_eval_sets = @results.count
+         total_evals = @results.values.sum(&:count)
+         passed_evals = @results.values.sum { |evals| evals.count { |e| e[:passed] } }
+
+         total_expectations = @results.values.sum do |evals|
+           evals.sum { |e| e[:expectation_results].count }
+         end
+
+         passed_expectations = @results.values.sum do |evals|
+           evals.sum { |e| e[:expectation_results].count { |r| r[:status] == :passed } }
+         end
+
+         {
+           total_eval_sets: total_eval_sets,
+           total_evals: total_evals,
+           passed_evals: passed_evals,
+           total_expectations: total_expectations,
+           passed_expectations: passed_expectations
+         }
+       end
+
+       def print_summary
+         data = summary_data
+
+         output.puts ""
+         output.puts "\n" + "=" * 50
+         output.puts "SUMMARY"
+         output.puts "=" * 50
+         output.puts "Eval Sets: #{data[:total_eval_sets]}"
+         output.puts ""
+         output.puts "Evals:"
+         output.puts "  #{data[:total_evals]} total"
+         output.puts Raif::Utils::Colors.green("  #{data[:passed_evals]} passed")
+         output.puts Raif::Utils::Colors.red("  #{data[:total_evals] - data[:passed_evals]} failed")
+         output.puts ""
+         output.puts "Expectations:"
+         output.puts "  #{data[:total_expectations]} total"
+         output.puts Raif::Utils::Colors.green("  #{data[:passed_expectations]} passed")
+         output.puts Raif::Utils::Colors.red("  #{data[:total_expectations] - data[:passed_expectations]} failed")
+         output.puts ""
+       end
+     end
+   end
+ end
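A sketch of driving this runner directly, mirroring what the new `raif evals` CLI does; the eval set path and line number are hypothetical.

  run = Raif::Evals::Run.new(
    file_paths: [{ file_path: "raif_evals/eval_sets/tasks/my_task_eval_set.rb", line_number: 12 }]
  )

  # Loads raif_evals/setup.rb, runs only the eval defined at line 12, then writes
  # raif_evals/results/eval_run_<timestamp>.json and prints the pass/fail summary.
  run.execute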
data/lib/raif/evals/scoring_rubric.rb ADDED
@@ -0,0 +1,174 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     # ScoringRubric provides a standardized way to define evaluation criteria with
+     # multiple scoring levels. Each level can define either a score range or a single
+     # score value, along with descriptive text explaining what qualifies for that score.
+     #
+     # @example Creating a custom rubric
+     #   rubric = ScoringRubric.new(
+     #     name: :technical_accuracy,
+     #     description: "Evaluates technical correctness and precision",
+     #     levels: [
+     #       { score_range: (9..10), description: "Technically perfect with no errors" },
+     #       { score_range: (7..8), description: "Mostly correct with minor technical issues" },
+     #       { score_range: (5..6), description: "Generally correct but some technical problems" },
+     #       { score_range: (3..4), description: "Significant technical errors present" },
+     #       { score_range: (0..2), description: "Technically incorrect or misleading" }
+     #     ]
+     #   )
+     #
+     # @example Integer scoring levels
+     #   rubric = ScoringRubric.new(
+     #     name: :technical_accuracy,
+     #     description: "Evaluates technical correctness and precision",
+     #     levels: [
+     #       { score: 5, description: "Technically perfect with no errors" },
+     #       { score: 4, description: "Mostly correct with minor technical issues" },
+     #       { score: 3, description: "Generally correct but some technical problems" },
+     #       { score: 2, description: "Significant technical errors present" },
+     #       { score: 1, description: "Mostly incorrect or misleading" },
+     #       { score: 0, description: "Completely incorrect or misleading" }
+     #     ]
+     #   )
+     #
+     # @example Using built-in rubrics
+     #   accuracy_rubric = ScoringRubric.accuracy
+     #   helpfulness_rubric = ScoringRubric.helpfulness
+     #   clarity_rubric = ScoringRubric.clarity
+     #
+     class ScoringRubric
+       # @return [Symbol] The rubric's identifier name
+       attr_reader :name
+       # @return [String] Human-readable description of what this rubric evaluates
+       attr_reader :description
+       # @return [Array<Hash>] Array of scoring level definitions
+       attr_reader :levels
+
+       # Creates a new ScoringRubric with the specified criteria.
+       #
+       # @param name [Symbol] Identifier for this rubric (e.g., :accuracy, :helpfulness)
+       # @param description [String] Human-readable description of what this rubric evaluates
+       # @param levels [Array<Hash>] Array of scoring level definitions. Each level must contain
+       #   either :score (Integer) or :score_range (Range), plus :description (String)
+       def initialize(name:, description:, levels:)
+         @name = name
+         @description = description
+         @levels = levels
+       end
+
+       # Converts the rubric into a formatted string suitable for LLM prompts.
+       #
+       # The output includes the rubric description followed by a detailed breakdown
+       # of all scoring levels with their criteria.
+       #
+       # @return [String] Formatted rubric text ready for inclusion in prompts
+       #
+       # @example Output format
+       #   "Evaluates factual correctness and precision
+       #
+       #   Scoring levels:
+       #   - 9-10: Completely accurate with no errors
+       #   - 7-8: Mostly accurate with minor imprecisions
+       #   - 5-6: Generally accurate but some notable errors"
+       #
+       # @raise [ArgumentError] If a level doesn't contain :score or :score_range
+       def to_prompt
+         prompt = "#{description}\n\nScoring levels:\n"
+
+         levels.each do |level|
+           if level.key?(:score)
+             score = level[:score]
+             prompt += "- #{score}: #{level[:description]}\n"
+           else
+             range = level[:score_range]
+             min, max = case range
+             when Range
+               [range.begin, range.exclude_end? ? range.end - 1 : range.end]
+             else
+               raise ArgumentError, "level must include :score or :score_range (Range)"
+             end
+             prompt += "- #{min}-#{max}: #{level[:description]}\n"
+           end
+         end
+
+         prompt.strip
+       end
+
+       class << self
+         # Creates a rubric for evaluating factual accuracy and correctness.
+         #
+         # This rubric focuses on whether information is factually correct,
+         # precise, and free from errors or misconceptions.
+         #
+         # @return [ScoringRubric] Pre-configured accuracy rubric (1-5 scale)
+         #
+         # @example
+         #   rubric = ScoringRubric.accuracy
+         #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+         def accuracy
+           new(
+             name: :accuracy,
+             description: "Evaluates factual correctness and precision",
+             levels: [
+               { score: 5, description: "Completely accurate with no errors" },
+               { score: 4, description: "Mostly accurate with minor imprecisions" },
+               { score: 3, description: "Generally accurate but some notable errors" },
+               { score: 2, description: "Significant inaccuracies present" },
+               { score: 1, description: "Mostly or entirely inaccurate" }
+             ]
+           )
+         end
+
+         # Creates a rubric for evaluating how well content addresses user needs.
+         #
+         # This rubric assesses whether the response is useful, relevant, and
+         # effectively helps the user accomplish their goals.
+         #
+         # @return [ScoringRubric] Pre-configured helpfulness rubric (1-5 scale)
+         #
+         # @example
+         #   rubric = ScoringRubric.helpfulness
+         #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+         def helpfulness
+           new(
+             name: :helpfulness,
+             description: "Evaluates how well the response addresses user needs",
+             levels: [
+               { score: 5, description: "Extremely helpful, fully addresses the need" },
+               { score: 4, description: "Very helpful with good coverage" },
+               { score: 3, description: "Moderately helpful but missing some aspects" },
+               { score: 2, description: "Somewhat helpful but significant gaps" },
+               { score: 1, description: "Not helpful or misleading" }
+             ]
+           )
+         end
+
+         # Creates a rubric for evaluating clarity and comprehensibility.
+         #
+         # This rubric focuses on how easy content is to understand, whether
+         # it's well-organized, and if the language is appropriate for the audience.
+         #
+         # @return [ScoringRubric] Pre-configured clarity rubric (1-5 scale)
+         #
+         # @example
+         #   rubric = ScoringRubric.clarity
+         #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+         def clarity
+           new(
+             name: :clarity,
+             description: "Evaluates clarity and comprehensibility",
+             levels: [
+               { score: 5, description: "Crystal clear and easy to understand" },
+               { score: 4, description: "Clear with minor ambiguities" },
+               { score: 3, description: "Generally clear but some confusion" },
+               { score: 2, description: "Unclear in significant ways" },
+               { score: 1, description: "Very unclear or incomprehensible" }
+             ]
+           )
+         end
+       end
+     end
+   end
+ end
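As a quick illustration of the class above, to_prompt on a built-in rubric renders the description plus one line per level:

  rubric = Raif::Evals::ScoringRubric.accuracy
  puts rubric.to_prompt
  # Evaluates factual correctness and precision
  #
  # Scoring levels:
  # - 5: Completely accurate with no errors
  # - 4: Mostly accurate with minor imprecisions
  # - 3: Generally accurate but some notable errors
  # - 2: Significant inaccuracies present
  # - 1: Mostly or entirely inaccurate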
data/lib/raif/evals.rb ADDED
@@ -0,0 +1,26 @@
+ # frozen_string_literal: true
+
+ require "raif/evals/expectation_result"
+ require "raif/evals/eval"
+ require "raif/evals/eval_set"
+ require "raif/evals/run"
+ require "raif/evals/llm_judge"
+ require "raif/evals/llm_judges/binary"
+ require "raif/evals/llm_judges/comparative"
+ require "raif/evals/llm_judges/scored"
+ require "raif/evals/llm_judges/summarization"
+ require "raif/evals/scoring_rubric"
+
+ module Raif
+   module Evals
+     # Namespace modules for organizing eval sets
+     module Tasks
+     end
+
+     module Conversations
+     end
+
+     module Agents
+     end
+   end
+ end
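These empty namespace modules match the directory-based constant lookup in Run#discover_eval_sets above: an eval set under raif_evals/eval_sets/tasks/ is expected to be namespaced accordingly. A hypothetical example (class and file name are illustrative):

  # raif_evals/eval_sets/tasks/my_task_eval_set.rb
  module Raif
    module Evals
      module Tasks
        class MyTaskEvalSet < Raif::Evals::EvalSet
          # eval blocks defined here are discovered and run by `raif evals`
        end
      end
    end
  end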
data/lib/raif/llm_registry.rb CHANGED
@@ -113,6 +113,27 @@ module Raif
          output_token_cost: 4.4 / 1_000_000,
          model_provider_settings: { supports_temperature: false },
        },
+       {
+         key: :open_ai_gpt_5,
+         api_name: "gpt-5",
+         input_token_cost: 1.25 / 1_000_000,
+         output_token_cost: 10.0 / 1_000_000,
+         model_provider_settings: { supports_temperature: false },
+       },
+       {
+         key: :open_ai_gpt_5_mini,
+         api_name: "gpt-5-mini",
+         input_token_cost: 0.25 / 1_000_000,
+         output_token_cost: 2.0 / 1_000_000,
+         model_provider_settings: { supports_temperature: false },
+       },
+       {
+         key: :open_ai_gpt_5_nano,
+         api_name: "gpt-5-nano",
+         input_token_cost: 0.05 / 1_000_000,
+         output_token_cost: 0.4 / 1_000_000,
+         model_provider_settings: { supports_temperature: false },
+       }
      ]

      open_ai_responses_models = open_ai_models.dup.map.with_index do |model, _index|
@@ -321,6 +342,18 @@ module Raif
          input_token_cost: 0.27 / 1_000_000,
          output_token_cost: 1.1 / 1_000_000,
        },
+       {
+         key: :open_router_open_ai_gpt_oss_120b,
+         api_name: "gpt-oss-120b",
+         input_token_cost: 0.15 / 1_000_000,
+         output_token_cost: 0.6 / 1_000_000,
+       },
+       {
+         key: :open_router_open_ai_gpt_oss_20b,
+         api_name: "gpt-oss-20b",
+         input_token_cost: 0.05 / 1_000_000,
+         output_token_cost: 0.2 / 1_000_000,
+       }
      ]
    }
  end
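To adopt one of the newly registered models, the key presumably goes into the existing initializer setting that the eval runner echoes at startup; a hedged sketch, assuming Raif's standard configuration block:

  Raif.configure do |config|
    config.default_llm_model_key = :open_ai_gpt_5_mini
  end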
data/lib/raif/migration_checker.rb CHANGED
@@ -53,8 +53,7 @@ module Raif
      end

      def build_warning_message(uninstalled_migration_names)
-       <<~WARNING
-         \e[33m
+       msg = <<~WARNING
          ⚠️ RAIF MIGRATION WARNING ⚠️

          The following Raif migrations have not been run in your application:
@@ -66,8 +65,9 @@ module Raif
          rails raif:install:migrations
          rails db:migrate

-         \e[0m
        WARNING
+
+       Raif::Utils::Colors.yellow(msg)
      end
    end
  end
data/lib/raif/utils/colors.rb ADDED
@@ -0,0 +1,23 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Utils
+     module Colors
+       def self.green(text)
+         "\e[32m#{text}\e[0m"
+       end
+
+       def self.red(text)
+         "\e[31m#{text}\e[0m"
+       end
+
+       def self.yellow(text)
+         "\e[33m#{text}\e[0m"
+       end
+
+       def self.blue(text)
+         "\e[34m#{text}\e[0m"
+       end
+     end
+   end
+ end
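The helpers simply wrap text in ANSI escape codes, for example:

  Raif::Utils::Colors.green("12 passed")  # => "\e[32m12 passed\e[0m"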
data/lib/raif/utils.rb CHANGED
@@ -4,4 +4,5 @@ module Raif::Utils
    require "raif/utils/readable_content_extractor"
    require "raif/utils/html_to_markdown_converter"
    require "raif/utils/html_fragment_processor"
+   require "raif/utils/colors"
  end
data/lib/raif/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Raif
-   VERSION = "1.2.2"
+   VERSION = "1.3.0"
  end
data/lib/raif.rb CHANGED
@@ -37,4 +37,8 @@ module Raif
    def self.logger
      @logger ||= Rails.logger
    end
+
+   def self.running_evals?
+     ENV["RAIF_RUNNING_EVALS"] == "true"
+   end
  end
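The new predicate lets application code branch when the eval runner is in charge, assuming the CLI sets RAIF_RUNNING_EVALS=true before booting (a hedged sketch):

  if Raif.running_evals?
    # e.g. stub slow or billable external services in raif_evals/setup.rb
  end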