raif 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. checksums.yaml +4 -4
  2. data/README.md +29 -935
  3. data/app/assets/builds/raif_admin.css +5 -1
  4. data/app/assets/images/raif-logo-white.svg +8 -0
  5. data/app/assets/stylesheets/raif_admin.scss +4 -0
  6. data/app/jobs/raif/conversation_entry_job.rb +1 -1
  7. data/app/models/raif/agents/re_act_step.rb +1 -2
  8. data/app/models/raif/concerns/has_llm.rb +1 -1
  9. data/app/models/raif/concerns/task_run_args.rb +62 -0
  10. data/app/models/raif/conversation.rb +8 -0
  11. data/app/models/raif/conversation_entry.rb +6 -9
  12. data/app/models/raif/llm.rb +1 -1
  13. data/app/models/raif/llms/open_router.rb +47 -4
  14. data/app/models/raif/task.rb +22 -9
  15. data/app/views/layouts/raif/admin.html.erb +3 -1
  16. data/app/views/raif/conversation_entries/_form.html.erb +1 -1
  17. data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
  18. data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
  19. data/config/locales/en.yml +8 -0
  20. data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
  21. data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
  22. data/exe/raif +7 -0
  23. data/lib/generators/raif/agent/agent_generator.rb +22 -7
  24. data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
  25. data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
  26. data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
  27. data/lib/generators/raif/base_generator.rb +19 -0
  28. data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
  29. data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
  30. data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
  31. data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
  32. data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
  33. data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
  34. data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
  35. data/lib/generators/raif/install/install_generator.rb +15 -0
  36. data/lib/generators/raif/install/templates/initializer.rb +14 -3
  37. data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -2
  38. data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -76
  39. data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +10 -0
  40. data/lib/generators/raif/task/task_generator.rb +22 -3
  41. data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
  42. data/lib/generators/raif/task/templates/task.rb.tt +55 -59
  43. data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
  44. data/lib/raif/cli/base.rb +39 -0
  45. data/lib/raif/cli/evals.rb +47 -0
  46. data/lib/raif/cli/evals_setup.rb +27 -0
  47. data/lib/raif/cli.rb +67 -0
  48. data/lib/raif/configuration.rb +23 -9
  49. data/lib/raif/engine.rb +2 -1
  50. data/lib/raif/evals/eval.rb +30 -0
  51. data/lib/raif/evals/eval_set.rb +111 -0
  52. data/lib/raif/evals/eval_sets/expectations.rb +53 -0
  53. data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
  54. data/lib/raif/evals/expectation_result.rb +39 -0
  55. data/lib/raif/evals/llm_judge.rb +32 -0
  56. data/lib/raif/evals/llm_judges/binary.rb +94 -0
  57. data/lib/raif/evals/llm_judges/comparative.rb +89 -0
  58. data/lib/raif/evals/llm_judges/scored.rb +63 -0
  59. data/lib/raif/evals/llm_judges/summarization.rb +166 -0
  60. data/lib/raif/evals/run.rb +201 -0
  61. data/lib/raif/evals/scoring_rubric.rb +174 -0
  62. data/lib/raif/evals.rb +26 -0
  63. data/lib/raif/llm_registry.rb +33 -0
  64. data/lib/raif/migration_checker.rb +3 -3
  65. data/lib/raif/utils/colors.rb +23 -0
  66. data/lib/raif/utils.rb +1 -0
  67. data/lib/raif/version.rb +1 -1
  68. data/lib/raif.rb +4 -0
  69. data/spec/support/current_temperature_test_tool.rb +34 -0
  70. data/spec/support/test_conversation.rb +1 -1
  71. metadata +37 -3
data/lib/raif/evals/llm_judges/binary.rb
@@ -0,0 +1,94 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Binary < Raif::Evals::LlmJudge
+         task_run_arg :criteria
+         task_run_arg :examples
+         task_run_arg :strict_mode
+
+         json_response_schema do
+           boolean :passes, description: "Whether the content passes the criteria"
+           string :reasoning, description: "Detailed explanation of the judgment"
+           number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an expert evaluator assessing whether content meets specific criteria.
+             Your task is to make binary pass/fail judgments with clear reasoning.
+
+             First, provide detailed reasoning/explanation of your evaluation. Then, provide a precise pass/fail judgment.
+
+             Respond with JSON matching this schema:
+             {
+               "passes": boolean,
+               "reasoning": "detailed explanation",
+               "confidence": 0.0-1.0
+             }
+           PROMPT
+         end
+
+         def build_prompt
+           prompt = <<~PROMPT
+             Evaluation criteria: #{criteria}
+
+             #{strict_mode ? "Apply the criteria strictly without any leniency." : "Apply reasonable judgment while adhering to the criteria."}
+           PROMPT
+
+           if examples.present?
+             prompt += "\nHere are examples of how to evaluate:"
+             examples.each do |example|
+               prompt += format_example(example)
+             end
+           end
+
+           prompt += additional_context_prompt if additional_context.present?
+
+           prompt += <<~PROMPT.rstrip
+
+             Now evaluate this content:
+             #{content_to_judge}
+
+             Does this content meet the evaluation criteria?
+           PROMPT
+
+           prompt
+         end
+
+         # Judgment accessor methods
+         def passes?
+           parsed_response["passes"] if completed?
+         end
+
+         private
+
+         def additional_context_prompt
+           <<~PROMPT
+
+             Additional context:
+             #{additional_context}
+           PROMPT
+         end
+
+         def format_example(example)
+           if example.key?(:output)
+             content_label = "Output"
+             content_value = example[:output]
+           else
+             content_label = "Content"
+             content_value = example[:content]
+           end
+
+           <<~EXAMPLE
+
+             #{content_label}: #{content_value}
+             Reasoning: #{example[:reasoning]}
+             Judgment: #{example[:passes] ? "PASS" : "FAIL"}
+           EXAMPLE
+         end
+       end
+     end
+   end
+ end
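
The Binary judge returns a pass/fail verdict for a single piece of content against stated criteria, optionally guided by few-shot examples and a strict mode. A minimal usage sketch, assuming the judge is run through Raif's task interface with the declared task_run_args passed as keyword arguments; the content_to_judge argument comes from the parent Raif::Evals::LlmJudge class, which is not shown in this hunk, so treat the call style as illustrative rather than exact.

  # Hypothetical sketch -- the invocation style is assumed, not confirmed by this diff.
  judge = Raif::Evals::LlmJudges::Binary.run(
    content_to_judge: generated_answer, # inherited judge argument (assumed)
    criteria: "The answer cites at least one source",
    strict_mode: true,
    examples: [
      { content: "See RFC 2616, section 9.", reasoning: "Cites a concrete source.", passes: true },
      { content: "Trust me, it works.", reasoning: "No source is cited.", passes: false }
    ]
  )

  judge.passes? # => true/false, read from the parsed JSON response once the task has completed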
data/lib/raif/evals/llm_judges/comparative.rb
@@ -0,0 +1,89 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Comparative < Raif::Evals::LlmJudge
+         task_run_arg :over_content # the content to compare against
+         task_run_arg :comparison_criteria # the criteria to use when comparing content_to_judge to over_content
+         task_run_arg :allow_ties # whether to allow ties in the comparison
+
+         attr_accessor :content_a, :content_b, :expected_winner
+
+         before_create do
+           self.expected_winner = ["A", "B"].sample
+
+           if expected_winner == "A"
+             self.content_a = content_to_judge
+             self.content_b = over_content
+           else
+             self.content_a = over_content
+             self.content_b = content_to_judge
+           end
+         end
+
+         json_response_schema do
+           string :winner, description: "Which content is better (A, B, or tie)", enum: ["A", "B", "tie"]
+           string :reasoning, description: "Detailed explanation of the judgment"
+           number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an expert evaluator comparing two pieces of content to determine which better meets specified criteria.
+
+             #{allow_ties ? "You may declare a tie if both pieces of content are equally good." : "You must choose a winner even if the difference is minimal."}
+
+             First, provide detailed reasoning for your choice. Then, provide a precise winner #{allow_ties ? "(A, B, or tie)" : "(A or B)"}.
+
+             Respond with JSON matching the required schema.
+           PROMPT
+         end
+
+         def build_prompt
+           <<~PROMPT.strip
+             Comparison criteria: #{comparison_criteria}
+             #{additional_context_prompt}
+             Compare the following two pieces of content:
+
+             CONTENT A:
+             #{content_a}
+
+             CONTENT B:
+             #{content_b}
+
+             Which content better meets the comparison criteria?
+           PROMPT
+         end
+
+         def winner
+           parsed_response["winner"] if completed?
+         end
+
+         def tie?
+           return unless completed? # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+
+           parsed_response["winner"] == "tie"
+         end
+
+         def correct_expected_winner?
+           return unless completed? # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+
+           parsed_response["winner"] == expected_winner
+         end
+
+         private
+
+         def additional_context_prompt
+           return if additional_context.blank?
+
+           <<~PROMPT
+
+             Additional context:
+             #{additional_context}
+           PROMPT
+         end
+       end
+     end
+   end
+ end
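
The Comparative judge picks a winner between content_to_judge and over_content; the before_create hook randomizes which one is presented as CONTENT A to reduce position bias, and correct_expected_winner? reports whether the judge preferred content_to_judge regardless of its position. A sketch under the same assumed invocation style as above:

  # Hypothetical sketch -- invocation style assumed, not confirmed by this diff.
  judge = Raif::Evals::LlmJudges::Comparative.run(
    content_to_judge: new_prompt_output, # inherited judge argument (assumed)
    over_content: baseline_output,
    comparison_criteria: "More directly answers the user's question",
    allow_ties: false
  )

  judge.winner                   # => "A" or "B" (or "tie" when allow_ties is true)
  judge.correct_expected_winner? # => true when the judge preferred content_to_judge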
data/lib/raif/evals/llm_judges/scored.rb
@@ -0,0 +1,63 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Scored < Raif::Evals::LlmJudge
+         task_run_arg :scoring_rubric # the scoring rubric to use when evaluating the content
+
+         json_response_schema do
+           number :score, description: "Numerical score based on the rubric"
+           string :reasoning, description: "Detailed explanation of the score"
+           number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an expert evaluator providing numerical scores based on a detailed rubric.
+
+             First, provide detailed reasoning/explanation of your evaluation. Then, provide a precise score according to the provided rubric.
+
+             Respond with JSON matching this schema:
+             {
+               "score": number,
+               "reasoning": "detailed explanation",
+               "confidence": 0.0-1.0
+             }
+           PROMPT
+         end
+
+         def build_prompt
+           <<~PROMPT.strip
+             Scoring rubric:
+             #{format_rubric(scoring_rubric)}
+             #{additional_context_prompt}
+             Evaluate the following content according to the scoring rubric:
+             #{content_to_judge}
+
+             Provide your score and detailed reasoning.
+           PROMPT
+         end
+
+         def judgment_score
+           parsed_response["score"] if completed?
+         end
+
+         private
+
+         def additional_context_prompt
+           return if additional_context.blank?
+
+           <<~PROMPT
+             \nAdditional context:
+             #{additional_context}
+           PROMPT
+         end
+
+         def format_rubric(rubric)
+           rubric.is_a?(ScoringRubric) ? rubric.to_prompt : rubric.to_s
+         end
+       end
+     end
+   end
+ end
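
The Scored judge assigns a numeric score against a rubric; format_rubric accepts either a plain string or a ScoringRubric object (added in data/lib/raif/evals/scoring_rubric.rb, file 61 above) and calls to_prompt on the latter. A sketch using a plain-string rubric, with the invocation style assumed as before:

  # Hypothetical sketch -- invocation style assumed, not confirmed by this diff.
  judge = Raif::Evals::LlmJudges::Scored.run(
    content_to_judge: draft_release_notes, # inherited judge argument (assumed)
    scoring_rubric: <<~RUBRIC
      5 - Complete, accurate, and well organized
      3 - Covers the main changes but misses important details
      1 - Inaccurate or missing most changes
    RUBRIC
  )

  judge.judgment_score # => numeric score parsed from the JSON response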
data/lib/raif/evals/llm_judges/summarization.rb
@@ -0,0 +1,166 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Summarization < Raif::Evals::LlmJudge
+         task_run_arg :original_content # the original content to evaluate the summary against
+         task_run_arg :summary # the summary to evaluate against the original content
+
+         json_response_schema do
+           object :coverage do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :accuracy do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :clarity do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :conciseness do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+
+           object :overall do
+             string :justification, description: "Justification for the score"
+             number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
+           end
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an impartial expert judge of summary quality. You'll be provided a original piece of content and its summary. Your job is to evaluate the summary against the original content based on the following criteria, and assign a score from 1 to 5 for each (5 = excellent, 1 = very poor):
+
+             **Coverage (Relevance & Completeness):** Does the summary capture all the important points of the original content?
+             - 5 = Excellent Coverage - Nearly all key points and essential details from the content are present in the summary, with no major omissions.
+             - 4 = Good Coverage - Most important points are included, but a minor detail or two might be missing.
+             - 3 = Fair Coverage - Some main points appear, but the summary misses or glosses over other important information.
+             - 2 = Poor Coverage - Many critical points from the content are missing; the summary is incomplete.
+             - 1 = Very Poor - The summary fails to include most of the content's main points (highly incomplete).
+
+             **Accuracy (Faithfulness to the Source):** Is the summary factually correct and free of hallucinations or misrepresentations of the content?
+             - 5 = Fully Accurate - All statements in the summary are correct and directly supported by the content. No errors or invented information.
+             - 4 = Mostly Accurate - The summary is generally accurate with perhaps one minor error or slight ambiguity, but no significant falsehoods.
+             - 3 = Some Inaccuracies - Contains a few errors or unsupported claims from the content, but overall captures the gist correctly.
+             - 2 = Mostly Inaccurate - Multiple statements in the summary are incorrect or not supported by the content.
+             - 1 = Completely Inaccurate - The summary seriously distorts or contradicts the content; many claims are false or not in the source.
+
+             **Clarity and Coherence:** Is the summary well-written and easy to understand? (Consider organization, flow, and whether it would make sense to a reader.)
+             - 5 = Very Clear & Coherent - The summary is logically organized, flows well, and would be easily understood by the target reader. No confusion or ambiguity.
+             - 4 = Mostly Clear - Readable and mostly well-structured, though a sentence or transition could be smoother.
+             - 3 = Somewhat Clear - The summary makes sense overall but might be disjointed or awkward in places, requiring effort to follow.
+             - 2 = Generally Unclear - Lacks coherence or has poor phrasing that makes it hard to follow the ideas.
+             - 1 = Very Poor Clarity - The summary is very confusing or poorly structured, making it hard to understand.
+
+             **Conciseness:** Is the summary succinct while still informative? (It should omit unnecessary detail but not at the expense of coverage.)
+             - 5 = Highly Concise - The summary is brief yet covers all important information (no fluff or redundancy).
+             - 4 = Concise - Generally to-the-point, with only minor redundancy or superfluous content.
+             - 3 = Moderately Concise - Some excess detail or repetition that could be trimmed, but not egregious.
+             - 2 = Verbose - Contains a lot of unnecessary detail or repeats points, making it longer than needed.
+             - 1 = Excessively Verbose - The summary is overly long or wordy, with much content that doesn't add value.
+           PROMPT
+         end
+
+         def build_prompt
+           <<~PROMPT.strip
+             # Instructions
+             Below is an original piece of content and its summary. Evaluate the summary against the original content based on our 4 criteria. For each, you should provide:
+             - A brief justification (1-3 sentences) noting any relevant observations (e.g. what was missing, incorrect, unclear, or well-done).
+             - A score from 1 to 5 (5 = excellent, 1 = very poor).
+
+             Finally, provide an **overall evaluation** of the summary, consisting of a brief justification (1-3 sentences) and a score from 1 to 5 (5 = excellent, 1 = very poor).
+
+             # Output Format
+             Format your output as a JSON object with the following keys:
+             {
+               "coverage": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "accuracy": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "clarity": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "conciseness": {
+                 "justification": "...",
+                 "score": 1-5
+               },
+               "overall": {
+                 "justification": "...",
+                 "score": 1-5
+               }
+             }
+             #{additional_context_prompt}
+             # Original Article/Document
+             #{original_content}
+
+             # Summary to Evaluate
+             #{summary}
+           PROMPT
+         end
+
+         def overall_score
+           parsed_response["overall"]["score"] if completed?
+         end
+
+         def overall_justification
+           parsed_response["overall"]["justification"] if completed?
+         end
+
+         def coverage_score
+           parsed_response["coverage"]["score"] if completed?
+         end
+
+         def coverage_justification
+           parsed_response["coverage"]["justification"] if completed?
+         end
+
+         def accuracy_score
+           parsed_response["accuracy"]["score"] if completed?
+         end
+
+         def accuracy_justification
+           parsed_response["accuracy"]["justification"] if completed?
+         end
+
+         def clarity_score
+           parsed_response["clarity"]["score"] if completed?
+         end
+
+         def clarity_justification
+           parsed_response["clarity"]["justification"] if completed?
+         end
+
+         def conciseness_score
+           parsed_response["conciseness"]["score"] if completed?
+         end
+
+         def conciseness_justification
+           parsed_response["conciseness"]["justification"] if completed?
+         end
+
+         private
+
+         def additional_context_prompt
+           return if additional_context.blank?
+
+           <<~PROMPT
+             \n# Additional context:
+             #{additional_context}
+           PROMPT
+         end
+       end
+     end
+   end
+ end
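
The Summarization judge grades a summary against its source on coverage, accuracy, clarity, and conciseness, plus an overall score, each on a 1-5 scale with a written justification exposed through the accessor methods above. A sketch under the same assumed invocation style; the input file name is illustrative only:

  # Hypothetical sketch -- invocation style assumed, not confirmed by this diff.
  judge = Raif::Evals::LlmJudges::Summarization.run(
    original_content: File.read("article.txt"), # illustrative input
    summary: candidate_summary
  )

  judge.overall_score          # => 1..5
  judge.coverage_justification # => explanation for the coverage score
  judge.accuracy_score         # => 1..5, and likewise for clarity and conciseness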
data/lib/raif/evals/run.rb
@@ -0,0 +1,201 @@
+ # frozen_string_literal: true
+
+ require "fileutils"
+ require "json"
+
+ module Raif
+   module Evals
+     class Run
+       attr_reader :eval_sets, :results, :output
+
+       def initialize(file_paths: nil, output: $stdout)
+         @output = output
+         @results = {}
+
+         @eval_sets = if file_paths&.any?
+           load_eval_sets_from_files(file_paths)
+         else
+           discover_eval_sets
+         end
+       end
+
+       def execute
+         # Load setup file if it exists
+         setup_file = Rails.root.join("raif_evals", "setup.rb")
+         if File.exist?(setup_file)
+           require setup_file
+         else
+           output.puts Raif::Utils::Colors.red("\n\nNo setup file found. To set up Raif evals, run:\n")
+           output.puts Raif::Utils::Colors.red("bundle exec raif evals:setup\n")
+           exit 1
+         end
+
+         output.puts "\nStarting Raif Eval Run"
+         output.puts ""
+         output.puts "Raif.config.default_llm_model_key: #{Raif.config.default_llm_model_key}"
+         output.puts ""
+         output.puts "=" * 50
+
+         @eval_sets.each do |eval_set_entry|
+           eval_set_class, file_path, line_number = if eval_set_entry.is_a?(Hash)
+             [eval_set_entry[:class], eval_set_entry[:file_path], eval_set_entry[:line_number]]
+           else
+             [eval_set_entry, nil, nil]
+           end
+
+           if line_number
+             # Running specific eval by line number
+             output.puts "\nRunning #{eval_set_class.name} at line #{line_number}"
+             output.puts "-" * 50
+
+             eval_results = run_eval_at_line(eval_set_class, file_path, line_number)
+           else
+             # Running all evals in the set
+             output.puts "\nRunning #{eval_set_class.name}"
+             output.puts "-" * 50
+
+             eval_results = eval_set_class.run(output: output)
+           end
+
+           @results[eval_set_class.name] = eval_results.map(&:to_h)
+           passed_count = eval_results.count(&:passed?)
+           total_count = eval_results.count
+
+           output.puts "-" * 50
+           output.puts "#{eval_set_class.name}: #{passed_count}/#{total_count} evals passed"
+         end
+
+         export_results
+         print_summary
+       end
+
+       private
+
+       def load_eval_sets_from_files(file_paths)
+         eval_sets = []
+
+         file_paths.each do |f|
+           file_path = f[:file_path]
+           line_number = f[:line_number]
+
+           # Convert relative path to absolute
+           absolute_path = File.expand_path(file_path)
+
+           unless File.exist?(absolute_path)
+             output.puts Raif::Utils::Colors.red("Error: File not found: #{file_path}")
+             exit 1
+           end
+
+           subclasses_before = Raif::Evals::EvalSet.subclasses
+
+           require absolute_path
+
+           loaded_eval_sets = Raif::Evals::EvalSet.subclasses - subclasses_before
+           eval_set_class = loaded_eval_sets.first
+
+           eval_set_entry = { class: eval_set_class, file_path: absolute_path }
+           eval_set_entry[:line_number] = line_number if line_number
+
+           eval_sets << eval_set_entry
+         end
+
+         eval_sets
+       end
+
+       def run_eval_at_line(eval_set_class, file_path, line_number)
+         target_eval = eval_set_class.evals.find{|e| e[:definition_line_number] == line_number }
+
+         if target_eval.nil?
+           output.puts Raif::Utils::Colors.red("Error: No eval block found at line #{line_number}")
+           return []
+         end
+
+         instance = eval_set_class.new(output: output)
+         [instance.run_eval(target_eval)]
+       end
+
+       def discover_eval_sets
+         eval_sets_dir = Rails.root.join("raif_evals", "eval_sets")
+         return [] unless eval_sets_dir.exist?
+
+         Dir.glob(eval_sets_dir.join("**", "*_eval_set.rb")).map do |file|
+           relative_path = Pathname.new(file).relative_path_from(Rails.root)
+           require Rails.root.join(relative_path)
+
+           # Extract the path components after raif_evals/eval_sets
+           path_from_eval_sets = Pathname.new(file).relative_path_from(eval_sets_dir)
+           path_parts = path_from_eval_sets.dirname.to_s.split("/")
+
+           # Remove "." if it's the only element (meaning file is in eval_sets root)
+           path_parts = [] if path_parts == ["."]
+
+           # Build the full class name
+           class_name = File.basename(file, ".rb").camelize
+           namespace_parts = ["Raif", "Evals"] + path_parts.map(&:camelize)
+           full_class_name = (namespace_parts + [class_name]).join("::")
+
+           full_class_name.constantize
+         end.select { |klass| klass < Raif::Evals::EvalSet }
+       end
+
+       def export_results
+         results_dir = Rails.root.join("raif_evals", "results")
+         FileUtils.mkdir_p(results_dir)
+
+         timestamp = Time.current.strftime("%Y%m%d_%H%M%S")
+         filename = results_dir.join("eval_run_#{timestamp}.json")
+
+         File.write(filename, JSON.pretty_generate({
+           run_at: Time.current.iso8601,
+           results: @results,
+           summary: summary_data
+         }))
+
+         output.puts "\nResults exported to: #{filename}"
+       end
+
+       def summary_data
+         total_eval_sets = @results.count
+         total_evals = @results.values.sum(&:count)
+         passed_evals = @results.values.sum { |evals| evals.count { |e| e[:passed] } }
+
+         total_expectations = @results.values.sum do |evals|
+           evals.sum { |e| e[:expectation_results].count }
+         end
+
+         passed_expectations = @results.values.sum do |evals|
+           evals.sum { |e| e[:expectation_results].count { |r| r[:status] == :passed } }
+         end
+
+         {
+           total_eval_sets: total_eval_sets,
+           total_evals: total_evals,
+           passed_evals: passed_evals,
+           total_expectations: total_expectations,
+           passed_expectations: passed_expectations
+         }
+       end
+
+       def print_summary
+         data = summary_data
+
+         output.puts ""
+         output.puts "\n" + "=" * 50
+         output.puts "SUMMARY"
+         output.puts "=" * 50
+         output.puts "Eval Sets: #{data[:total_eval_sets]}"
+         output.puts ""
+         output.puts "Evals:"
+         output.puts " #{data[:total_evals]} total"
+         output.puts Raif::Utils::Colors.green(" #{data[:passed_evals]} passed")
+         output.puts Raif::Utils::Colors.red(" #{data[:total_evals] - data[:passed_evals]} failed")
+         output.puts ""
+         output.puts "Expectations:"
+         output.puts " #{data[:total_expectations]} total"
+         output.puts Raif::Utils::Colors.green(" #{data[:passed_expectations]} passed")
+         output.puts Raif::Utils::Colors.red(" #{data[:total_expectations] - data[:passed_expectations]} failed")
+         output.puts ""
+       end
+     end
+   end
+ end
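
Run loads raif_evals/setup.rb, runs either the requested eval set files (optionally a single eval selected by its definition line number) or everything discovered under raif_evals/eval_sets, and writes a timestamped JSON report to raif_evals/results. It is normally driven by the new raif executable and CLI classes (files 22 and 44-47 above), but the class shown here can also be used directly; the eval set file name below is illustrative only:

  # Direct use of the runner shown above. file_paths entries are hashes as
  # consumed by load_eval_sets_from_files; line_number is optional.
  Raif::Evals::Run.new(
    file_paths: [
      { file_path: "raif_evals/eval_sets/document_summary_eval_set.rb", line_number: 12 }
    ]
  ).execute

  # With no file_paths, eval sets are auto-discovered under raif_evals/eval_sets
  # and results land in raif_evals/results/eval_run_<timestamp>.json.
  Raif::Evals::Run.new.execute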