raif 1.2.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +6 -5
  3. data/app/assets/builds/raif.css +4 -1
  4. data/app/assets/builds/raif_admin.css +13 -1
  5. data/app/assets/javascript/raif/controllers/conversations_controller.js +1 -1
  6. data/app/assets/stylesheets/raif/admin/conversation.scss +16 -0
  7. data/app/assets/stylesheets/raif/conversations.scss +3 -0
  8. data/app/assets/stylesheets/raif.scss +2 -1
  9. data/app/controllers/raif/admin/application_controller.rb +16 -0
  10. data/app/controllers/raif/admin/configs_controller.rb +94 -0
  11. data/app/controllers/raif/admin/model_completions_controller.rb +18 -1
  12. data/app/controllers/raif/admin/model_tool_invocations_controller.rb +7 -1
  13. data/app/controllers/raif/admin/stats/model_tool_invocations_controller.rb +21 -0
  14. data/app/controllers/raif/admin/stats/tasks_controller.rb +15 -6
  15. data/app/controllers/raif/admin/stats_controller.rb +32 -3
  16. data/app/controllers/raif/conversation_entries_controller.rb +1 -0
  17. data/app/controllers/raif/conversations_controller.rb +10 -2
  18. data/app/jobs/raif/conversation_entry_job.rb +8 -6
  19. data/app/models/raif/admin/task_stat.rb +7 -0
  20. data/app/models/raif/agent.rb +63 -2
  21. data/app/models/raif/agents/native_tool_calling_agent.rb +101 -56
  22. data/app/models/raif/application_record.rb +18 -0
  23. data/app/models/raif/concerns/agent_inference_stats.rb +35 -0
  24. data/app/models/raif/concerns/has_llm.rb +1 -1
  25. data/app/models/raif/concerns/json_schema_definition.rb +40 -5
  26. data/app/models/raif/concerns/llms/anthropic/message_formatting.rb +28 -0
  27. data/app/models/raif/concerns/llms/anthropic/response_tool_calls.rb +24 -0
  28. data/app/models/raif/concerns/llms/anthropic/tool_formatting.rb +4 -0
  29. data/app/models/raif/concerns/llms/bedrock/message_formatting.rb +36 -0
  30. data/app/models/raif/concerns/llms/bedrock/response_tool_calls.rb +26 -0
  31. data/app/models/raif/concerns/llms/bedrock/tool_formatting.rb +4 -0
  32. data/app/models/raif/concerns/llms/google/message_formatting.rb +109 -0
  33. data/app/models/raif/concerns/llms/google/response_tool_calls.rb +32 -0
  34. data/app/models/raif/concerns/llms/google/tool_formatting.rb +72 -0
  35. data/app/models/raif/concerns/llms/message_formatting.rb +11 -5
  36. data/app/models/raif/concerns/llms/open_ai/json_schema_validation.rb +3 -3
  37. data/app/models/raif/concerns/llms/open_ai_completions/message_formatting.rb +22 -0
  38. data/app/models/raif/concerns/llms/open_ai_completions/response_tool_calls.rb +22 -0
  39. data/app/models/raif/concerns/llms/open_ai_completions/tool_formatting.rb +4 -0
  40. data/app/models/raif/concerns/llms/open_ai_responses/message_formatting.rb +17 -0
  41. data/app/models/raif/concerns/llms/open_ai_responses/response_tool_calls.rb +26 -0
  42. data/app/models/raif/concerns/llms/open_ai_responses/tool_formatting.rb +4 -0
  43. data/app/models/raif/concerns/run_with.rb +127 -0
  44. data/app/models/raif/conversation.rb +96 -9
  45. data/app/models/raif/conversation_entry.rb +37 -8
  46. data/app/models/raif/embedding_model.rb +2 -1
  47. data/app/models/raif/embedding_models/open_ai.rb +1 -1
  48. data/app/models/raif/llm.rb +28 -3
  49. data/app/models/raif/llms/anthropic.rb +7 -19
  50. data/app/models/raif/llms/bedrock.rb +6 -20
  51. data/app/models/raif/llms/google.rb +140 -0
  52. data/app/models/raif/llms/open_ai_base.rb +19 -5
  53. data/app/models/raif/llms/open_ai_completions.rb +6 -11
  54. data/app/models/raif/llms/open_ai_responses.rb +6 -16
  55. data/app/models/raif/llms/open_router.rb +10 -14
  56. data/app/models/raif/model_completion.rb +61 -0
  57. data/app/models/raif/model_tool.rb +10 -2
  58. data/app/models/raif/model_tool_invocation.rb +38 -6
  59. data/app/models/raif/model_tools/agent_final_answer.rb +2 -7
  60. data/app/models/raif/model_tools/provider_managed/code_execution.rb +4 -0
  61. data/app/models/raif/model_tools/provider_managed/image_generation.rb +4 -0
  62. data/app/models/raif/model_tools/provider_managed/web_search.rb +4 -0
  63. data/app/models/raif/streaming_responses/google.rb +71 -0
  64. data/app/models/raif/task.rb +74 -18
  65. data/app/models/raif/user_tool_invocation.rb +19 -0
  66. data/app/views/layouts/raif/admin.html.erb +12 -1
  67. data/app/views/raif/admin/agents/_agent.html.erb +8 -0
  68. data/app/views/raif/admin/agents/_conversation_message.html.erb +28 -6
  69. data/app/views/raif/admin/agents/index.html.erb +2 -0
  70. data/app/views/raif/admin/agents/show.html.erb +46 -1
  71. data/app/views/raif/admin/configs/show.html.erb +117 -0
  72. data/app/views/raif/admin/conversations/_conversation_entry.html.erb +29 -34
  73. data/app/views/raif/admin/conversations/show.html.erb +2 -0
  74. data/app/views/raif/admin/model_completions/_model_completion.html.erb +9 -0
  75. data/app/views/raif/admin/model_completions/index.html.erb +26 -0
  76. data/app/views/raif/admin/model_completions/show.html.erb +124 -61
  77. data/app/views/raif/admin/model_tool_invocations/index.html.erb +22 -1
  78. data/app/views/raif/admin/model_tools/_list.html.erb +16 -0
  79. data/app/views/raif/admin/model_tools/_model_tool.html.erb +36 -0
  80. data/app/views/raif/admin/stats/_stats_tile.html.erb +34 -0
  81. data/app/views/raif/admin/stats/index.html.erb +71 -88
  82. data/app/views/raif/admin/stats/model_tool_invocations/index.html.erb +43 -0
  83. data/app/views/raif/admin/stats/tasks/index.html.erb +20 -6
  84. data/app/views/raif/admin/tasks/index.html.erb +6 -1
  85. data/app/views/raif/admin/tasks/show.html.erb +36 -3
  86. data/app/views/raif/conversation_entries/_form.html.erb +4 -1
  87. data/app/views/raif/conversations/_conversation.html.erb +10 -0
  88. data/app/views/raif/conversations/_entry_processed.turbo_stream.erb +12 -0
  89. data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
  90. data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
  91. data/app/views/raif/conversations/index.html.erb +23 -0
  92. data/config/locales/admin.en.yml +33 -1
  93. data/config/locales/en.yml +41 -4
  94. data/config/routes.rb +2 -0
  95. data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
  96. data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
  97. data/db/migrate/20250904194456_add_generating_entry_response_to_raif_conversations.rb +7 -0
  98. data/db/migrate/20250911125234_add_source_to_raif_tasks.rb +7 -0
  99. data/db/migrate/20251020005853_add_source_to_raif_agents.rb +7 -0
  100. data/db/migrate/20251020011346_rename_task_run_args_to_run_with.rb +7 -0
  101. data/db/migrate/20251020011405_add_run_with_to_raif_agents.rb +13 -0
  102. data/db/migrate/20251024160119_add_llm_messages_max_length_to_raif_conversations.rb +14 -0
  103. data/db/migrate/20251124185033_add_provider_tool_call_id_to_raif_model_tool_invocations.rb +7 -0
  104. data/db/migrate/20251128202941_add_tool_choice_to_raif_model_completions.rb +7 -0
  105. data/db/migrate/20260118144846_add_source_to_raif_conversations.rb +7 -0
  106. data/db/migrate/20260119000000_add_failure_tracking_to_raif_model_completions.rb +10 -0
  107. data/db/migrate/20260119000001_add_completed_at_to_raif_model_completions.rb +8 -0
  108. data/db/migrate/20260119000002_add_started_at_to_raif_model_completions.rb +8 -0
  109. data/exe/raif +7 -0
  110. data/lib/generators/raif/agent/agent_generator.rb +22 -7
  111. data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
  112. data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
  113. data/lib/generators/raif/agent/templates/application_agent.rb.tt +1 -3
  114. data/lib/generators/raif/base_generator.rb +19 -0
  115. data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
  116. data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
  117. data/lib/generators/raif/conversation/templates/conversation.rb.tt +34 -32
  118. data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
  119. data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
  120. data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
  121. data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
  122. data/lib/generators/raif/install/install_generator.rb +15 -0
  123. data/lib/generators/raif/install/templates/initializer.rb +89 -10
  124. data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -5
  125. data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -78
  126. data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +1 -1
  127. data/lib/generators/raif/task/task_generator.rb +22 -3
  128. data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
  129. data/lib/generators/raif/task/templates/task.rb.tt +55 -59
  130. data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
  131. data/lib/raif/cli/base.rb +39 -0
  132. data/lib/raif/cli/evals.rb +47 -0
  133. data/lib/raif/cli/evals_setup.rb +27 -0
  134. data/lib/raif/cli.rb +67 -0
  135. data/lib/raif/configuration.rb +57 -8
  136. data/lib/raif/engine.rb +8 -0
  137. data/lib/raif/errors/instance_dependent_schema_error.rb +8 -0
  138. data/lib/raif/errors/streaming_error.rb +6 -3
  139. data/lib/raif/errors.rb +1 -0
  140. data/lib/raif/evals/eval.rb +30 -0
  141. data/lib/raif/evals/eval_set.rb +111 -0
  142. data/lib/raif/evals/eval_sets/expectations.rb +53 -0
  143. data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
  144. data/lib/raif/evals/expectation_result.rb +39 -0
  145. data/lib/raif/evals/llm_judge.rb +32 -0
  146. data/lib/raif/evals/llm_judges/binary.rb +94 -0
  147. data/lib/raif/evals/llm_judges/comparative.rb +89 -0
  148. data/lib/raif/evals/llm_judges/scored.rb +63 -0
  149. data/lib/raif/evals/llm_judges/summarization.rb +166 -0
  150. data/lib/raif/evals/run.rb +202 -0
  151. data/lib/raif/evals/scoring_rubric.rb +174 -0
  152. data/lib/raif/evals.rb +26 -0
  153. data/lib/raif/json_schema_builder.rb +14 -0
  154. data/lib/raif/llm_registry.rb +218 -15
  155. data/lib/raif/messages.rb +180 -0
  156. data/lib/raif/migration_checker.rb +3 -3
  157. data/lib/raif/utils/colors.rb +23 -0
  158. data/lib/raif/utils.rb +1 -0
  159. data/lib/raif/version.rb +1 -1
  160. data/lib/raif.rb +13 -0
  161. data/lib/tasks/annotate_rb.rake +10 -0
  162. data/spec/support/current_temperature_test_tool.rb +34 -0
  163. data/spec/support/rspec_helpers.rb +8 -8
  164. data/spec/support/test_conversation.rb +1 -1
  165. metadata +77 -10
  166. data/app/models/raif/agents/re_act_agent.rb +0 -127
  167. data/app/models/raif/agents/re_act_step.rb +0 -33
@@ -0,0 +1,166 @@
1
# frozen_string_literal: true

module Raif
  module Evals
    module LlmJudges
      # LLM judge that evaluates a summary against its original content on four
      # criteria (coverage, accuracy, clarity, conciseness) plus an overall
      # assessment. Each criterion receives a 1-5 score and a written
      # justification in the model's JSON response.
      class Summarization < Raif::Evals::LlmJudge
        run_with :original_content # the original content to evaluate the summary against
        run_with :summary # the summary to evaluate against the original content

        # The evaluation criteria. Each appears as a key in the JSON response
        # schema and gets generated *_score / *_justification reader methods.
        EVALUATION_CRITERIA = %i[coverage accuracy clarity conciseness overall].freeze

        json_response_schema do
          # One identically-shaped object per criterion (justification + 1-5 score)
          EVALUATION_CRITERIA.each do |criterion|
            object criterion do
              string :justification, description: "Justification for the score"
              number :score, description: "Score from 1 to 5", enum: [1, 2, 3, 4, 5]
            end
          end
        end

        # System prompt establishing the judge persona and the 4 scoring
        # criteria with their 1-5 level definitions.
        def build_system_prompt
          <<~PROMPT.strip
            You are an impartial expert judge of summary quality. You'll be provided an original piece of content and its summary. Your job is to evaluate the summary against the original content based on the following criteria, and assign a score from 1 to 5 for each (5 = excellent, 1 = very poor):

            **Coverage (Relevance & Completeness):** Does the summary capture all the important points of the original content?
            - 5 = Excellent Coverage - Nearly all key points and essential details from the content are present in the summary, with no major omissions.
            - 4 = Good Coverage - Most important points are included, but a minor detail or two might be missing.
            - 3 = Fair Coverage - Some main points appear, but the summary misses or glosses over other important information.
            - 2 = Poor Coverage - Many critical points from the content are missing; the summary is incomplete.
            - 1 = Very Poor - The summary fails to include most of the content's main points (highly incomplete).

            **Accuracy (Faithfulness to the Source):** Is the summary factually correct and free of hallucinations or misrepresentations of the content?
            - 5 = Fully Accurate - All statements in the summary are correct and directly supported by the content. No errors or invented information.
            - 4 = Mostly Accurate - The summary is generally accurate with perhaps one minor error or slight ambiguity, but no significant falsehoods.
            - 3 = Some Inaccuracies - Contains a few errors or unsupported claims from the content, but overall captures the gist correctly.
            - 2 = Mostly Inaccurate - Multiple statements in the summary are incorrect or not supported by the content.
            - 1 = Completely Inaccurate - The summary seriously distorts or contradicts the content; many claims are false or not in the source.

            **Clarity and Coherence:** Is the summary well-written and easy to understand? (Consider organization, flow, and whether it would make sense to a reader.)
            - 5 = Very Clear & Coherent - The summary is logically organized, flows well, and would be easily understood by the target reader. No confusion or ambiguity.
            - 4 = Mostly Clear - Readable and mostly well-structured, though a sentence or transition could be smoother.
            - 3 = Somewhat Clear - The summary makes sense overall but might be disjointed or awkward in places, requiring effort to follow.
            - 2 = Generally Unclear - Lacks coherence or has poor phrasing that makes it hard to follow the ideas.
            - 1 = Very Poor Clarity - The summary is very confusing or poorly structured, making it hard to understand.

            **Conciseness:** Is the summary succinct while still informative? (It should omit unnecessary detail but not at the expense of coverage.)
            - 5 = Highly Concise - The summary is brief yet covers all important information (no fluff or redundancy).
            - 4 = Concise - Generally to-the-point, with only minor redundancy or superfluous content.
            - 3 = Moderately Concise - Some excess detail or repetition that could be trimmed, but not egregious.
            - 2 = Verbose - Contains a lot of unnecessary detail or repeats points, making it longer than needed.
            - 1 = Excessively Verbose - The summary is overly long or wordy, with much content that doesn't add value.
          PROMPT
        end

        # User prompt carrying the instructions, required JSON output format,
        # any additional context, the original content, and the summary.
        def build_prompt
          <<~PROMPT.strip
            # Instructions
            Below is an original piece of content and its summary. Evaluate the summary against the original content based on our 4 criteria. For each, you should provide:
            - A brief justification (1-3 sentences) noting any relevant observations (e.g. what was missing, incorrect, unclear, or well-done).
            - A score from 1 to 5 (5 = excellent, 1 = very poor).

            Finally, provide an **overall evaluation** of the summary, consisting of a brief justification (1-3 sentences) and a score from 1 to 5 (5 = excellent, 1 = very poor).

            # Output Format
            Format your output as a JSON object with the following keys:
            {
              "coverage": {
                "justification": "...",
                "score": 1-5
              },
              "accuracy": {
                "justification": "...",
                "score": 1-5
              },
              "clarity": {
                "justification": "...",
                "score": 1-5
              },
              "conciseness": {
                "justification": "...",
                "score": 1-5
              },
              "overall": {
                "justification": "...",
                "score": 1-5
              }
            }
            #{additional_context_prompt}
            # Original Article/Document
            #{original_content}

            # Summary to Evaluate
            #{summary}
          PROMPT
        end

        # Generates coverage_score, coverage_justification, accuracy_score, ...,
        # overall_score, overall_justification. Each returns the corresponding
        # value from the parsed JSON response, or nil until the judge has completed.
        EVALUATION_CRITERIA.each do |criterion|
          define_method(:"#{criterion}_score") do
            parsed_response[criterion.to_s]["score"] if completed?
          end

          define_method(:"#{criterion}_justification") do
            parsed_response[criterion.to_s]["justification"] if completed?
          end
        end

        private

        # Optional "# Additional context" section for the prompt; nil (so the
        # interpolation is blank) when no additional context was provided.
        def additional_context_prompt
          return if additional_context.blank?

          <<~PROMPT
            \n# Additional context:
            #{additional_context}
          PROMPT
        end
      end
    end
  end
end
@@ -0,0 +1,202 @@
1
# frozen_string_literal: true

require "fileutils"
require "json"

module Raif
  module Evals
    # Orchestrates an eval run: loads eval set classes (from explicit file
    # paths or by discovery under raif_evals/eval_sets), executes each set,
    # exports the collected results as JSON, and prints a pass/fail summary.
    class Run
      attr_reader :eval_sets, :results, :output

      # @param file_paths [Array<Hash>, nil] entries of the form
      #   { file_path: String, line_number: Integer (optional) }; when nil or
      #   empty, eval sets are auto-discovered under raif_evals/eval_sets
      # @param output [IO] stream used for all run output (defaults to $stdout)
      def initialize(file_paths: nil, output: $stdout)
        @output = output
        @results = {}

        @eval_sets = if file_paths&.any?
          load_eval_sets_from_files(file_paths)
        else
          discover_eval_sets
        end
      end

      # Runs every loaded eval set, records results keyed by class name,
      # writes them to a timestamped JSON file, and prints a summary.
      # Exits the process if the raif_evals/setup.rb bootstrap file is missing.
      def execute
        # Load setup file if it exists
        setup_file = Rails.root.join("raif_evals", "setup.rb")
        if File.exist?(setup_file)
          require setup_file
        else
          output.puts Raif::Utils::Colors.red("\n\nNo setup file found. To set up Raif evals, run:\n")
          output.puts Raif::Utils::Colors.red("bundle exec raif evals:setup\n")
          exit 1
        end

        output.puts "\nStarting Raif Eval Run"
        output.puts ""
        output.puts "Raif.config.default_llm_model_key: #{Raif.config.default_llm_model_key}"
        output.puts "Raif.config.evals_default_llm_judge_model_key: #{Raif.config.evals_default_llm_judge_model_key}"
        output.puts ""
        output.puts "=" * 50

        @eval_sets.each do |eval_set_entry|
          # Entries are either a bare class, or a hash carrying file/line info
          # when the run targets a specific file (and possibly a specific eval)
          eval_set_class, file_path, line_number = if eval_set_entry.is_a?(Hash)
            [eval_set_entry[:class], eval_set_entry[:file_path], eval_set_entry[:line_number]]
          else
            [eval_set_entry, nil, nil]
          end

          if line_number
            # Running specific eval by line number
            output.puts "\nRunning #{eval_set_class.name} at line #{line_number}"
            output.puts "-" * 50

            eval_results = run_eval_at_line(eval_set_class, file_path, line_number)
          else
            # Running all evals in the set
            output.puts "\nRunning #{eval_set_class.name}"
            output.puts "-" * 50

            eval_results = eval_set_class.run(output: output)
          end

          @results[eval_set_class.name] = eval_results.map(&:to_h)
          passed_count = eval_results.count(&:passed?)
          total_count = eval_results.count

          output.puts "-" * 50
          output.puts "#{eval_set_class.name}: #{passed_count}/#{total_count} evals passed"
        end

        export_results
        print_summary
      end

      private

      # Requires each given file and captures the EvalSet subclass it defines
      # (by diffing EvalSet.subclasses around the require — assumes one eval
      # set class per file). Exits with an error if a path does not exist.
      def load_eval_sets_from_files(file_paths)
        eval_sets = []

        file_paths.each do |f|
          file_path = f[:file_path]
          line_number = f[:line_number]

          # Convert relative path to absolute
          absolute_path = File.expand_path(file_path)

          unless File.exist?(absolute_path)
            output.puts Raif::Utils::Colors.red("Error: File not found: #{file_path}")
            exit 1
          end

          subclasses_before = Raif::Evals::EvalSet.subclasses

          require absolute_path

          loaded_eval_sets = Raif::Evals::EvalSet.subclasses - subclasses_before
          eval_set_class = loaded_eval_sets.first

          eval_set_entry = { class: eval_set_class, file_path: absolute_path }
          eval_set_entry[:line_number] = line_number if line_number

          eval_sets << eval_set_entry
        end

        eval_sets
      end

      # Runs the single eval whose block was defined at line_number.
      # @return [Array] one-element array with that eval's result, or [] if no
      #   eval block was defined at that line
      def run_eval_at_line(eval_set_class, file_path, line_number)
        target_eval = eval_set_class.evals.find { |e| e[:definition_line_number] == line_number }

        if target_eval.nil?
          output.puts Raif::Utils::Colors.red("Error: No eval block found at line #{line_number}")
          return []
        end

        instance = eval_set_class.new(output: output)
        [instance.run_eval(target_eval)]
      end

      # Requires every *_eval_set.rb under raif_evals/eval_sets and resolves
      # each file to its class via the directory-based namespace convention
      # (raif_evals/eval_sets/tasks/foo_eval_set.rb => Raif::Evals::Tasks::FooEvalSet).
      def discover_eval_sets
        eval_sets_dir = Rails.root.join("raif_evals", "eval_sets")
        return [] unless eval_sets_dir.exist?

        Dir.glob(eval_sets_dir.join("**", "*_eval_set.rb")).map do |file|
          relative_path = Pathname.new(file).relative_path_from(Rails.root)
          require Rails.root.join(relative_path)

          # Extract the path components after raif_evals/eval_sets
          path_from_eval_sets = Pathname.new(file).relative_path_from(eval_sets_dir)
          path_parts = path_from_eval_sets.dirname.to_s.split("/")

          # Remove "." if it's the only element (meaning file is in eval_sets root)
          path_parts = [] if path_parts == ["."]

          # Build the full class name
          class_name = File.basename(file, ".rb").camelize
          namespace_parts = ["Raif", "Evals"] + path_parts.map(&:camelize)
          full_class_name = (namespace_parts + [class_name]).join("::")

          full_class_name.constantize
        end.select { |klass| klass < Raif::Evals::EvalSet }
      end

      # Writes the collected results plus summary data to
      # raif_evals/results/eval_run_<timestamp>.json and reports the path.
      def export_results
        results_dir = Rails.root.join("raif_evals", "results")
        FileUtils.mkdir_p(results_dir)

        timestamp = Time.current.strftime("%Y%m%d_%H%M%S")
        filename = results_dir.join("eval_run_#{timestamp}.json")

        File.write(filename, JSON.pretty_generate({
          run_at: Time.current.iso8601,
          results: @results,
          summary: summary_data
        }))

        # Fixed: previously a broken interpolation ("#(unknown)") that printed
        # literal junk instead of the exported file path
        output.puts "\nResults exported to: #{filename}"
      end

      # Aggregates eval and expectation pass/fail counts across all eval sets.
      # @return [Hash] totals used by both export_results and print_summary
      def summary_data
        total_eval_sets = @results.count
        total_evals = @results.values.sum(&:count)
        passed_evals = @results.values.sum { |evals| evals.count { |e| e[:passed] } }

        total_expectations = @results.values.sum do |evals|
          evals.sum { |e| e[:expectation_results].count }
        end

        passed_expectations = @results.values.sum do |evals|
          evals.sum { |e| e[:expectation_results].count { |r| r[:status] == :passed } }
        end

        {
          total_eval_sets: total_eval_sets,
          total_evals: total_evals,
          passed_evals: passed_evals,
          total_expectations: total_expectations,
          passed_expectations: passed_expectations
        }
      end

      # Prints the final colorized summary block (evals and expectations).
      def print_summary
        data = summary_data

        output.puts ""
        output.puts "\n" + "=" * 50
        output.puts "SUMMARY"
        output.puts "=" * 50
        output.puts "Eval Sets: #{data[:total_eval_sets]}"
        output.puts ""
        output.puts "Evals:"
        output.puts "  #{data[:total_evals]} total"
        output.puts Raif::Utils::Colors.green("  #{data[:passed_evals]} passed")
        output.puts Raif::Utils::Colors.red("  #{data[:total_evals] - data[:passed_evals]} failed")
        output.puts ""
        output.puts "Expectations:"
        output.puts "  #{data[:total_expectations]} total"
        output.puts Raif::Utils::Colors.green("  #{data[:passed_expectations]} passed")
        output.puts Raif::Utils::Colors.red("  #{data[:total_expectations] - data[:passed_expectations]} failed")
        output.puts ""
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
# frozen_string_literal: true

module Raif
  module Evals
    # Defines evaluation criteria as a set of named scoring levels. Each level
    # pairs a description with either a single integer score (+:score+) or a
    # span of scores (+:score_range+, a Range). The rubric can render itself
    # as prompt text via {#to_prompt}.
    #
    # @example Range-based levels
    #   rubric = ScoringRubric.new(
    #     name: :technical_accuracy,
    #     description: "Evaluates technical correctness and precision",
    #     levels: [
    #       { score_range: (9..10), description: "Technically perfect with no errors" },
    #       { score_range: (7..8), description: "Mostly correct with minor technical issues" },
    #       { score_range: (0..6), description: "Significant technical problems" }
    #     ]
    #   )
    #
    # @example Integer levels
    #   rubric = ScoringRubric.new(
    #     name: :technical_accuracy,
    #     description: "Evaluates technical correctness and precision",
    #     levels: [
    #       { score: 5, description: "Technically perfect with no errors" },
    #       { score: 0, description: "Completely incorrect or misleading" }
    #     ]
    #   )
    #
    # @example Built-in rubrics
    #   ScoringRubric.accuracy
    #   ScoringRubric.helpfulness
    #   ScoringRubric.clarity
    class ScoringRubric
      # @return [Symbol] identifier for this rubric
      attr_reader :name
      # @return [String] what this rubric evaluates
      attr_reader :description
      # @return [Array<Hash>] level definitions (:score or :score_range, plus :description)
      attr_reader :levels

      # @param name [Symbol] identifier (e.g. :accuracy, :helpfulness)
      # @param description [String] human-readable summary of the criteria
      # @param levels [Array<Hash>] each with :description and either :score
      #   (Integer) or :score_range (Range)
      def initialize(name:, description:, levels:)
        @name = name
        @description = description
        @levels = levels
      end

      # Renders the rubric as text suitable for an LLM prompt: the description
      # followed by one bullet per scoring level.
      #
      # @return [String] e.g. "Evaluates...\n\nScoring levels:\n- 5: ..."
      # @raise [ArgumentError] when a level has neither :score nor a Range :score_range
      def to_prompt
        bullets = levels.map { |level| level_bullet(level) }
        "#{description}\n\nScoring levels:\n#{bullets.join}".strip
      end

      class << self
        # Built-in 1-5 rubric for factual correctness and precision.
        #
        # @return [ScoringRubric]
        # @example
        #   expect_llm_judge_score(response, scoring_rubric: ScoringRubric.accuracy, min_passing_score: 4)
        def accuracy
          new(
            name: :accuracy,
            description: "Evaluates factual correctness and precision",
            levels: [
              { score: 5, description: "Completely accurate with no errors" },
              { score: 4, description: "Mostly accurate with minor imprecisions" },
              { score: 3, description: "Generally accurate but some notable errors" },
              { score: 2, description: "Significant inaccuracies present" },
              { score: 1, description: "Mostly or entirely inaccurate" }
            ]
          )
        end

        # Built-in 1-5 rubric for how well a response serves the user's need.
        #
        # @return [ScoringRubric]
        # @example
        #   expect_llm_judge_score(response, scoring_rubric: ScoringRubric.helpfulness, min_passing_score: 4)
        def helpfulness
          new(
            name: :helpfulness,
            description: "Evaluates how well the response addresses user needs",
            levels: [
              { score: 5, description: "Extremely helpful, fully addresses the need" },
              { score: 4, description: "Very helpful with good coverage" },
              { score: 3, description: "Moderately helpful but missing some aspects" },
              { score: 2, description: "Somewhat helpful but significant gaps" },
              { score: 1, description: "Not helpful or misleading" }
            ]
          )
        end

        # Built-in 1-5 rubric for clarity and comprehensibility.
        #
        # @return [ScoringRubric]
        # @example
        #   expect_llm_judge_score(response, scoring_rubric: ScoringRubric.clarity, min_passing_score: 4)
        def clarity
          new(
            name: :clarity,
            description: "Evaluates clarity and comprehensibility",
            levels: [
              { score: 5, description: "Crystal clear and easy to understand" },
              { score: 4, description: "Clear with minor ambiguities" },
              { score: 3, description: "Generally clear but some confusion" },
              { score: 2, description: "Unclear in significant ways" },
              { score: 1, description: "Very unclear or incomprehensible" }
            ]
          )
        end
      end

      private

      # Formats one level as "- <score>: <description>\n" or
      # "- <min>-<max>: <description>\n". End-exclusive ranges are normalized
      # so the bullet shows the largest included score.
      def level_bullet(level)
        return "- #{level[:score]}: #{level[:description]}\n" if level.key?(:score)

        span = level[:score_range]
        raise ArgumentError, "level must include :score or :score_range (Range)" unless span.is_a?(Range)

        upper = span.exclude_end? ? span.end - 1 : span.end
        "- #{span.begin}-#{upper}: #{level[:description]}\n"
      end
    end
  end
end
data/lib/raif/evals.rb ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "raif/evals/expectation_result"
4
+ require "raif/evals/eval"
5
+ require "raif/evals/eval_set"
6
+ require "raif/evals/run"
7
+ require "raif/evals/llm_judge"
8
+ require "raif/evals/llm_judges/binary"
9
+ require "raif/evals/llm_judges/comparative"
10
+ require "raif/evals/llm_judges/scored"
11
+ require "raif/evals/llm_judges/summarization"
12
+ require "raif/evals/scoring_rubric"
13
+
14
module Raif
  module Evals
    # Empty namespace modules used to organize generated eval sets by the
    # kind of object they exercise (tasks, conversations, agents).
    module Tasks; end
    module Conversations; end
    module Agents; end
  end
end
@@ -10,6 +10,20 @@ module Raif
10
10
  @items_schema = nil
11
11
  end
12
12
 
13
# Builds a schema using an instance for conditional, instance-dependent logic.
#
# The block is run via instance_exec with the builder as receiver, so DSL
# methods (string, integer, etc.) resolve normally, while the given instance
# is handed in as the block's parameter for branching.
#
# @param instance [Object] context object passed into the block
# @yieldparam instance [Object] the same object, for conditional schema logic
# @return [JsonSchemaBuilder] self, to allow chaining
def build_with_instance(instance, &block)
  tap { instance_exec(instance, &block) }
end
26
+
13
27
  def string(name, options = {})
14
28
  add_property(name, "string", options)
15
29
  end