raif 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +4 -4
  2. data/app/jobs/raif/conversation_entry_job.rb +1 -1
  3. data/app/models/raif/agents/re_act_step.rb +1 -2
  4. data/app/models/raif/concerns/has_llm.rb +1 -1
  5. data/app/models/raif/concerns/task_run_args.rb +62 -0
  6. data/app/models/raif/conversation.rb +5 -1
  7. data/app/models/raif/conversation_entry.rb +6 -8
  8. data/app/models/raif/llm.rb +1 -1
  9. data/app/models/raif/llms/open_router.rb +3 -1
  10. data/app/models/raif/task.rb +22 -9
  11. data/app/views/raif/conversation_entries/_form.html.erb +1 -1
  12. data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
  13. data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
  14. data/config/locales/en.yml +8 -0
  15. data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
  16. data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
  17. data/exe/raif +7 -0
  18. data/lib/generators/raif/agent/agent_generator.rb +22 -7
  19. data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
  20. data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
  21. data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
  22. data/lib/generators/raif/base_generator.rb +19 -0
  23. data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
  24. data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
  25. data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
  26. data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
  27. data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
  28. data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
  29. data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
  30. data/lib/generators/raif/install/install_generator.rb +15 -0
  31. data/lib/generators/raif/install/templates/initializer.rb +11 -0
  32. data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -5
  33. data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -78
  34. data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +1 -1
  35. data/lib/generators/raif/task/task_generator.rb +22 -3
  36. data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
  37. data/lib/generators/raif/task/templates/task.rb.tt +55 -59
  38. data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
  39. data/lib/raif/cli/base.rb +39 -0
  40. data/lib/raif/cli/evals.rb +47 -0
  41. data/lib/raif/cli/evals_setup.rb +27 -0
  42. data/lib/raif/cli.rb +67 -0
  43. data/lib/raif/configuration.rb +20 -6
  44. data/lib/raif/evals/eval.rb +30 -0
  45. data/lib/raif/evals/eval_set.rb +111 -0
  46. data/lib/raif/evals/eval_sets/expectations.rb +53 -0
  47. data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
  48. data/lib/raif/evals/expectation_result.rb +39 -0
  49. data/lib/raif/evals/llm_judge.rb +32 -0
  50. data/lib/raif/evals/llm_judges/binary.rb +94 -0
  51. data/lib/raif/evals/llm_judges/comparative.rb +89 -0
  52. data/lib/raif/evals/llm_judges/scored.rb +63 -0
  53. data/lib/raif/evals/llm_judges/summarization.rb +166 -0
  54. data/lib/raif/evals/run.rb +201 -0
  55. data/lib/raif/evals/scoring_rubric.rb +174 -0
  56. data/lib/raif/evals.rb +26 -0
  57. data/lib/raif/llm_registry.rb +33 -0
  58. data/lib/raif/migration_checker.rb +3 -3
  59. data/lib/raif/utils/colors.rb +23 -0
  60. data/lib/raif/utils.rb +1 -0
  61. data/lib/raif/version.rb +1 -1
  62. data/lib/raif.rb +4 -0
  63. data/spec/support/current_temperature_test_tool.rb +34 -0
  64. data/spec/support/test_conversation.rb +1 -1
  65. metadata +35 -3
data/lib/raif/evals/eval_sets/llm_judge_expectations.rb (new file)
@@ -0,0 +1,255 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module EvalSets
+       module LlmJudgeExpectations
+
+         # Uses an LLM judge to evaluate whether content meets specific criteria with a binary pass/fail result.
+         #
+         # This method leverages the Binary LLM judge to assess content against provided criteria,
+         # returning a pass or fail judgment with reasoning and confidence scores.
+         #
+         # @param content [String] The content to be evaluated by the LLM judge
+         # @param criteria [String] The evaluation criteria that the content must meet
+         # @param examples [Array<Hash>] Optional examples showing how to evaluate similar content.
+         #   Each example should have keys: :content, :passes (boolean), :reasoning
+         # @param strict [Boolean] Whether to apply criteria strictly (true) or with reasonable judgment (false)
+         # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+         #   If nil, uses the configured default judge model or falls back to default LLM
+         # @param additional_context [String, nil] Optional additional context to be provided to the judge
+         #
+         # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+         #
+         # @example Basic usage
+         #   expect_llm_judge_passes(
+         #     task.parsed_response,
+         #     criteria: "Response is polite and professional"
+         #   )
+         #
+         # @example With examples and strict mode
+         #   expect_llm_judge_passes(
+         #     content,
+         #     criteria: "Contains a proper greeting",
+         #     examples: [
+         #       { content: "Hello, how can I help?", passes: true, reasoning: "Contains greeting" },
+         #       { content: "What do you want?", passes: false, reasoning: "No greeting, rude tone" }
+         #     ],
+         #     strict: true
+         #   )
+         #
+         # @note The judge result includes metadata accessible via expectation_result.metadata:
+         #   - :passes - Boolean result
+         #   - :reasoning - Detailed explanation
+         #   - :confidence - Confidence score (0.0-1.0)
+         def expect_llm_judge_passes(content, criteria:, examples: [], strict: false, llm_judge_model_key: nil, additional_context: nil,
+           result_metadata: {})
+           judge_task = LlmJudges::Binary.run(
+             content_to_judge: content,
+             criteria: criteria,
+             examples: examples,
+             strict_mode: strict,
+             llm_model_key: llm_judge_model_key,
+             additional_context: additional_context
+           )
+
+           if judge_task.low_confidence? && output.respond_to?(:puts)
+             output.puts Raif::Utils::Colors.yellow(" ⚠ Low confidence: #{judge_task.judgment_confidence}")
+           end
+
+           if Raif.config.evals_verbose_output && output.respond_to?(:puts)
+             output.puts " #{judge_task.judgment_reasoning}"
+           end
+
+           judge_metadata = {
+             passes: judge_task.passes?,
+             reasoning: judge_task.judgment_reasoning,
+             confidence: judge_task.judgment_confidence,
+           }.compact
+
+           # Merge user metadata with judge metadata
+           combined_metadata = result_metadata.merge(judge_metadata)
+
+           expectation_result = expect "LLM judge: #{criteria}", result_metadata: combined_metadata do
+             judge_task.passes?
+           end
+
+           if expectation_result && judge_task.errors.any?
+             expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+           end
+
+           expectation_result
+         end
+
+         # Uses an LLM judge to evaluate content with a numerical score based on a detailed rubric.
+         #
+         # This method leverages the Scored LLM judge to assess content against a scoring rubric,
+         # providing a numerical score with detailed reasoning and determining pass/fail based on
+         # the minimum passing score threshold.
+         #
+         # @param output [String] The content to be evaluated by the LLM judge
+         # @param scoring_rubric [ScoringRubric, String] The rubric to use for scoring. Can be a
+         #   ScoringRubric object with structured levels or a plain string description
+         # @param min_passing_score [Integer] Minimum score required to pass
+         # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+         #   If nil, uses the configured default judge model or falls back to default LLM
+         # @param additional_context [String, nil] Optional additional context to be provided to the judge
+         #
+         # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+         #
+         # @example Using a built-in rubric
+         #   expect_llm_judge_score(
+         #     task.parsed_response,
+         #     scoring_rubric: ScoringRubric.accuracy,
+         #     min_passing_score: 8
+         #   )
+         #
+         # @example Using a custom rubric
+         #   rubric = ScoringRubric.new(
+         #     name: :technical_writing,
+         #     description: "Evaluates technical writing quality",
+         #     levels: [
+         #       { score_range: (9..10), description: "Expert-level technical content" },
+         #       { score_range: (7..8), description: "Strong technical content" },
+         #       { score_range: (5..6), description: "Adequate technical content" },
+         #       { score_range: (3..4), description: "Weak technical content" },
+         #       { score_range: (0..2), description: "Poor technical content" }
+         #     ]
+         #   )
+         #   expect_llm_judge_score(output, scoring_rubric: rubric, min_passing_score: 7)
+         #
+         # @example Using a simple string rubric
+         #   expect_llm_judge_score(
+         #     output,
+         #     scoring_rubric: "Rate clarity from 0-5 where 5 is crystal clear",
+         #     min_passing_score: 4
+         #   )
+         #
+         # @note The judge result includes metadata accessible via expectation_result.metadata:
+         #   - :score - Numerical score given
+         #   - :reasoning - Detailed explanation
+         #   - :confidence - Confidence score (0.0-1.0)
+         def expect_llm_judge_score(output, scoring_rubric:, min_passing_score:, llm_judge_model_key: nil, additional_context: nil,
+           result_metadata: {})
+           scoring_rubric_obj = scoring_rubric
+
+           judge_task = LlmJudges::Scored.run(
+             content_to_judge: output,
+             scoring_rubric: scoring_rubric_obj,
+             llm_model_key: llm_judge_model_key,
+             additional_context: additional_context
+           )
+
+           rubric_name = scoring_rubric_obj.respond_to?(:name) ? scoring_rubric_obj.name : "custom"
+           if output.respond_to?(:puts)
+             output.puts " Score: #{judge_task.judgment_score}"
+             output.puts " #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+           end
+
+           judge_metadata = {
+             score: judge_task.judgment_score,
+             reasoning: judge_task.judgment_reasoning,
+             confidence: judge_task.judgment_confidence,
+           }.compact
+
+           # Merge user metadata with judge metadata
+           combined_metadata = result_metadata.merge(judge_metadata)
+
+           expectation_result = expect "LLM judge score (#{rubric_name}): >= #{min_passing_score}", result_metadata: combined_metadata do
+             judge_task.completed? && judge_task.judgment_score && judge_task.judgment_score >= min_passing_score
+           end
+
+           if expectation_result && judge_task.errors.any?
+             expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+           end
+
+           expectation_result
+         end
+
+         # Uses an LLM judge to compare two pieces of content and determine which better meets specified criteria.
+         #
+         # This method leverages the Comparative LLM judge to perform A/B testing between two pieces
+         # of content. Content placement is randomized to avoid position bias, and the judge determines
+         # which content better satisfies the comparison criteria.
+         #
+         # @param content_to_judge [String] The primary content being evaluated (will be randomly assigned to position A or B)
+         # @param over [String] The comparison content to evaluate against (will be randomly assigned to position A or B)
+         # @param criteria [String] The comparison criteria to use for evaluation
+         # @param allow_ties [Boolean] Whether the judge can declare a tie if both contents are equal (default: true)
+         # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+         #   If nil, uses the configured default judge model or falls back to default LLM
+         # @param additional_context [String, nil] Optional additional context to help the judge
+         #
+         # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+         #
+         # @example Basic A/B comparison
+         #   expect_llm_judge_prefers(
+         #     new_response,
+         #     over: baseline_response,
+         #     criteria: "More comprehensive and accurate response"
+         #   )
+         #
+         # @example Model comparison with no ties allowed
+         #   expect_llm_judge_prefers(
+         #     claude_response,
+         #     over: gpt_response,
+         #     criteria: "Better follows the specific instructions given",
+         #     allow_ties: false
+         #   )
+         #
+         # @example With additional context
+         #   expect_llm_judge_prefers(
+         #     response_a,
+         #     over: response_b,
+         #     criteria: "More helpful for a beginner audience",
+         #     additional_context: "The user identified themselves as new to programming"
+         #   )
+         #
+         # @note The expectation passes if the judge correctly identifies the expected winner.
+         #   Due to randomization, content_to_judge may be assigned to either position A or B,
+         #   and the judge's choice is validated against the expected winner.
+         #
+         # @note The judge result includes metadata accessible via expectation_result.metadata:
+         #   - :winner - Which content won ("A", "B", or "tie")
+         #   - :reasoning - Detailed explanation of the choice
+         #   - :confidence - Confidence score (0.0-1.0)
+         def expect_llm_judge_prefers(content_to_judge, over:, criteria:, allow_ties: true, llm_judge_model_key: nil, additional_context: nil,
+           result_metadata: {})
+           judge_task = LlmJudges::Comparative.run(
+             content_to_judge: content_to_judge,
+             over_content: over,
+             comparison_criteria: criteria,
+             allow_ties: allow_ties,
+             llm_model_key: llm_judge_model_key,
+             additional_context: additional_context
+           )
+
+           if output.respond_to?(:puts)
+             output.puts " Winner: #{judge_task.winner}"
+             output.puts " #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+           end
+
+           judge_metadata = {
+             winner: judge_task.winner,
+             reasoning: judge_task.judgment_reasoning,
+             confidence: judge_task.judgment_confidence,
+           }.compact
+
+           # Merge user metadata with judge metadata
+           combined_metadata = result_metadata.merge(judge_metadata)
+
+           expectation_result = expect "LLM judge prefers A over B: #{criteria}", result_metadata: combined_metadata do
+             judge_task.completed? && judge_task.correct_expected_winner?
+           end
+
+           if expectation_result && judge_task.errors.any?
+             expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+           end
+
+           expectation_result
+         end
+
+       end
+     end
+   end
+ end
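
Usage sketch (not part of the diff): the expectation helpers above are designed to be mixed into eval sets (see data/lib/raif/evals/eval_set.rb and the new *_eval_set.rb.tt generator templates). The eval-set DSL itself is not shown in this hunk, so the `eval` block, task class, and variable names below are illustrative assumptions; the expectation calls mirror the documented examples.

    # Hypothetical eval set; the `eval` block DSL is assumed from the new EvalSet class (not shown in this hunk).
    class DocumentSummaryEvalSet < Raif::Evals::EvalSet
      eval "summary is accurate and polite" do
        task = Raif::DocumentSummaryTask.run(creator: user) # hypothetical task and creator

        # Binary pass/fail judgment
        expect_llm_judge_passes(
          task.parsed_response,
          criteria: "Response is polite and professional"
        )

        # Scored judgment against a rubric, with a minimum passing score
        expect_llm_judge_score(
          task.parsed_response,
          scoring_rubric: Raif::Evals::ScoringRubric.accuracy,
          min_passing_score: 8
        )
      end
    end
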
data/lib/raif/evals/expectation_result.rb (new file)
@@ -0,0 +1,39 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     class ExpectationResult
+       attr_reader :description, :status, :error
+       attr_accessor :metadata, :error_message
+
+       def initialize(description:, status:, error: nil, error_message: nil, metadata: nil)
+         @description = description
+         @status = status
+         @error = error
+         @error_message = error_message
+         @metadata = metadata
+       end
+
+       def passed?
+         @status == :passed
+       end
+
+       def failed?
+         @status == :failed
+       end
+
+       def error?
+         @status == :error
+       end
+
+       def to_h
+         {
+           description: description,
+           status: status,
+           error: error_message.presence || error&.message,
+           metadata: metadata
+         }.compact
+       end
+     end
+   end
+ end
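
A minimal sketch of the value object above, using only the API shown in this hunk (the description and metadata values are illustrative):

    result = Raif::Evals::ExpectationResult.new(
      description: "LLM judge: Response is polite and professional",
      status: :passed,
      metadata: { passes: true, confidence: 0.9 }
    )

    result.passed? # => true
    result.to_h    # => { description: "LLM judge: ...", status: :passed, metadata: { passes: true, confidence: 0.9 } }
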
data/lib/raif/evals/llm_judge.rb (new file)
@@ -0,0 +1,32 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     class LlmJudge < Raif::Task
+       # Set default temperature for consistent judging
+       llm_temperature 0.0
+
+       # Default to JSON response format for structured output
+       llm_response_format :json
+
+       task_run_arg :content_to_judge # the content to judge
+       task_run_arg :additional_context # additional context to be provided to the judge
+
+       def default_llm_model_key
+         Raif.config.evals_default_llm_judge_model_key || super
+       end
+
+       def judgment_reasoning
+         parsed_response["reasoning"] if completed?
+       end
+
+       def judgment_confidence
+         parsed_response["confidence"] if completed?
+       end
+
+       def low_confidence?
+         judgment_confidence && judgment_confidence < 0.5
+       end
+     end
+   end
+ end
data/lib/raif/evals/llm_judges/binary.rb (new file)
@@ -0,0 +1,94 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Binary < Raif::Evals::LlmJudge
+         task_run_arg :criteria
+         task_run_arg :examples
+         task_run_arg :strict_mode
+
+         json_response_schema do
+           boolean :passes, description: "Whether the content passes the criteria"
+           string :reasoning, description: "Detailed explanation of the judgment"
+           number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an expert evaluator assessing whether content meets specific criteria.
+             Your task is to make binary pass/fail judgments with clear reasoning.
+
+             First, provide detailed reasoning/explanation of your evaluation. Then, provide a precise pass/fail judgment.
+
+             Respond with JSON matching this schema:
+             {
+               "passes": boolean,
+               "reasoning": "detailed explanation",
+               "confidence": 0.0-1.0
+             }
+           PROMPT
+         end
+
+         def build_prompt
+           prompt = <<~PROMPT
+             Evaluation criteria: #{criteria}
+
+             #{strict_mode ? "Apply the criteria strictly without any leniency." : "Apply reasonable judgment while adhering to the criteria."}
+           PROMPT
+
+           if examples.present?
+             prompt += "\nHere are examples of how to evaluate:"
+             examples.each do |example|
+               prompt += format_example(example)
+             end
+           end
+
+           prompt += additional_context_prompt if additional_context.present?
+
+           prompt += <<~PROMPT.rstrip
+
+             Now evaluate this content:
+             #{content_to_judge}
+
+             Does this content meet the evaluation criteria?
+           PROMPT
+
+           prompt
+         end
+
+         # Judgment accessor methods
+         def passes?
+           parsed_response["passes"] if completed?
+         end
+
+         private
+
+         def additional_context_prompt
+           <<~PROMPT
+
+             Additional context:
+             #{additional_context}
+           PROMPT
+         end
+
+         def format_example(example)
+           if example.key?(:output)
+             content_label = "Output"
+             content_value = example[:output]
+           else
+             content_label = "Content"
+             content_value = example[:content]
+           end
+
+           <<~EXAMPLE
+
+             #{content_label}: #{content_value}
+             Reasoning: #{example[:reasoning]}
+             Judgment: #{example[:passes] ? "PASS" : "FAIL"}
+           EXAMPLE
+         end
+       end
+     end
+   end
+ end
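
The binary judge can also be run directly; the keyword arguments below mirror the call made by expect_llm_judge_passes in llm_judge_expectations.rb (the content and criteria strings are illustrative):

    # Run the judge task; accessors read the parsed JSON response.
    judge = Raif::Evals::LlmJudges::Binary.run(
      content_to_judge: "Hello! Happy to help you reset your password.", # illustrative content
      criteria: "Response is polite and professional",
      examples: [],
      strict_mode: false,
      llm_model_key: nil,        # nil falls back to evals_default_llm_judge_model_key, then the default LLM
      additional_context: nil
    )

    judge.passes?             # => true/false (nil unless the task completed)
    judge.judgment_reasoning  # => "detailed explanation"
    judge.low_confidence?     # => true when confidence < 0.5
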
data/lib/raif/evals/llm_judges/comparative.rb (new file)
@@ -0,0 +1,89 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Comparative < Raif::Evals::LlmJudge
+         task_run_arg :over_content # the content to compare against
+         task_run_arg :comparison_criteria # the criteria to use when comparing content_to_judge to over_content
+         task_run_arg :allow_ties # whether to allow ties in the comparison
+
+         attr_accessor :content_a, :content_b, :expected_winner
+
+         before_create do
+           self.expected_winner = ["A", "B"].sample
+
+           if expected_winner == "A"
+             self.content_a = content_to_judge
+             self.content_b = over_content
+           else
+             self.content_a = over_content
+             self.content_b = content_to_judge
+           end
+         end
+
+         json_response_schema do
+           string :winner, description: "Which content is better (A, B, or tie)", enum: ["A", "B", "tie"]
+           string :reasoning, description: "Detailed explanation of the judgment"
+           number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an expert evaluator comparing two pieces of content to determine which better meets specified criteria.
+
+             #{allow_ties ? "You may declare a tie if both pieces of content are equally good." : "You must choose a winner even if the difference is minimal."}
+
+             First, provide detailed reasoning for your choice. Then, provide a precise winner #{allow_ties ? "(A, B, or tie)" : "(A or B)"}.
+
+             Respond with JSON matching the required schema.
+           PROMPT
+         end
+
+         def build_prompt
+           <<~PROMPT.strip
+             Comparison criteria: #{comparison_criteria}
+             #{additional_context_prompt}
+             Compare the following two pieces of content:
+
+             CONTENT A:
+             #{content_a}
+
+             CONTENT B:
+             #{content_b}
+
+             Which content better meets the comparison criteria?
+           PROMPT
+         end
+
+         def winner
+           parsed_response["winner"] if completed?
+         end
+
+         def tie?
+           return unless completed? # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+
+           parsed_response["winner"] == "tie"
+         end
+
+         def correct_expected_winner?
+           return unless completed? # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+
+           parsed_response["winner"] == expected_winner
+         end
+
+         private
+
+         def additional_context_prompt
+           return if additional_context.blank?
+
+           <<~PROMPT
+
+             Additional context:
+             #{additional_context}
+           PROMPT
+         end
+       end
+     end
+   end
+ end
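
The comparative judge mirrors the call made by expect_llm_judge_prefers; position A/B assignment is randomized in the before_create hook above to avoid position bias (the variable names below are illustrative):

    judge = Raif::Evals::LlmJudges::Comparative.run(
      content_to_judge: new_response,    # illustrative: output from the prompt under test
      over_content: baseline_response,   # illustrative: output from the previous prompt
      comparison_criteria: "More comprehensive and accurate response",
      allow_ties: true,
      llm_model_key: nil,
      additional_context: nil
    )

    judge.winner                    # => "A", "B", or "tie"
    judge.correct_expected_winner?  # => true when the judge picked content_to_judge's randomized slot
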
data/lib/raif/evals/llm_judges/scored.rb (new file)
@@ -0,0 +1,63 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module LlmJudges
+       class Scored < Raif::Evals::LlmJudge
+         task_run_arg :scoring_rubric # the scoring rubric to use when evaluating the content
+
+         json_response_schema do
+           number :score, description: "Numerical score based on the rubric"
+           string :reasoning, description: "Detailed explanation of the score"
+           number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+         end
+
+         def build_system_prompt
+           <<~PROMPT.strip
+             You are an expert evaluator providing numerical scores based on a detailed rubric.
+
+             First, provide detailed reasoning/explanation of your evaluation. Then, provide a precise score according to the provided rubric.
+
+             Respond with JSON matching this schema:
+             {
+               "score": number,
+               "reasoning": "detailed explanation",
+               "confidence": 0.0-1.0
+             }
+           PROMPT
+         end
+
+         def build_prompt
+           <<~PROMPT.strip
+             Scoring rubric:
+             #{format_rubric(scoring_rubric)}
+             #{additional_context_prompt}
+             Evaluate the following content according to the scoring rubric:
+             #{content_to_judge}
+
+             Provide your score and detailed reasoning.
+           PROMPT
+         end
+
+         def judgment_score
+           parsed_response["score"] if completed?
+         end
+
+         private
+
+         def additional_context_prompt
+           return if additional_context.blank?
+
+           <<~PROMPT
+             \nAdditional context:
+             #{additional_context}
+           PROMPT
+         end
+
+         def format_rubric(rubric)
+           rubric.is_a?(ScoringRubric) ? rubric.to_prompt : rubric.to_s
+         end
+       end
+     end
+   end
+ end
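
The scored judge mirrors the call made by expect_llm_judge_score; the rubric constructor follows the ScoringRubric example documented in llm_judge_expectations.rb (the rubric levels and content variable below are illustrative):

    rubric = Raif::Evals::ScoringRubric.new(
      name: :technical_writing,
      description: "Evaluates technical writing quality",
      levels: [
        { score_range: (7..10), description: "Strong technical content" },
        { score_range: (0..6), description: "Needs improvement" }
      ]
    )

    judge = Raif::Evals::LlmJudges::Scored.run(
      content_to_judge: draft_text,  # illustrative variable holding the content under evaluation
      scoring_rubric: rubric,        # a ScoringRubric (formatted via #to_prompt) or a plain string
      llm_model_key: nil,
      additional_context: nil
    )

    judge.judgment_score      # => numerical score from the JSON response
    judge.judgment_reasoning  # => "detailed explanation"
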