raif 1.2.2 → 1.3.0
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/app/jobs/raif/conversation_entry_job.rb +1 -1
- data/app/models/raif/agents/re_act_step.rb +1 -2
- data/app/models/raif/concerns/has_llm.rb +1 -1
- data/app/models/raif/concerns/task_run_args.rb +62 -0
- data/app/models/raif/conversation.rb +5 -1
- data/app/models/raif/conversation_entry.rb +6 -8
- data/app/models/raif/llm.rb +1 -1
- data/app/models/raif/llms/open_router.rb +3 -1
- data/app/models/raif/task.rb +22 -9
- data/app/views/raif/conversation_entries/_form.html.erb +1 -1
- data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
- data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
- data/config/locales/en.yml +8 -0
- data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
- data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
- data/exe/raif +7 -0
- data/lib/generators/raif/agent/agent_generator.rb +22 -7
- data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
- data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
- data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
- data/lib/generators/raif/base_generator.rb +19 -0
- data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
- data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
- data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
- data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
- data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
- data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
- data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
- data/lib/generators/raif/install/install_generator.rb +15 -0
- data/lib/generators/raif/install/templates/initializer.rb +11 -0
- data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -5
- data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -78
- data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +1 -1
- data/lib/generators/raif/task/task_generator.rb +22 -3
- data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
- data/lib/generators/raif/task/templates/task.rb.tt +55 -59
- data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
- data/lib/raif/cli/base.rb +39 -0
- data/lib/raif/cli/evals.rb +47 -0
- data/lib/raif/cli/evals_setup.rb +27 -0
- data/lib/raif/cli.rb +67 -0
- data/lib/raif/configuration.rb +20 -6
- data/lib/raif/evals/eval.rb +30 -0
- data/lib/raif/evals/eval_set.rb +111 -0
- data/lib/raif/evals/eval_sets/expectations.rb +53 -0
- data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
- data/lib/raif/evals/expectation_result.rb +39 -0
- data/lib/raif/evals/llm_judge.rb +32 -0
- data/lib/raif/evals/llm_judges/binary.rb +94 -0
- data/lib/raif/evals/llm_judges/comparative.rb +89 -0
- data/lib/raif/evals/llm_judges/scored.rb +63 -0
- data/lib/raif/evals/llm_judges/summarization.rb +166 -0
- data/lib/raif/evals/run.rb +201 -0
- data/lib/raif/evals/scoring_rubric.rb +174 -0
- data/lib/raif/evals.rb +26 -0
- data/lib/raif/llm_registry.rb +33 -0
- data/lib/raif/migration_checker.rb +3 -3
- data/lib/raif/utils/colors.rb +23 -0
- data/lib/raif/utils.rb +1 -0
- data/lib/raif/version.rb +1 -1
- data/lib/raif.rb +4 -0
- data/spec/support/current_temperature_test_tool.rb +34 -0
- data/spec/support/test_conversation.rb +1 -1
- metadata +35 -3
data/lib/raif/evals/eval_sets/llm_judge_expectations.rb
@@ -0,0 +1,255 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    module EvalSets
+      module LlmJudgeExpectations
+
+        # Uses an LLM judge to evaluate whether content meets specific criteria with a binary pass/fail result.
+        #
+        # This method leverages the Binary LLM judge to assess content against provided criteria,
+        # returning a pass or fail judgment with reasoning and confidence scores.
+        #
+        # @param content [String] The content to be evaluated by the LLM judge
+        # @param criteria [String] The evaluation criteria that the content must meet
+        # @param examples [Array<Hash>] Optional examples showing how to evaluate similar content.
+        #   Each example should have keys: :content, :passes (boolean), :reasoning
+        # @param strict [Boolean] Whether to apply criteria strictly (true) or with reasonable judgment (false)
+        # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+        #   If nil, uses the configured default judge model or falls back to default LLM
+        # @param additional_context [String, nil] Optional additional context to be provided to the judge
+        #
+        # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+        #
+        # @example Basic usage
+        #   expect_llm_judge_passes(
+        #     task.parsed_response,
+        #     criteria: "Response is polite and professional"
+        #   )
+        #
+        # @example With examples and strict mode
+        #   expect_llm_judge_passes(
+        #     content,
+        #     criteria: "Contains a proper greeting",
+        #     examples: [
+        #       { content: "Hello, how can I help?", passes: true, reasoning: "Contains greeting" },
+        #       { content: "What do you want?", passes: false, reasoning: "No greeting, rude tone" }
+        #     ],
+        #     strict: true
+        #   )
+        #
+        # @note The judge result includes metadata accessible via expectation_result.metadata:
+        #   - :passes - Boolean result
+        #   - :reasoning - Detailed explanation
+        #   - :confidence - Confidence score (0.0-1.0)
+        def expect_llm_judge_passes(content, criteria:, examples: [], strict: false, llm_judge_model_key: nil, additional_context: nil,
+          result_metadata: {})
+          judge_task = LlmJudges::Binary.run(
+            content_to_judge: content,
+            criteria: criteria,
+            examples: examples,
+            strict_mode: strict,
+            llm_model_key: llm_judge_model_key,
+            additional_context: additional_context
+          )
+
+          if judge_task.low_confidence? && output.respond_to?(:puts)
+            output.puts Raif::Utils::Colors.yellow(" ⚠ Low confidence: #{judge_task.judgment_confidence}")
+          end
+
+          if Raif.config.evals_verbose_output && output.respond_to?(:puts)
+            output.puts " #{judge_task.judgment_reasoning}"
+          end
+
+          judge_metadata = {
+            passes: judge_task.passes?,
+            reasoning: judge_task.judgment_reasoning,
+            confidence: judge_task.judgment_confidence,
+          }.compact
+
+          # Merge user metadata with judge metadata
+          combined_metadata = result_metadata.merge(judge_metadata)
+
+          expectation_result = expect "LLM judge: #{criteria}", result_metadata: combined_metadata do
+            judge_task.passes?
+          end
+
+          if expectation_result && judge_task.errors.any?
+            expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+          end
+
+          expectation_result
+        end
+
+        # Uses an LLM judge to evaluate content with a numerical score based on a detailed rubric.
+        #
+        # This method leverages the Scored LLM judge to assess content against a scoring rubric,
+        # providing a numerical score with detailed reasoning and determining pass/fail based on
+        # the minimum passing score threshold.
+        #
+        # @param output [String] The content to be evaluated by the LLM judge
+        # @param scoring_rubric [ScoringRubric, String] The rubric to use for scoring. Can be a
+        #   ScoringRubric object with structured levels or a plain string description
+        # @param min_passing_score [Integer] Minimum score required to pass
+        # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+        #   If nil, uses the configured default judge model or falls back to default LLM
+        # @param additional_context [String, nil] Optional additional context to be provided to the judge
+        #
+        # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+        #
+        # @example Using a built-in rubric
+        #   expect_llm_judge_score(
+        #     task.parsed_response,
+        #     scoring_rubric: ScoringRubric.accuracy,
+        #     min_passing_score: 8
+        #   )
+        #
+        # @example Using a custom rubric
+        #   rubric = ScoringRubric.new(
+        #     name: :technical_writing,
+        #     description: "Evaluates technical writing quality",
+        #     levels: [
+        #       { score_range: (9..10), description: "Expert-level technical content" },
+        #       { score_range: (7..8), description: "Strong technical content" },
+        #       { score_range: (5..6), description: "Adequate technical content" },
+        #       { score_range: (3..4), description: "Weak technical content" },
+        #       { score_range: (0..2), description: "Poor technical content" }
+        #     ]
+        #   )
+        #   expect_llm_judge_score(output, scoring_rubric: rubric, min_passing_score: 7)
+        #
+        # @example Using a simple string rubric
+        #   expect_llm_judge_score(
+        #     output,
+        #     scoring_rubric: "Rate clarity from 0-5 where 5 is crystal clear",
+        #     min_passing_score: 4
+        #   )
+        #
+        # @note The judge result includes metadata accessible via expectation_result.metadata:
+        #   - :score - Numerical score given
+        #   - :reasoning - Detailed explanation
+        #   - :confidence - Confidence score (0.0-1.0)
+        def expect_llm_judge_score(output, scoring_rubric:, min_passing_score:, llm_judge_model_key: nil, additional_context: nil,
+          result_metadata: {})
+          scoring_rubric_obj = scoring_rubric
+
+          judge_task = LlmJudges::Scored.run(
+            content_to_judge: output,
+            scoring_rubric: scoring_rubric_obj,
+            llm_model_key: llm_judge_model_key,
+            additional_context: additional_context
+          )
+
+          rubric_name = scoring_rubric_obj.respond_to?(:name) ? scoring_rubric_obj.name : "custom"
+          if output.respond_to?(:puts)
+            output.puts " Score: #{judge_task.judgment_score}"
+            output.puts " #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+          end
+
+          judge_metadata = {
+            score: judge_task.judgment_score,
+            reasoning: judge_task.judgment_reasoning,
+            confidence: judge_task.judgment_confidence,
+          }.compact
+
+          # Merge user metadata with judge metadata
+          combined_metadata = result_metadata.merge(judge_metadata)
+
+          expectation_result = expect "LLM judge score (#{rubric_name}): >= #{min_passing_score}", result_metadata: combined_metadata do
+            judge_task.completed? && judge_task.judgment_score && judge_task.judgment_score >= min_passing_score
+          end
+
+          if expectation_result && judge_task.errors.any?
+            expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+          end
+
+          expectation_result
+        end
+
+        # Uses an LLM judge to compare two pieces of content and determine which better meets specified criteria.
+        #
+        # This method leverages the Comparative LLM judge to perform A/B testing between two pieces
+        # of content. Content placement is randomized to avoid position bias, and the judge determines
+        # which content better satisfies the comparison criteria.
+        #
+        # @param content_to_judge [String] The primary content being evaluated (will be randomly assigned to position A or B)
+        # @param over [String] The comparison content to evaluate against (will be randomly assigned to position A or B)
+        # @param criteria [String] The comparison criteria to use for evaluation
+        # @param allow_ties [Boolean] Whether the judge can declare a tie if both contents are equal (default: true)
+        # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+        #   If nil, uses the configured default judge model or falls back to default LLM
+        # @param additional_context [String, nil] Optional additional context to help the judge
+        #
+        # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+        #
+        # @example Basic A/B comparison
+        #   expect_llm_judge_prefers(
+        #     new_response,
+        #     over: baseline_response,
+        #     criteria: "More comprehensive and accurate response"
+        #   )
+        #
+        # @example Model comparison with no ties allowed
+        #   expect_llm_judge_prefers(
+        #     claude_response,
+        #     over: gpt_response,
+        #     criteria: "Better follows the specific instructions given",
+        #     allow_ties: false
+        #   )
+        #
+        # @example With additional context
+        #   expect_llm_judge_prefers(
+        #     response_a,
+        #     over: response_b,
+        #     criteria: "More helpful for a beginner audience",
+        #     additional_context: "The user identified themselves as new to programming"
+        #   )
+        #
+        # @note The expectation passes if the judge correctly identifies the expected winner.
+        #   Due to randomization, content_to_judge may be assigned to either position A or B,
+        #   and the judge's choice is validated against the expected winner.
+        #
+        # @note The judge result includes metadata accessible via expectation_result.metadata:
+        #   - :winner - Which content won ("A", "B", or "tie")
+        #   - :reasoning - Detailed explanation of the choice
+        #   - :confidence - Confidence score (0.0-1.0)
+        def expect_llm_judge_prefers(content_to_judge, over:, criteria:, allow_ties: true, llm_judge_model_key: nil, additional_context: nil,
+          result_metadata: {})
+          judge_task = LlmJudges::Comparative.run(
+            content_to_judge: content_to_judge,
+            over_content: over,
+            comparison_criteria: criteria,
+            allow_ties: allow_ties,
+            llm_model_key: llm_judge_model_key,
+            additional_context: additional_context
+          )
+
+          if output.respond_to?(:puts)
+            output.puts " Winner: #{judge_task.winner}"
+            output.puts " #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+          end
+
+          judge_metadata = {
+            winner: judge_task.winner,
+            reasoning: judge_task.judgment_reasoning,
+            confidence: judge_task.judgment_confidence,
+          }.compact
+
+          # Merge user metadata with judge metadata
+          combined_metadata = result_metadata.merge(judge_metadata)
+
+          expectation_result = expect "LLM judge prefers A over B: #{criteria}", result_metadata: combined_metadata do
+            judge_task.completed? && judge_task.correct_expected_winner?
+          end
+
+          if expectation_result && judge_task.errors.any?
+            expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+          end
+
+          expectation_result
+        end
+
+      end
+    end
+  end
+end
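These expectation helpers are meant to be called from inside an eval set (see eval_set.rb and the `*_eval_set.rb.tt` generator templates listed above). As a rough usage sketch — not part of the diff, with the `Raif::Evals::EvalSet` base class, `eval` block DSL, task class, and variables assumed for illustration — an eval set might exercise them like this:

```ruby
# Illustrative sketch only. The EvalSet base class and `eval` block DSL are
# assumed from lib/raif/evals/eval_set.rb; the task class and variables are
# hypothetical. The expect_llm_judge_* calls match the helpers in this file.
class DocumentSummaryEvalSet < Raif::Evals::EvalSet
  eval "summary is faithful, clear, and beats the baseline" do
    task = DocumentSummarizationTask.run(document: sample_document) # hypothetical task

    # Binary pass/fail judgment against plain-language criteria
    expect_llm_judge_passes(
      task.parsed_response,
      criteria: "Summary contains no claims that are absent from the source document"
    )

    # Rubric-based numeric score with a minimum passing threshold
    expect_llm_judge_score(
      task.parsed_response,
      scoring_rubric: "Rate clarity from 0-5 where 5 is crystal clear",
      min_passing_score: 4
    )

    # A/B preference against a stored baseline response
    expect_llm_judge_prefers(
      task.parsed_response,
      over: baseline_summary, # hypothetical baseline
      criteria: "More comprehensive and accurate response"
    )
  end
end
```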
data/lib/raif/evals/expectation_result.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    class ExpectationResult
+      attr_reader :description, :status, :error
+      attr_accessor :metadata, :error_message
+
+      def initialize(description:, status:, error: nil, error_message: nil, metadata: nil)
+        @description = description
+        @status = status
+        @error = error
+        @error_message = error_message
+        @metadata = metadata
+      end
+
+      def passed?
+        @status == :passed
+      end
+
+      def failed?
+        @status == :failed
+      end
+
+      def error?
+        @status == :error
+      end
+
+      def to_h
+        {
+          description: description,
+          status: status,
+          error: error_message.presence || error&.message,
+          metadata: metadata
+        }.compact
+      end
+    end
+  end
+end
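For reference (not part of the diff), a minimal sketch of how the expectation helpers above end up populating this value object; the values are illustrative:

```ruby
result = Raif::Evals::ExpectationResult.new(
  description: "LLM judge: Response is polite and professional",
  status: :passed,
  metadata: { passes: true, confidence: 0.9 }
)

result.passed? # => true
result.to_h    # nil keys (:error here) are dropped by .compact
# => {
#      description: "LLM judge: Response is polite and professional",
#      status: :passed,
#      metadata: { passes: true, confidence: 0.9 }
#    }
```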
data/lib/raif/evals/llm_judge.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    class LlmJudge < Raif::Task
+      # Set default temperature for consistent judging
+      llm_temperature 0.0
+
+      # Default to JSON response format for structured output
+      llm_response_format :json
+
+      task_run_arg :content_to_judge # the content to judge
+      task_run_arg :additional_context # additional context to be provided to the judge
+
+      def default_llm_model_key
+        Raif.config.evals_default_llm_judge_model_key || super
+      end
+
+      def judgment_reasoning
+        parsed_response["reasoning"] if completed?
+      end
+
+      def judgment_confidence
+        parsed_response["confidence"] if completed?
+      end
+
+      def low_confidence?
+        judgment_confidence && judgment_confidence < 0.5
+      end
+    end
+  end
+end
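The concrete judges that follow all extend this base class the same way: declare their inputs with `task_run_arg`, define a `json_response_schema`, and build the prompts. A hypothetical custom judge (sketch only; the class, argument, and response-field names below are invented for illustration) would plausibly look like:

```ruby
# Hypothetical custom judge following the pattern of the built-in judges below.
# Inheriting from Raif::Evals::LlmJudge provides content_to_judge, additional_context,
# temperature 0.0, JSON responses, and the judgment_reasoning/judgment_confidence readers.
module Raif
  module Evals
    module LlmJudges
      class ToneCheck < Raif::Evals::LlmJudge
        task_run_arg :expected_tone # e.g. "friendly" or "formal" -- illustrative argument

        json_response_schema do
          boolean :matches_tone, description: "Whether the content matches the expected tone"
          string :reasoning, description: "Detailed explanation of the judgment"
          number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
        end

        def build_system_prompt
          "You are an expert evaluator assessing whether content matches a requested tone."
        end

        def build_prompt
          <<~PROMPT
            Expected tone: #{expected_tone}

            Content to evaluate:
            #{content_to_judge}
          PROMPT
        end

        def matches_tone?
          parsed_response["matches_tone"] if completed?
        end
      end
    end
  end
end
```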
data/lib/raif/evals/llm_judges/binary.rb
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    module LlmJudges
+      class Binary < Raif::Evals::LlmJudge
+        task_run_arg :criteria
+        task_run_arg :examples
+        task_run_arg :strict_mode
+
+        json_response_schema do
+          boolean :passes, description: "Whether the content passes the criteria"
+          string :reasoning, description: "Detailed explanation of the judgment"
+          number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+        end
+
+        def build_system_prompt
+          <<~PROMPT.strip
+            You are an expert evaluator assessing whether content meets specific criteria.
+            Your task is to make binary pass/fail judgments with clear reasoning.
+
+            First, provide detailed reasoning/explanation of your evaluation. Then, provide a precise pass/fail judgment.
+
+            Respond with JSON matching this schema:
+            {
+              "passes": boolean,
+              "reasoning": "detailed explanation",
+              "confidence": 0.0-1.0
+            }
+          PROMPT
+        end
+
+        def build_prompt
+          prompt = <<~PROMPT
+            Evaluation criteria: #{criteria}
+
+            #{strict_mode ? "Apply the criteria strictly without any leniency." : "Apply reasonable judgment while adhering to the criteria."}
+          PROMPT
+
+          if examples.present?
+            prompt += "\nHere are examples of how to evaluate:"
+            examples.each do |example|
+              prompt += format_example(example)
+            end
+          end
+
+          prompt += additional_context_prompt if additional_context.present?
+
+          prompt += <<~PROMPT.rstrip
+
+            Now evaluate this content:
+            #{content_to_judge}
+
+            Does this content meet the evaluation criteria?
+          PROMPT
+
+          prompt
+        end
+
+        # Judgment accessor methods
+        def passes?
+          parsed_response["passes"] if completed?
+        end
+
+        private
+
+        def additional_context_prompt
+          <<~PROMPT
+
+            Additional context:
+            #{additional_context}
+          PROMPT
+        end
+
+        def format_example(example)
+          if example.key?(:output)
+            content_label = "Output"
+            content_value = example[:output]
+          else
+            content_label = "Content"
+            content_value = example[:content]
+          end
+
+          <<~EXAMPLE
+
+            #{content_label}: #{content_value}
+            Reasoning: #{example[:reasoning]}
+            Judgment: #{example[:passes] ? "PASS" : "FAIL"}
+          EXAMPLE
+        end
+      end
+    end
+  end
+end
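Outside of `expect_llm_judge_passes`, the judge can presumably be run directly the same way the helper does above; a sketch (not part of the diff) with the `.run` keyword arguments mirroring that call and illustrative content and criteria strings:

```ruby
judge = Raif::Evals::LlmJudges::Binary.run(
  content_to_judge: "Hello, how can I help you today?",
  criteria: "Contains a proper greeting",
  examples: [],                # optional few-shot examples, as in the docs above
  strict_mode: false,
  llm_model_key: nil,          # nil falls back to evals_default_llm_judge_model_key / default LLM
  additional_context: nil
)

if judge.completed?
  judge.passes?              # => true or false, from the JSON response
  judge.judgment_reasoning   # detailed explanation
  judge.judgment_confidence  # 0.0..1.0
  judge.low_confidence?      # true when confidence < 0.5
end
```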
data/lib/raif/evals/llm_judges/comparative.rb
@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    module LlmJudges
+      class Comparative < Raif::Evals::LlmJudge
+        task_run_arg :over_content # the content to compare against
+        task_run_arg :comparison_criteria # the criteria to use when comparing content_to_judge to over_content
+        task_run_arg :allow_ties # whether to allow ties in the comparison
+
+        attr_accessor :content_a, :content_b, :expected_winner
+
+        before_create do
+          self.expected_winner = ["A", "B"].sample
+
+          if expected_winner == "A"
+            self.content_a = content_to_judge
+            self.content_b = over_content
+          else
+            self.content_a = over_content
+            self.content_b = content_to_judge
+          end
+        end
+
+        json_response_schema do
+          string :winner, description: "Which content is better (A, B, or tie)", enum: ["A", "B", "tie"]
+          string :reasoning, description: "Detailed explanation of the judgment"
+          number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+        end
+
+        def build_system_prompt
+          <<~PROMPT.strip
+            You are an expert evaluator comparing two pieces of content to determine which better meets specified criteria.
+
+            #{allow_ties ? "You may declare a tie if both pieces of content are equally good." : "You must choose a winner even if the difference is minimal."}
+
+            First, provide detailed reasoning for your choice. Then, provide a precise winner #{allow_ties ? "(A, B, or tie)" : "(A or B)"}.
+
+            Respond with JSON matching the required schema.
+          PROMPT
+        end
+
+        def build_prompt
+          <<~PROMPT.strip
+            Comparison criteria: #{comparison_criteria}
+            #{additional_context_prompt}
+            Compare the following two pieces of content:
+
+            CONTENT A:
+            #{content_a}
+
+            CONTENT B:
+            #{content_b}
+
+            Which content better meets the comparison criteria?
+          PROMPT
+        end
+
+        def winner
+          parsed_response["winner"] if completed?
+        end
+
+        def tie?
+          return unless completed? # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+
+          parsed_response["winner"] == "tie"
+        end
+
+        def correct_expected_winner?
+          return unless completed? # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+
+          parsed_response["winner"] == expected_winner
+        end
+
+        private
+
+        def additional_context_prompt
+          return if additional_context.blank?
+
+          <<~PROMPT
+
+            Additional context:
+            #{additional_context}
+          PROMPT
+        end
+      end
+    end
+  end
+end
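Because the `before_create` block randomizes which input is shown as CONTENT A, callers check `correct_expected_winner?` instead of comparing the winner to a fixed label. A direct-invocation sketch (not part of the diff; the variables are hypothetical and the keyword arguments mirror the `expect_llm_judge_prefers` helper):

```ruby
judge = Raif::Evals::LlmJudges::Comparative.run(
  content_to_judge: new_response,    # the content we expect to win (hypothetical variable)
  over_content: baseline_response,   # the comparison baseline (hypothetical variable)
  comparison_criteria: "More comprehensive and accurate response",
  allow_ties: true,
  llm_model_key: nil,
  additional_context: nil
)

judge.expected_winner            # "A" or "B" -- the randomized slot content_to_judge landed in
judge.winner                     # the judge's pick: "A", "B", or "tie"
judge.tie?                       # true when the judge declared a tie
judge.correct_expected_winner?   # true when the judge picked content_to_judge's slot
```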
data/lib/raif/evals/llm_judges/scored.rb
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    module LlmJudges
+      class Scored < Raif::Evals::LlmJudge
+        task_run_arg :scoring_rubric # the scoring rubric to use when evaluating the content
+
+        json_response_schema do
+          number :score, description: "Numerical score based on the rubric"
+          string :reasoning, description: "Detailed explanation of the score"
+          number :confidence, description: "Confidence level from 0.0 to 1.0", minimum: 0, maximum: 1
+        end
+
+        def build_system_prompt
+          <<~PROMPT.strip
+            You are an expert evaluator providing numerical scores based on a detailed rubric.
+
+            First, provide detailed reasoning/explanation of your evaluation. Then, provide a precise score according to the provided rubric.
+
+            Respond with JSON matching this schema:
+            {
+              "score": number,
+              "reasoning": "detailed explanation",
+              "confidence": 0.0-1.0
+            }
+          PROMPT
+        end
+
+        def build_prompt
+          <<~PROMPT.strip
+            Scoring rubric:
+            #{format_rubric(scoring_rubric)}
+            #{additional_context_prompt}
+            Evaluate the following content according to the scoring rubric:
+            #{content_to_judge}
+
+            Provide your score and detailed reasoning.
+          PROMPT
+        end
+
+        def judgment_score
+          parsed_response["score"] if completed?
+        end
+
+        private
+
+        def additional_context_prompt
+          return if additional_context.blank?
+
+          <<~PROMPT
+            \nAdditional context:
+            #{additional_context}
+          PROMPT
+        end
+
+        def format_rubric(rubric)
+          rubric.is_a?(ScoringRubric) ? rubric.to_prompt : rubric.to_s
+        end
+      end
+    end
+  end
+end
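Finally, a sketch (not part of the diff) combining the Scored judge with a `ScoringRubric`: the constructor shape is taken from the `@example` docs in llm_judge_expectations.rb, `to_prompt` is what `format_rubric` calls above, and the rubric levels and variables here are illustrative:

```ruby
rubric = Raif::Evals::ScoringRubric.new(
  name: :clarity,
  description: "Evaluates how clearly the content communicates its point",
  levels: [
    { score_range: (4..5), description: "Clear and well organized" },
    { score_range: (2..3), description: "Understandable but uneven" },
    { score_range: (0..1), description: "Confusing or incoherent" }
  ]
)

judge = Raif::Evals::LlmJudges::Scored.run(
  content_to_judge: summary_text,   # hypothetical variable
  scoring_rubric: rubric,           # a plain String rubric also works; format_rubric calls to_s on it
  llm_model_key: nil,
  additional_context: nil
)

judge.judgment_score       # numeric score from the JSON response
judge.judgment_reasoning   # detailed explanation
judge.judgment_confidence  # 0.0..1.0
```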