raif 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +29 -935
- data/app/assets/builds/raif_admin.css +5 -1
- data/app/assets/images/raif-logo-white.svg +8 -0
- data/app/assets/stylesheets/raif_admin.scss +4 -0
- data/app/jobs/raif/conversation_entry_job.rb +1 -1
- data/app/models/raif/agents/re_act_step.rb +1 -2
- data/app/models/raif/concerns/has_llm.rb +1 -1
- data/app/models/raif/concerns/task_run_args.rb +62 -0
- data/app/models/raif/conversation.rb +8 -0
- data/app/models/raif/conversation_entry.rb +6 -9
- data/app/models/raif/llm.rb +1 -1
- data/app/models/raif/llms/open_router.rb +47 -4
- data/app/models/raif/task.rb +22 -9
- data/app/views/layouts/raif/admin.html.erb +3 -1
- data/app/views/raif/conversation_entries/_form.html.erb +1 -1
- data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
- data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
- data/config/locales/en.yml +8 -0
- data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
- data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
- data/exe/raif +7 -0
- data/lib/generators/raif/agent/agent_generator.rb +22 -7
- data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
- data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
- data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
- data/lib/generators/raif/base_generator.rb +19 -0
- data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
- data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
- data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
- data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
- data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
- data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
- data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
- data/lib/generators/raif/install/install_generator.rb +15 -0
- data/lib/generators/raif/install/templates/initializer.rb +14 -3
- data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -2
- data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -76
- data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +10 -0
- data/lib/generators/raif/task/task_generator.rb +22 -3
- data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
- data/lib/generators/raif/task/templates/task.rb.tt +55 -59
- data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
- data/lib/raif/cli/base.rb +39 -0
- data/lib/raif/cli/evals.rb +47 -0
- data/lib/raif/cli/evals_setup.rb +27 -0
- data/lib/raif/cli.rb +67 -0
- data/lib/raif/configuration.rb +23 -9
- data/lib/raif/engine.rb +2 -1
- data/lib/raif/evals/eval.rb +30 -0
- data/lib/raif/evals/eval_set.rb +111 -0
- data/lib/raif/evals/eval_sets/expectations.rb +53 -0
- data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
- data/lib/raif/evals/expectation_result.rb +39 -0
- data/lib/raif/evals/llm_judge.rb +32 -0
- data/lib/raif/evals/llm_judges/binary.rb +94 -0
- data/lib/raif/evals/llm_judges/comparative.rb +89 -0
- data/lib/raif/evals/llm_judges/scored.rb +63 -0
- data/lib/raif/evals/llm_judges/summarization.rb +166 -0
- data/lib/raif/evals/run.rb +201 -0
- data/lib/raif/evals/scoring_rubric.rb +174 -0
- data/lib/raif/evals.rb +26 -0
- data/lib/raif/llm_registry.rb +33 -0
- data/lib/raif/migration_checker.rb +3 -3
- data/lib/raif/utils/colors.rb +23 -0
- data/lib/raif/utils.rb +1 -0
- data/lib/raif/version.rb +1 -1
- data/lib/raif.rb +4 -0
- data/spec/support/current_temperature_test_tool.rb +34 -0
- data/spec/support/test_conversation.rb +1 -1
- metadata +37 -3
data/lib/raif/configuration.rb
CHANGED
@@ -18,6 +18,8 @@ module Raif
       :current_user_method,
       :default_embedding_model_key,
       :default_llm_model_key,
+      :evals_default_llm_judge_model_key,
+      :evals_verbose_output,
       :llm_api_requests_enabled,
       :llm_request_max_retries,
       :llm_request_retriable_exceptions,
@@ -30,6 +32,7 @@ module Raif
       :open_router_app_name,
       :open_router_site_url,
       :streaming_update_chunk_size_threshold,
+      :task_creator_optional,
       :task_system_prompt_intro,
       :user_tool_types
 
@@ -40,9 +43,8 @@ module Raif
     alias_method :aws_bedrock_titan_embedding_models_enabled=, :bedrock_embedding_models_enabled=
 
     def initialize
-      # Set default config
       @agent_types = Set.new(["Raif::Agents::ReActAgent", "Raif::Agents::NativeToolCallingAgent"])
-      @anthropic_api_key = ENV["ANTHROPIC_API_KEY"]
+      @anthropic_api_key = default_disable_llm_api_requests? ? "placeholder-anthropic-api-key" : ENV["ANTHROPIC_API_KEY"]
       @bedrock_models_enabled = false
       @anthropic_models_enabled = ENV["ANTHROPIC_API_KEY"].present?
       @authorize_admin_controller_action = ->{ false }
@@ -57,8 +59,10 @@ module Raif
       @conversations_controller = "Raif::ConversationsController"
       @current_user_method = :current_user
       @default_embedding_model_key = "open_ai_text_embedding_3_small"
-      @default_llm_model_key = "open_ai_gpt_4o"
-      @
+      @default_llm_model_key = default_disable_llm_api_requests? ? :raif_test_llm : (ENV["RAIF_DEFAULT_LLM_MODEL_KEY"].presence || "open_ai_gpt_4o")
+      @evals_default_llm_judge_model_key = ENV["RAIF_EVALS_DEFAULT_LLM_JUDGE_MODEL_KEY"].presence
+      @evals_verbose_output = false
+      @llm_api_requests_enabled = !default_disable_llm_api_requests?
       @llm_request_max_retries = 2
       @llm_request_retriable_exceptions = [
         Faraday::ConnectionFailed,
@@ -66,14 +70,16 @@ module Raif
         Faraday::ServerError,
       ]
       @model_superclass = "ApplicationRecord"
-      @open_ai_api_key = ENV["OPENAI_API_KEY"]
+      @open_ai_api_key = default_disable_llm_api_requests? ? "placeholder-open-ai-api-key" : ENV["OPENAI_API_KEY"]
       @open_ai_embedding_models_enabled = ENV["OPENAI_API_KEY"].present?
       @open_ai_models_enabled = ENV["OPENAI_API_KEY"].present?
-
-      @
+      open_router_api_key = ENV["OPEN_ROUTER_API_KEY"].presence || ENV["OPENROUTER_API_KEY"]
+      @open_router_api_key = default_disable_llm_api_requests? ? "placeholder-open-router-api-key" : open_router_api_key
+      @open_router_models_enabled = @open_router_api_key.present?
       @open_router_app_name = nil
       @open_router_site_url = nil
       @streaming_update_chunk_size_threshold = 25
+      @task_creator_optional = true
       @user_tool_types = []
     end
 
@@ -82,7 +88,7 @@ module Raif
       puts <<~EOS
 
         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        No LLMs are enabled in Raif. Make sure you have an API key configured for at least one LLM provider. You can do this by setting an API key in your environment variables or in config/initializers/raif.rb (e.g. ENV["OPENAI_API_KEY"], ENV["ANTHROPIC_API_KEY"], ENV["
+        No LLMs are enabled in Raif. Make sure you have an API key configured for at least one LLM provider. You can do this by setting an API key in your environment variables or in config/initializers/raif.rb (e.g. ENV["OPENAI_API_KEY"], ENV["ANTHROPIC_API_KEY"], ENV["OPEN_ROUTER_API_KEY"]).
 
         See the README for more information: https://github.com/CultivateLabs/raif#setup
         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -133,9 +139,17 @@ module Raif
 
       if open_router_models_enabled && open_router_api_key.blank?
         raise Raif::Errors::InvalidConfigError,
-          "Raif.config.open_router_api_key is required when Raif.config.open_router_models_enabled is true. Set it via Raif.config.open_router_api_key or ENV['
+          "Raif.config.open_router_api_key is required when Raif.config.open_router_models_enabled is true. Set it via Raif.config.open_router_api_key or ENV['OPEN_ROUTER_API_KEY']" # rubocop:disable Layout/LineLength
       end
     end
 
+    private
+
+    # By default, evals run in the test environment, but need real API keys.
+    # In normal tests, we insert placeholders to make it hard to accidentally rack up an LLM API bill.
+    def default_disable_llm_api_requests?
+      Rails.env.test? && !Raif.running_evals?
+    end
+
   end
 end
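The three new configuration options above (evals_default_llm_judge_model_key, evals_verbose_output, task_creator_optional) can also be set explicitly in the host app's initializer. A minimal sketch, assuming the usual Raif.configure block from the README; the judge model key shown is a placeholder, not a shipped default:

# config/initializers/raif.rb (sketch)
Raif.configure do |config|
  # Model used by the eval framework's LLM-judge expectations; falls back to the default LLM when unset
  config.evals_default_llm_judge_model_key = :open_ai_gpt_4o # placeholder key
  # Print judge reasoning and expectation metadata while evals run
  config.evals_verbose_output = true
  # Allow Raif tasks to be created without a creator record (defaults to true as of this release)
  config.task_creator_optional = true
end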
data/lib/raif/engine.rb
CHANGED
data/lib/raif/evals/eval.rb
ADDED
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    class Eval
+      attr_reader :description, :expectation_results
+
+      def initialize(description:)
+        @description = description
+        @expectation_results = []
+      end
+
+      def add_expectation_result(result)
+        @expectation_results << result
+      end
+
+      def passed?
+        expectation_results.all?(&:passed?)
+      end
+
+      def to_h
+        {
+          description: description,
+          passed: passed?,
+          expectation_results: expectation_results.map(&:to_h)
+        }
+      end
+    end
+  end
+end
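Eval is a plain value object that EvalSet#run_eval fills with ExpectationResult records (that class appears later in this diff). A small illustrative sketch of how the pieces compose outside the normal runner:

eval = Raif::Evals::Eval.new(description: "summarizer produces a title")
eval.add_expectation_result(
  Raif::Evals::ExpectationResult.new(description: "includes a title", status: :passed)
)
eval.passed? # => true, since every recorded expectation passed
eval.to_h    # => { description: "summarizer produces a title", passed: true, expectation_results: [...] }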
data/lib/raif/evals/eval_set.rb
ADDED
@@ -0,0 +1,111 @@
+# frozen_string_literal: true
+
+require "raif/evals/eval_sets/expectations"
+require "raif/evals/eval_sets/llm_judge_expectations"
+
+module Raif
+  module Evals
+    class EvalSet
+      include Raif::Evals::EvalSets::Expectations
+      include Raif::Evals::EvalSets::LlmJudgeExpectations
+
+      attr_reader :current_eval, :output, :results
+
+      def initialize(output: $stdout)
+        @output = output
+      end
+
+      class << self
+        attr_reader :setup_block
+        attr_reader :teardown_block
+
+        def inherited(subclass)
+          subclass.instance_variable_set(:@evals, [])
+          super
+        end
+
+        def evals
+          @evals ||= []
+        end
+
+        def eval(description, &block)
+          evals << { description: description, block: block, definition_line_number: caller_locations(1, 1).first.lineno }
+        end
+
+        def setup(&block)
+          @setup_block = block
+        end
+
+        def teardown(&block)
+          @teardown_block = block
+        end
+
+        def run(output: $stdout)
+          new(output: output).run
+        end
+      end
+
+      def run
+        @results = []
+
+        self.class.evals.each do |eval_definition|
+          @results << run_eval(eval_definition)
+        end
+
+        @results
+      end
+
+      def run_eval(eval_definition)
+        @current_eval = Eval.new(description: eval_definition[:description])
+
+        output.puts "Running: #{eval_definition[:description]}"
+
+        ActiveRecord::Base.transaction do
+          instance_eval(&self.class.setup_block) if self.class.setup_block
+
+          begin
+            instance_eval(&eval_definition[:block])
+          rescue => e
+            output.puts Raif::Utils::Colors.red("  Error in eval block: #{e.message}")
+            output.puts Raif::Utils::Colors.red("  #{e.backtrace.join("\n  ")}")
+            @current_eval.add_expectation_result(
+              ExpectationResult.new(
+                description: "Eval block execution",
+                status: :error,
+                error: e
+              )
+            )
+          ensure
+            instance_eval(&self.class.teardown_block) if self.class.teardown_block
+          end
+
+          raise ActiveRecord::Rollback
+        end
+
+        @current_eval
+      end
+
+      def file(filename)
+        # Validate filename to prevent directory traversal
+        raise ArgumentError, "Invalid filename: cannot be empty" if filename.nil? || filename.empty?
+        raise ArgumentError, "Invalid filename: cannot contain '..' or absolute paths" if filename.include?("..") || filename.start_with?("/")
+
+        # Ensure we're only accessing files within the raif_evals/files directory
+        base_path = Rails.root.join("raif_evals", "files")
+        full_path = base_path.join(filename)
+
+        # Verify the resolved path is within the expected directory
+        unless full_path.to_s.start_with?(base_path.to_s)
+          raise ArgumentError, "Invalid filename: path traversal detected"
+        end
+
+        if full_path.exist?
+          full_path.read
+        else
+          raise ArgumentError, "File #{filename} does not exist in raif_evals/files/"
+        end
+      end
+
+    end
+  end
+end
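The EvalSet DSL above (setup/teardown/eval at the class level, expect and file in the instance) is what the new eval_set generator templates scaffold. A hypothetical eval set in a host app; the class name, the file argument, and the task class are placeholders:

class DocumentSummaryEvalSet < Raif::Evals::EvalSet
  setup do
    # file(...) reads from raif_evals/files/ and rejects path traversal
    @document = file("sample_document.txt")
  end

  eval "summarization task completes" do
    task = DocumentSummarizationTask.run(document: @document) # hypothetical Raif::Task subclass

    expect "task completes successfully" do
      task.completed?
    end
  end
end

Each eval runs inside a database transaction that is rolled back afterwards, so records created in setup or in the eval block do not leak between evals.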
data/lib/raif/evals/eval_sets/expectations.rb
ADDED
@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    module EvalSets
+      module Expectations
+
+        def expect(description, result_metadata: nil, &block)
+          result = begin
+            if block.call
+              output.puts Raif::Utils::Colors.green("  ✓ #{description}")
+              output.puts Raif::Utils::Colors.green("    ⎿ #{result_metadata.inspect}") if result_metadata && Raif.config.evals_verbose_output
+              ExpectationResult.new(description: description, status: :passed, metadata: result_metadata)
+            else
+              output.puts Raif::Utils::Colors.red("  ✗ #{description}")
+              output.puts Raif::Utils::Colors.red("    ⎿ #{result_metadata.inspect}") if result_metadata && Raif.config.evals_verbose_output
+              ExpectationResult.new(description: description, status: :failed, metadata: result_metadata)
+            end
+          rescue => e
+            output.puts Raif::Utils::Colors.red("  ✗ #{description} (Error: #{e.message})")
+            ExpectationResult.new(description: description, status: :error, error: e, metadata: result_metadata)
+          end
+
+          current_eval.add_expectation_result(result)
+          result
+        end
+
+        def expect_tool_invocation(tool_invoker, tool_type, with: {})
+          invocations = tool_invoker.raif_model_tool_invocations.select { |inv| inv.tool_type == tool_type }
+          invoked_tools = tool_invoker.raif_model_tool_invocations.map{ |inv| [inv.tool_type, inv.tool_arguments] }.to_h
+
+          if with.any?
+            invocations = invocations.select do |invocation|
+              with.all? { |key, value| invocation.tool_arguments[key.to_s] == value }
+            end
+          end
+
+          result_metadata = { invoked_tools: invoked_tools }
+          expect "invokes #{tool_type}#{with.any? ? " with #{with.to_json}" : ""}", result_metadata: result_metadata do
+            invocations.any?
+          end
+        end
+
+        def expect_no_tool_invocation(tool_invoker, tool_name)
+          expect "does not invoke #{tool_name}" do
+            tool_invoker.raif_model_tool_invocations.none? { |inv| inv.tool_name == tool_name }
+          end
+        end
+
+      end
+    end
+  end
+end
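expect_tool_invocation and expect_no_tool_invocation wrap expect with a descriptive label and record which tools were actually invoked in the expectation metadata. Hypothetical usage inside an eval block, where entry is any record exposing raif_model_tool_invocations (for example a conversation entry) and the tool class names are placeholders:

expect_tool_invocation(entry, "Raif::ModelTools::WikipediaSearch", with: { query: "Dune" })
expect_no_tool_invocation(entry, "Raif::ModelTools::FetchUrl")

Note the two helpers filter differently: the positive check matches on tool_type (and, optionally, tool_arguments), while the negative check compares tool_name.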
data/lib/raif/evals/eval_sets/llm_judge_expectations.rb
ADDED
@@ -0,0 +1,255 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    module EvalSets
+      module LlmJudgeExpectations
+
+        # Uses an LLM judge to evaluate whether content meets specific criteria with a binary pass/fail result.
+        #
+        # This method leverages the Binary LLM judge to assess content against provided criteria,
+        # returning a pass or fail judgment with reasoning and confidence scores.
+        #
+        # @param content [String] The content to be evaluated by the LLM judge
+        # @param criteria [String] The evaluation criteria that the content must meet
+        # @param examples [Array<Hash>] Optional examples showing how to evaluate similar content.
+        #   Each example should have keys: :content, :passes (boolean), :reasoning
+        # @param strict [Boolean] Whether to apply criteria strictly (true) or with reasonable judgment (false)
+        # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+        #   If nil, uses the configured default judge model or falls back to default LLM
+        # @param additional_context [String, nil] Optional additional context to be provided to the judge
+        #
+        # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+        #
+        # @example Basic usage
+        #   expect_llm_judge_passes(
+        #     task.parsed_response,
+        #     criteria: "Response is polite and professional"
+        #   )
+        #
+        # @example With examples and strict mode
+        #   expect_llm_judge_passes(
+        #     content,
+        #     criteria: "Contains a proper greeting",
+        #     examples: [
+        #       { content: "Hello, how can I help?", passes: true, reasoning: "Contains greeting" },
+        #       { content: "What do you want?", passes: false, reasoning: "No greeting, rude tone" }
+        #     ],
+        #     strict: true
+        #   )
+        #
+        # @note The judge result includes metadata accessible via expectation_result.metadata:
+        #   - :passes - Boolean result
+        #   - :reasoning - Detailed explanation
+        #   - :confidence - Confidence score (0.0-1.0)
+        def expect_llm_judge_passes(content, criteria:, examples: [], strict: false, llm_judge_model_key: nil, additional_context: nil,
+          result_metadata: {})
+          judge_task = LlmJudges::Binary.run(
+            content_to_judge: content,
+            criteria: criteria,
+            examples: examples,
+            strict_mode: strict,
+            llm_model_key: llm_judge_model_key,
+            additional_context: additional_context
+          )
+
+          if judge_task.low_confidence? && output.respond_to?(:puts)
+            output.puts Raif::Utils::Colors.yellow("  ⚠ Low confidence: #{judge_task.judgment_confidence}")
+          end
+
+          if Raif.config.evals_verbose_output && output.respond_to?(:puts)
+            output.puts "  #{judge_task.judgment_reasoning}"
+          end
+
+          judge_metadata = {
+            passes: judge_task.passes?,
+            reasoning: judge_task.judgment_reasoning,
+            confidence: judge_task.judgment_confidence,
+          }.compact
+
+          # Merge user metadata with judge metadata
+          combined_metadata = result_metadata.merge(judge_metadata)
+
+          expectation_result = expect "LLM judge: #{criteria}", result_metadata: combined_metadata do
+            judge_task.passes?
+          end
+
+          if expectation_result && judge_task.errors.any?
+            expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+          end
+
+          expectation_result
+        end
+
+        # Uses an LLM judge to evaluate content with a numerical score based on a detailed rubric.
+        #
+        # This method leverages the Scored LLM judge to assess content against a scoring rubric,
+        # providing a numerical score with detailed reasoning and determining pass/fail based on
+        # the minimum passing score threshold.
+        #
+        # @param output [String] The content to be evaluated by the LLM judge
+        # @param scoring_rubric [ScoringRubric, String] The rubric to use for scoring. Can be a
+        #   ScoringRubric object with structured levels or a plain string description
+        # @param min_passing_score [Integer] Minimum score required to pass
+        # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+        #   If nil, uses the configured default judge model or falls back to default LLM
+        # @param additional_context [String, nil] Optional additional context to be provided to the judge
+        #
+        # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+        #
+        # @example Using a built-in rubric
+        #   expect_llm_judge_score(
+        #     task.parsed_response,
+        #     scoring_rubric: ScoringRubric.accuracy,
+        #     min_passing_score: 8
+        #   )
+        #
+        # @example Using a custom rubric
+        #   rubric = ScoringRubric.new(
+        #     name: :technical_writing,
+        #     description: "Evaluates technical writing quality",
+        #     levels: [
+        #       { score_range: (9..10), description: "Expert-level technical content" },
+        #       { score_range: (7..8), description: "Strong technical content" },
+        #       { score_range: (5..6), description: "Adequate technical content" },
+        #       { score_range: (3..4), description: "Weak technical content" },
+        #       { score_range: (0..2), description: "Poor technical content" }
+        #     ]
+        #   )
+        #   expect_llm_judge_score(output, scoring_rubric: rubric, min_passing_score: 7)
+        #
+        # @example Using a simple string rubric
+        #   expect_llm_judge_score(
+        #     output,
+        #     scoring_rubric: "Rate clarity from 0-5 where 5 is crystal clear",
+        #     min_passing_score: 4
+        #   )
+        #
+        # @note The judge result includes metadata accessible via expectation_result.metadata:
+        #   - :score - Numerical score given
+        #   - :reasoning - Detailed explanation
+        #   - :confidence - Confidence score (0.0-1.0)
+        def expect_llm_judge_score(output, scoring_rubric:, min_passing_score:, llm_judge_model_key: nil, additional_context: nil,
+          result_metadata: {})
+          scoring_rubric_obj = scoring_rubric
+
+          judge_task = LlmJudges::Scored.run(
+            content_to_judge: output,
+            scoring_rubric: scoring_rubric_obj,
+            llm_model_key: llm_judge_model_key,
+            additional_context: additional_context
+          )
+
+          rubric_name = scoring_rubric_obj.respond_to?(:name) ? scoring_rubric_obj.name : "custom"
+          if output.respond_to?(:puts)
+            output.puts "  Score: #{judge_task.judgment_score}"
+            output.puts "  #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+          end
+
+          judge_metadata = {
+            score: judge_task.judgment_score,
+            reasoning: judge_task.judgment_reasoning,
+            confidence: judge_task.judgment_confidence,
+          }.compact
+
+          # Merge user metadata with judge metadata
+          combined_metadata = result_metadata.merge(judge_metadata)
+
+          expectation_result = expect "LLM judge score (#{rubric_name}): >= #{min_passing_score}", result_metadata: combined_metadata do
+            judge_task.completed? && judge_task.judgment_score && judge_task.judgment_score >= min_passing_score
+          end
+
+          if expectation_result && judge_task.errors.any?
+            expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+          end
+
+          expectation_result
+        end
+
+        # Uses an LLM judge to compare two pieces of content and determine which better meets specified criteria.
+        #
+        # This method leverages the Comparative LLM judge to perform A/B testing between two pieces
+        # of content. Content placement is randomized to avoid position bias, and the judge determines
+        # which content better satisfies the comparison criteria.
+        #
+        # @param content_to_judge [String] The primary content being evaluated (will be randomly assigned to position A or B)
+        # @param over [String] The comparison content to evaluate against (will be randomly assigned to position A or B)
+        # @param criteria [String] The comparison criteria to use for evaluation
+        # @param allow_ties [Boolean] Whether the judge can declare a tie if both contents are equal (default: true)
+        # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+        #   If nil, uses the configured default judge model or falls back to default LLM
+        # @param additional_context [String, nil] Optional additional context to help the judge
+        #
+        # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+        #
+        # @example Basic A/B comparison
+        #   expect_llm_judge_prefers(
+        #     new_response,
+        #     over: baseline_response,
+        #     criteria: "More comprehensive and accurate response"
+        #   )
+        #
+        # @example Model comparison with no ties allowed
+        #   expect_llm_judge_prefers(
+        #     claude_response,
+        #     over: gpt_response,
+        #     criteria: "Better follows the specific instructions given",
+        #     allow_ties: false
+        #   )
+        #
+        # @example With additional context
+        #   expect_llm_judge_prefers(
+        #     response_a,
+        #     over: response_b,
+        #     criteria: "More helpful for a beginner audience",
+        #     additional_context: "The user identified themselves as new to programming"
+        #   )
+        #
+        # @note The expectation passes if the judge correctly identifies the expected winner.
+        #   Due to randomization, content_to_judge may be assigned to either position A or B,
+        #   and the judge's choice is validated against the expected winner.
+        #
+        # @note The judge result includes metadata accessible via expectation_result.metadata:
+        #   - :winner - Which content won ("A", "B", or "tie")
+        #   - :reasoning - Detailed explanation of the choice
+        #   - :confidence - Confidence score (0.0-1.0)
+        def expect_llm_judge_prefers(content_to_judge, over:, criteria:, allow_ties: true, llm_judge_model_key: nil, additional_context: nil,
+          result_metadata: {})
+          judge_task = LlmJudges::Comparative.run(
+            content_to_judge: content_to_judge,
+            over_content: over,
+            comparison_criteria: criteria,
+            allow_ties: allow_ties,
+            llm_model_key: llm_judge_model_key,
+            additional_context: additional_context
+          )
+
+          if output.respond_to?(:puts)
+            output.puts "  Winner: #{judge_task.winner}"
+            output.puts "  #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+          end
+
+          judge_metadata = {
+            winner: judge_task.winner,
+            reasoning: judge_task.judgment_reasoning,
+            confidence: judge_task.judgment_confidence,
+          }.compact
+
+          # Merge user metadata with judge metadata
+          combined_metadata = result_metadata.merge(judge_metadata)
+
+          expectation_result = expect "LLM judge prefers A over B: #{criteria}", result_metadata: combined_metadata do
+            judge_task.completed? && judge_task.correct_expected_winner?
+          end
+
+          if expectation_result && judge_task.errors.any?
+            expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+          end
+
+          expectation_result
+        end
+
+      end
+    end
+  end
+end
data/lib/raif/evals/expectation_result.rb
ADDED
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    class ExpectationResult
+      attr_reader :description, :status, :error
+      attr_accessor :metadata, :error_message
+
+      def initialize(description:, status:, error: nil, error_message: nil, metadata: nil)
+        @description = description
+        @status = status
+        @error = error
+        @error_message = error_message
+        @metadata = metadata
+      end
+
+      def passed?
+        @status == :passed
+      end
+
+      def failed?
+        @status == :failed
+      end
+
+      def error?
+        @status == :error
+      end
+
+      def to_h
+        {
+          description: description,
+          status: status,
+          error: error_message.presence || error&.message,
+          metadata: metadata
+        }.compact
+      end
+    end
+  end
+end
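Eval#to_h serializes its results through ExpectationResult#to_h; nil fields are dropped by compact. For example, based on the class above:

Raif::Evals::ExpectationResult.new(
  description: "invokes Raif::ModelTools::WikipediaSearch",
  status: :failed,
  metadata: { invoked_tools: {} }
).to_h
# => { description: "invokes Raif::ModelTools::WikipediaSearch", status: :failed, metadata: { invoked_tools: {} } }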
data/lib/raif/evals/llm_judge.rb
ADDED
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Raif
+  module Evals
+    class LlmJudge < Raif::Task
+      # Set default temperature for consistent judging
+      llm_temperature 0.0
+
+      # Default to JSON response format for structured output
+      llm_response_format :json
+
+      task_run_arg :content_to_judge # the content to judge
+      task_run_arg :additional_context # additional context to be provided to the judge
+
+      def default_llm_model_key
+        Raif.config.evals_default_llm_judge_model_key || super
+      end
+
+      def judgment_reasoning
+        parsed_response["reasoning"] if completed?
+      end
+
+      def judgment_confidence
+        parsed_response["confidence"] if completed?
+      end
+
+      def low_confidence?
+        judgment_confidence && judgment_confidence < 0.5
+      end
+    end
+  end
+end
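LlmJudge is the base Raif::Task for the bundled Binary, Scored, Comparative, and Summarization judges listed in the file inventory above. A hypothetical custom judge built on it; the build_prompt method and the "passes" JSON field are assumptions about how a host app might extend the base class, not code from this release:

module Raif
  module Evals
    module LlmJudges
      class ToneCheck < Raif::Evals::LlmJudge # hypothetical subclass
        task_run_arg :expected_tone

        def build_prompt
          <<~PROMPT
            Decide whether the content below matches a #{expected_tone} tone.
            Respond with JSON: {"reasoning": "...", "confidence": 0.0-1.0, "passes": true or false}

            Content:
            #{content_to_judge}
          PROMPT
        end

        # Mirrors how expect_llm_judge_passes reads the bundled Binary judge's result
        def passes?
          parsed_response["passes"] if completed?
        end
      end
    end
  end
end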