raif 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. checksums.yaml +4 -4
  2. data/README.md +29 -935
  3. data/app/assets/builds/raif_admin.css +5 -1
  4. data/app/assets/images/raif-logo-white.svg +8 -0
  5. data/app/assets/stylesheets/raif_admin.scss +4 -0
  6. data/app/jobs/raif/conversation_entry_job.rb +1 -1
  7. data/app/models/raif/agents/re_act_step.rb +1 -2
  8. data/app/models/raif/concerns/has_llm.rb +1 -1
  9. data/app/models/raif/concerns/task_run_args.rb +62 -0
  10. data/app/models/raif/conversation.rb +8 -0
  11. data/app/models/raif/conversation_entry.rb +6 -9
  12. data/app/models/raif/llm.rb +1 -1
  13. data/app/models/raif/llms/open_router.rb +47 -4
  14. data/app/models/raif/task.rb +22 -9
  15. data/app/views/layouts/raif/admin.html.erb +3 -1
  16. data/app/views/raif/conversation_entries/_form.html.erb +1 -1
  17. data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
  18. data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
  19. data/config/locales/en.yml +8 -0
  20. data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
  21. data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
  22. data/exe/raif +7 -0
  23. data/lib/generators/raif/agent/agent_generator.rb +22 -7
  24. data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
  25. data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
  26. data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
  27. data/lib/generators/raif/base_generator.rb +19 -0
  28. data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
  29. data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
  30. data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
  31. data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
  32. data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
  33. data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
  34. data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
  35. data/lib/generators/raif/install/install_generator.rb +15 -0
  36. data/lib/generators/raif/install/templates/initializer.rb +14 -3
  37. data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -2
  38. data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -76
  39. data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +10 -0
  40. data/lib/generators/raif/task/task_generator.rb +22 -3
  41. data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
  42. data/lib/generators/raif/task/templates/task.rb.tt +55 -59
  43. data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
  44. data/lib/raif/cli/base.rb +39 -0
  45. data/lib/raif/cli/evals.rb +47 -0
  46. data/lib/raif/cli/evals_setup.rb +27 -0
  47. data/lib/raif/cli.rb +67 -0
  48. data/lib/raif/configuration.rb +23 -9
  49. data/lib/raif/engine.rb +2 -1
  50. data/lib/raif/evals/eval.rb +30 -0
  51. data/lib/raif/evals/eval_set.rb +111 -0
  52. data/lib/raif/evals/eval_sets/expectations.rb +53 -0
  53. data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
  54. data/lib/raif/evals/expectation_result.rb +39 -0
  55. data/lib/raif/evals/llm_judge.rb +32 -0
  56. data/lib/raif/evals/llm_judges/binary.rb +94 -0
  57. data/lib/raif/evals/llm_judges/comparative.rb +89 -0
  58. data/lib/raif/evals/llm_judges/scored.rb +63 -0
  59. data/lib/raif/evals/llm_judges/summarization.rb +166 -0
  60. data/lib/raif/evals/run.rb +201 -0
  61. data/lib/raif/evals/scoring_rubric.rb +174 -0
  62. data/lib/raif/evals.rb +26 -0
  63. data/lib/raif/llm_registry.rb +33 -0
  64. data/lib/raif/migration_checker.rb +3 -3
  65. data/lib/raif/utils/colors.rb +23 -0
  66. data/lib/raif/utils.rb +1 -0
  67. data/lib/raif/version.rb +1 -1
  68. data/lib/raif.rb +4 -0
  69. data/spec/support/current_temperature_test_tool.rb +34 -0
  70. data/spec/support/test_conversation.rb +1 -1
  71. metadata +37 -3
data/lib/raif/configuration.rb CHANGED
@@ -18,6 +18,8 @@ module Raif
     :current_user_method,
     :default_embedding_model_key,
     :default_llm_model_key,
+    :evals_default_llm_judge_model_key,
+    :evals_verbose_output,
     :llm_api_requests_enabled,
     :llm_request_max_retries,
     :llm_request_retriable_exceptions,
@@ -30,6 +32,7 @@ module Raif
     :open_router_app_name,
     :open_router_site_url,
     :streaming_update_chunk_size_threshold,
+    :task_creator_optional,
     :task_system_prompt_intro,
     :user_tool_types

@@ -40,9 +43,8 @@ module Raif
   alias_method :aws_bedrock_titan_embedding_models_enabled=, :bedrock_embedding_models_enabled=

   def initialize
-    # Set default config
     @agent_types = Set.new(["Raif::Agents::ReActAgent", "Raif::Agents::NativeToolCallingAgent"])
-    @anthropic_api_key = ENV["ANTHROPIC_API_KEY"]
+    @anthropic_api_key = default_disable_llm_api_requests? ? "placeholder-anthropic-api-key" : ENV["ANTHROPIC_API_KEY"]
     @bedrock_models_enabled = false
     @anthropic_models_enabled = ENV["ANTHROPIC_API_KEY"].present?
     @authorize_admin_controller_action = ->{ false }
@@ -57,8 +59,10 @@ module Raif
     @conversations_controller = "Raif::ConversationsController"
     @current_user_method = :current_user
     @default_embedding_model_key = "open_ai_text_embedding_3_small"
-    @default_llm_model_key = "open_ai_gpt_4o"
-    @llm_api_requests_enabled = true
+    @default_llm_model_key = default_disable_llm_api_requests? ? :raif_test_llm : (ENV["RAIF_DEFAULT_LLM_MODEL_KEY"].presence || "open_ai_gpt_4o")
+    @evals_default_llm_judge_model_key = ENV["RAIF_EVALS_DEFAULT_LLM_JUDGE_MODEL_KEY"].presence
+    @evals_verbose_output = false
+    @llm_api_requests_enabled = !default_disable_llm_api_requests?
     @llm_request_max_retries = 2
     @llm_request_retriable_exceptions = [
       Faraday::ConnectionFailed,
@@ -66,14 +70,16 @@ module Raif
       Faraday::ServerError,
     ]
     @model_superclass = "ApplicationRecord"
-    @open_ai_api_key = ENV["OPENAI_API_KEY"]
+    @open_ai_api_key = default_disable_llm_api_requests? ? "placeholder-open-ai-api-key" : ENV["OPENAI_API_KEY"]
     @open_ai_embedding_models_enabled = ENV["OPENAI_API_KEY"].present?
     @open_ai_models_enabled = ENV["OPENAI_API_KEY"].present?
-    @open_router_api_key = ENV["OPENROUTER_API_KEY"]
-    @open_router_models_enabled = ENV["OPENROUTER_API_KEY"].present?
+    open_router_api_key = ENV["OPEN_ROUTER_API_KEY"].presence || ENV["OPENROUTER_API_KEY"]
+    @open_router_api_key = default_disable_llm_api_requests? ? "placeholder-open-router-api-key" : open_router_api_key
+    @open_router_models_enabled = @open_router_api_key.present?
     @open_router_app_name = nil
     @open_router_site_url = nil
     @streaming_update_chunk_size_threshold = 25
+    @task_creator_optional = true
     @user_tool_types = []
   end

@@ -82,7 +88,7 @@ module Raif
     puts <<~EOS

       !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-      No LLMs are enabled in Raif. Make sure you have an API key configured for at least one LLM provider. You can do this by setting an API key in your environment variables or in config/initializers/raif.rb (e.g. ENV["OPENAI_API_KEY"], ENV["ANTHROPIC_API_KEY"], ENV["OPENROUTER_API_KEY"]).
+      No LLMs are enabled in Raif. Make sure you have an API key configured for at least one LLM provider. You can do this by setting an API key in your environment variables or in config/initializers/raif.rb (e.g. ENV["OPENAI_API_KEY"], ENV["ANTHROPIC_API_KEY"], ENV["OPEN_ROUTER_API_KEY"]).

       See the README for more information: https://github.com/CultivateLabs/raif#setup
       !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -133,9 +139,17 @@ module Raif

     if open_router_models_enabled && open_router_api_key.blank?
       raise Raif::Errors::InvalidConfigError,
-        "Raif.config.open_router_api_key is required when Raif.config.open_router_models_enabled is true. Set it via Raif.config.open_router_api_key or ENV['OPENROUTER_API_KEY']" # rubocop:disable Layout/LineLength
+        "Raif.config.open_router_api_key is required when Raif.config.open_router_models_enabled is true. Set it via Raif.config.open_router_api_key or ENV['OPEN_ROUTER_API_KEY']" # rubocop:disable Layout/LineLength
     end
   end

+  private
+
+  # By default, evals run in the test environment, but need real API keys.
+  # In normal tests, we insert placeholders to make it hard to accidentally rack up an LLM API bill.
+  def default_disable_llm_api_requests?
+    Rails.env.test? && !Raif.running_evals?
+  end
+
  end
end
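
The net effect of the configuration changes: two new eval settings (evals_default_llm_judge_model_key, evals_verbose_output), an opt-out for requiring a task creator (task_creator_optional), support for the OPEN_ROUTER_API_KEY env var spelling, and placeholder API keys in the test environment so stray test runs cannot hit paid APIs. A minimal initializer sketch for the new options (assuming the standard Raif.configure block from the install generator; the model key value is illustrative):

    # config/initializers/raif.rb -- illustrative sketch, not the shipped template
    Raif.configure do |config|
      # Judge model for eval-set LLM judges; falls back to the default LLM when unset.
      # Can also come from ENV["RAIF_EVALS_DEFAULT_LLM_JUDGE_MODEL_KEY"].
      config.evals_default_llm_judge_model_key = "open_ai_gpt_4o"

      # Print judge reasoning and expectation metadata while evals run (default: false)
      config.evals_verbose_output = true

      # New in 1.3.0: Raif::Task records no longer require a creator by default
      config.task_creator_optional = true
    end
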
data/lib/raif/engine.rb CHANGED
@@ -120,7 +120,8 @@ module Raif
     Rails.application.config.assets.precompile += [
       "raif.js",
       "raif.css",
-      "raif_admin.css"
+      "raif_admin.css",
+      "raif-logo-white.svg"
     ]
   end
 end
data/lib/raif/evals/eval.rb ADDED
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     class Eval
+       attr_reader :description, :expectation_results
+
+       def initialize(description:)
+         @description = description
+         @expectation_results = []
+       end
+
+       def add_expectation_result(result)
+         @expectation_results << result
+       end
+
+       def passed?
+         expectation_results.all?(&:passed?)
+       end
+
+       def to_h
+         {
+           description: description,
+           passed: passed?,
+           expectation_results: expectation_results.map(&:to_h)
+         }
+       end
+     end
+   end
+ end
data/lib/raif/evals/eval_set.rb ADDED
@@ -0,0 +1,111 @@
+ # frozen_string_literal: true
+
+ require "raif/evals/eval_sets/expectations"
+ require "raif/evals/eval_sets/llm_judge_expectations"
+
+ module Raif
+   module Evals
+     class EvalSet
+       include Raif::Evals::EvalSets::Expectations
+       include Raif::Evals::EvalSets::LlmJudgeExpectations
+
+       attr_reader :current_eval, :output, :results
+
+       def initialize(output: $stdout)
+         @output = output
+       end
+
+       class << self
+         attr_reader :setup_block
+         attr_reader :teardown_block
+
+         def inherited(subclass)
+           subclass.instance_variable_set(:@evals, [])
+           super
+         end
+
+         def evals
+           @evals ||= []
+         end
+
+         def eval(description, &block)
+           evals << { description: description, block: block, definition_line_number: caller_locations(1, 1).first.lineno }
+         end
+
+         def setup(&block)
+           @setup_block = block
+         end
+
+         def teardown(&block)
+           @teardown_block = block
+         end
+
+         def run(output: $stdout)
+           new(output: output).run
+         end
+       end
+
+       def run
+         @results = []
+
+         self.class.evals.each do |eval_definition|
+           @results << run_eval(eval_definition)
+         end
+
+         @results
+       end
+
+       def run_eval(eval_definition)
+         @current_eval = Eval.new(description: eval_definition[:description])
+
+         output.puts "Running: #{eval_definition[:description]}"
+
+         ActiveRecord::Base.transaction do
+           instance_eval(&self.class.setup_block) if self.class.setup_block
+
+           begin
+             instance_eval(&eval_definition[:block])
+           rescue => e
+             output.puts Raif::Utils::Colors.red(" Error in eval block: #{e.message}")
+             output.puts Raif::Utils::Colors.red(" #{e.backtrace.join("\n ")}")
+             @current_eval.add_expectation_result(
+               ExpectationResult.new(
+                 description: "Eval block execution",
+                 status: :error,
+                 error: e
+               )
+             )
+           ensure
+             instance_eval(&self.class.teardown_block) if self.class.teardown_block
+           end
+
+           raise ActiveRecord::Rollback
+         end
+
+         @current_eval
+       end
+
+       def file(filename)
+         # Validate filename to prevent directory traversal
+         raise ArgumentError, "Invalid filename: cannot be empty" if filename.nil? || filename.empty?
+         raise ArgumentError, "Invalid filename: cannot contain '..' or absolute paths" if filename.include?("..") || filename.start_with?("/")

+         # Ensure we're only accessing files within the raif_evals/files directory
+         base_path = Rails.root.join("raif_evals", "files")
+         full_path = base_path.join(filename)
+
+         # Verify the resolved path is within the expected directory
+         unless full_path.to_s.start_with?(base_path.to_s)
+           raise ArgumentError, "Invalid filename: path traversal detected"
+         end
+
+         if full_path.exist?
+           full_path.read
+         else
+           raise ArgumentError, "File #{filename} does not exist in raif_evals/files/"
+         end
+       end
+
+     end
+   end
+ end
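
EvalSet is the DSL core of the new evals framework: eval registers a named block, setup/teardown wrap each one, every block runs inside a transaction that is rolled back, and file safely reads fixtures from raif_evals/files/. A hypothetical subclass to show the shape (the task class and fixture file are assumptions; the generator templates listed above scaffold similar files):

    # raif_evals/eval_sets/document_summarization_eval_set.rb -- hypothetical example
    class DocumentSummarizationEvalSet < Raif::Evals::EvalSet
      setup { @document = file("sample_document.txt") } # reads raif_evals/files/sample_document.txt

      eval "summarizes a short document" do
        # Raif::Tasks::DocumentSummarization is an assumed app-defined task
        task = Raif::Tasks::DocumentSummarization.run(document: @document)

        expect "task completes" do
          task.completed?
        end

        expect "summary is non-trivial", result_metadata: { length: task.parsed_response.to_s.length } do
          task.parsed_response.to_s.length > 100
        end
      end
    end

Because run_eval raises ActiveRecord::Rollback at the end of the transaction, any records an eval block creates are discarded before the next eval runs.
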
data/lib/raif/evals/eval_sets/expectations.rb ADDED
@@ -0,0 +1,53 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module EvalSets
+       module Expectations
+
+         def expect(description, result_metadata: nil, &block)
+           result = begin
+             if block.call
+               output.puts Raif::Utils::Colors.green(" ✓ #{description}")
+               output.puts Raif::Utils::Colors.green(" ⎿ #{result_metadata.inspect}") if result_metadata && Raif.config.evals_verbose_output
+               ExpectationResult.new(description: description, status: :passed, metadata: result_metadata)
+             else
+               output.puts Raif::Utils::Colors.red(" ✗ #{description}")
+               output.puts Raif::Utils::Colors.red(" ⎿ #{result_metadata.inspect}") if result_metadata && Raif.config.evals_verbose_output
+               ExpectationResult.new(description: description, status: :failed, metadata: result_metadata)
+             end
+           rescue => e
+             output.puts Raif::Utils::Colors.red(" ✗ #{description} (Error: #{e.message})")
+             ExpectationResult.new(description: description, status: :error, error: e, metadata: result_metadata)
+           end
+
+           current_eval.add_expectation_result(result)
+           result
+         end
+
+         def expect_tool_invocation(tool_invoker, tool_type, with: {})
+           invocations = tool_invoker.raif_model_tool_invocations.select { |inv| inv.tool_type == tool_type }
+           invoked_tools = tool_invoker.raif_model_tool_invocations.map{|inv| [inv.tool_type, inv.tool_arguments] }.to_h
+
+           if with.any?
+             invocations = invocations.select do |invocation|
+               with.all? { |key, value| invocation.tool_arguments[key.to_s] == value }
+             end
+           end
+
+           result_metadata = { invoked_tools: invoked_tools }
+           expect "invokes #{tool_type}#{with.any? ? " with #{with.to_json}" : ""}", result_metadata: result_metadata do
+             invocations.any?
+           end
+         end
+
+         def expect_no_tool_invocation(tool_invoker, tool_name)
+           expect "does not invoke #{tool_name}" do
+             tool_invoker.raif_model_tool_invocations.none? { |inv| inv.tool_name == tool_name }
+           end
+         end
+
+       end
+     end
+   end
+ end
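
For agent evals, the tool helpers inspect raif_model_tool_invocations on whatever object invoked the tools. A sketch of how they might be called (the agent constructor arguments and tool references are assumptions based on Raif's agent/tool patterns, not copied from this release):

    eval "agent uses the temperature tool" do
      # Assumed agent setup; check Raif's agent docs for the exact signature
      agent = Raif::Agents::ReActAgent.new(
        task: "What is the current temperature in Boston?",
        available_model_tools: [CurrentTemperatureTestTool],
        creator: User.first
      )
      agent.run!

      # Passes if any invocation of the tool type matches the given arguments
      expect_tool_invocation(agent, "CurrentTemperatureTestTool", with: { city: "Boston" })
      expect_no_tool_invocation(agent, "wikipedia_search")
    end

Note the asymmetry in the shipped helpers: expect_tool_invocation matches on tool_type, while expect_no_tool_invocation matches on tool_name.
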
data/lib/raif/evals/eval_sets/llm_judge_expectations.rb ADDED
@@ -0,0 +1,255 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     module EvalSets
+       module LlmJudgeExpectations
+
+         # Uses an LLM judge to evaluate whether content meets specific criteria with a binary pass/fail result.
+         #
+         # This method leverages the Binary LLM judge to assess content against provided criteria,
+         # returning a pass or fail judgment with reasoning and confidence scores.
+         #
+         # @param content [String] The content to be evaluated by the LLM judge
+         # @param criteria [String] The evaluation criteria that the content must meet
+         # @param examples [Array<Hash>] Optional examples showing how to evaluate similar content.
+         #   Each example should have keys: :content, :passes (boolean), :reasoning
+         # @param strict [Boolean] Whether to apply criteria strictly (true) or with reasonable judgment (false)
+         # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+         #   If nil, uses the configured default judge model or falls back to default LLM
+         # @param additional_context [String, nil] Optional additional context to be provided to the judge
+         #
+         # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+         #
+         # @example Basic usage
+         #   expect_llm_judge_passes(
+         #     task.parsed_response,
+         #     criteria: "Response is polite and professional"
+         #   )
+         #
+         # @example With examples and strict mode
+         #   expect_llm_judge_passes(
+         #     content,
+         #     criteria: "Contains a proper greeting",
+         #     examples: [
+         #       { content: "Hello, how can I help?", passes: true, reasoning: "Contains greeting" },
+         #       { content: "What do you want?", passes: false, reasoning: "No greeting, rude tone" }
+         #     ],
+         #     strict: true
+         #   )
+         #
+         # @note The judge result includes metadata accessible via expectation_result.metadata:
+         #   - :passes - Boolean result
+         #   - :reasoning - Detailed explanation
+         #   - :confidence - Confidence score (0.0-1.0)
+         def expect_llm_judge_passes(content, criteria:, examples: [], strict: false, llm_judge_model_key: nil, additional_context: nil,
+           result_metadata: {})
+           judge_task = LlmJudges::Binary.run(
+             content_to_judge: content,
+             criteria: criteria,
+             examples: examples,
+             strict_mode: strict,
+             llm_model_key: llm_judge_model_key,
+             additional_context: additional_context
+           )
+
+           if judge_task.low_confidence? && output.respond_to?(:puts)
+             output.puts Raif::Utils::Colors.yellow(" ⚠ Low confidence: #{judge_task.judgment_confidence}")
+           end
+
+           if Raif.config.evals_verbose_output && output.respond_to?(:puts)
+             output.puts " #{judge_task.judgment_reasoning}"
+           end
+
+           judge_metadata = {
+             passes: judge_task.passes?,
+             reasoning: judge_task.judgment_reasoning,
+             confidence: judge_task.judgment_confidence,
+           }.compact
+
+           # Merge user metadata with judge metadata
+           combined_metadata = result_metadata.merge(judge_metadata)
+
+           expectation_result = expect "LLM judge: #{criteria}", result_metadata: combined_metadata do
+             judge_task.passes?
+           end
+
+           if expectation_result && judge_task.errors.any?
+             expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+           end
+
+           expectation_result
+         end
+
+         # Uses an LLM judge to evaluate content with a numerical score based on a detailed rubric.
+         #
+         # This method leverages the Scored LLM judge to assess content against a scoring rubric,
+         # providing a numerical score with detailed reasoning and determining pass/fail based on
+         # the minimum passing score threshold.
+         #
+         # @param output [String] The content to be evaluated by the LLM judge
+         # @param scoring_rubric [ScoringRubric, String] The rubric to use for scoring. Can be a
+         #   ScoringRubric object with structured levels or a plain string description
+         # @param min_passing_score [Integer] Minimum score required to pass
+         # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+         #   If nil, uses the configured default judge model or falls back to default LLM
+         # @param additional_context [String, nil] Optional additional context to be provided to the judge
+         #
+         # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+         #
+         # @example Using a built-in rubric
+         #   expect_llm_judge_score(
+         #     task.parsed_response,
+         #     scoring_rubric: ScoringRubric.accuracy,
+         #     min_passing_score: 8
+         #   )
+         #
+         # @example Using a custom rubric
+         #   rubric = ScoringRubric.new(
+         #     name: :technical_writing,
+         #     description: "Evaluates technical writing quality",
+         #     levels: [
+         #       { score_range: (9..10), description: "Expert-level technical content" },
+         #       { score_range: (7..8), description: "Strong technical content" },
+         #       { score_range: (5..6), description: "Adequate technical content" },
+         #       { score_range: (3..4), description: "Weak technical content" },
+         #       { score_range: (0..2), description: "Poor technical content" }
+         #     ]
+         #   )
+         #   expect_llm_judge_score(output, scoring_rubric: rubric, min_passing_score: 7)
+         #
+         # @example Using a simple string rubric
+         #   expect_llm_judge_score(
+         #     output,
+         #     scoring_rubric: "Rate clarity from 0-5 where 5 is crystal clear",
+         #     min_passing_score: 4
+         #   )
+         #
+         # @note The judge result includes metadata accessible via expectation_result.metadata:
+         #   - :score - Numerical score given
+         #   - :reasoning - Detailed explanation
+         #   - :confidence - Confidence score (0.0-1.0)
+         def expect_llm_judge_score(output, scoring_rubric:, min_passing_score:, llm_judge_model_key: nil, additional_context: nil,
+           result_metadata: {})
+           scoring_rubric_obj = scoring_rubric
+
+           judge_task = LlmJudges::Scored.run(
+             content_to_judge: output,
+             scoring_rubric: scoring_rubric_obj,
+             llm_model_key: llm_judge_model_key,
+             additional_context: additional_context
+           )
+
+           rubric_name = scoring_rubric_obj.respond_to?(:name) ? scoring_rubric_obj.name : "custom"
+           if output.respond_to?(:puts)
+             output.puts " Score: #{judge_task.judgment_score}"
+             output.puts " #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+           end
+
+           judge_metadata = {
+             score: judge_task.judgment_score,
+             reasoning: judge_task.judgment_reasoning,
+             confidence: judge_task.judgment_confidence,
+           }.compact
+
+           # Merge user metadata with judge metadata
+           combined_metadata = result_metadata.merge(judge_metadata)
+
+           expectation_result = expect "LLM judge score (#{rubric_name}): >= #{min_passing_score}", result_metadata: combined_metadata do
+             judge_task.completed? && judge_task.judgment_score && judge_task.judgment_score >= min_passing_score
+           end
+
+           if expectation_result && judge_task.errors.any?
+             expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+           end
+
+           expectation_result
+         end
+
+         # Uses an LLM judge to compare two pieces of content and determine which better meets specified criteria.
+         #
+         # This method leverages the Comparative LLM judge to perform A/B testing between two pieces
+         # of content. Content placement is randomized to avoid position bias, and the judge determines
+         # which content better satisfies the comparison criteria.
+         #
+         # @param content_to_judge [String] The primary content being evaluated (will be randomly assigned to position A or B)
+         # @param over [String] The comparison content to evaluate against (will be randomly assigned to position A or B)
+         # @param criteria [String] The comparison criteria to use for evaluation
+         # @param allow_ties [Boolean] Whether the judge can declare a tie if both contents are equal (default: true)
+         # @param llm_judge_model_key [Symbol, nil] Optional specific LLM model to use for judging.
+         #   If nil, uses the configured default judge model or falls back to default LLM
+         # @param additional_context [String, nil] Optional additional context to help the judge
+         #
+         # @return [ExpectationResult] Result object containing pass/fail status and judge metadata
+         #
+         # @example Basic A/B comparison
+         #   expect_llm_judge_prefers(
+         #     new_response,
+         #     over: baseline_response,
+         #     criteria: "More comprehensive and accurate response"
+         #   )
+         #
+         # @example Model comparison with no ties allowed
+         #   expect_llm_judge_prefers(
+         #     claude_response,
+         #     over: gpt_response,
+         #     criteria: "Better follows the specific instructions given",
+         #     allow_ties: false
+         #   )
+         #
+         # @example With additional context
+         #   expect_llm_judge_prefers(
+         #     response_a,
+         #     over: response_b,
+         #     criteria: "More helpful for a beginner audience",
+         #     additional_context: "The user identified themselves as new to programming"
+         #   )
+         #
+         # @note The expectation passes if the judge correctly identifies the expected winner.
+         #   Due to randomization, content_to_judge may be assigned to either position A or B,
+         #   and the judge's choice is validated against the expected winner.
+         #
+         # @note The judge result includes metadata accessible via expectation_result.metadata:
+         #   - :winner - Which content won ("A", "B", or "tie")
+         #   - :reasoning - Detailed explanation of the choice
+         #   - :confidence - Confidence score (0.0-1.0)
+         def expect_llm_judge_prefers(content_to_judge, over:, criteria:, allow_ties: true, llm_judge_model_key: nil, additional_context: nil,
+           result_metadata: {})
+           judge_task = LlmJudges::Comparative.run(
+             content_to_judge: content_to_judge,
+             over_content: over,
+             comparison_criteria: criteria,
+             allow_ties: allow_ties,
+             llm_model_key: llm_judge_model_key,
+             additional_context: additional_context
+           )
+
+           if output.respond_to?(:puts)
+             output.puts " Winner: #{judge_task.winner}"
+             output.puts " #{judge_task.judgment_reasoning}" if Raif.config.evals_verbose_output
+           end
+
+           judge_metadata = {
+             winner: judge_task.winner,
+             reasoning: judge_task.judgment_reasoning,
+             confidence: judge_task.judgment_confidence,
+           }.compact
+
+           # Merge user metadata with judge metadata
+           combined_metadata = result_metadata.merge(judge_metadata)
+
+           expectation_result = expect "LLM judge prefers A over B: #{criteria}", result_metadata: combined_metadata do
+             judge_task.completed? && judge_task.correct_expected_winner?
+           end
+
+           if expectation_result && judge_task.errors.any?
+             expectation_result.error_message = judge_task.errors.full_messages.join(", ")
+           end
+
+           expectation_result
+         end
+
+       end
+     end
+   end
+ end
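
All three judge expectations follow the same flow: run a judge task, surface reasoning and confidence in the output, fold the judgment into the expectation metadata, and delegate pass/fail to expect. Combined in a single eval (the content variables are placeholders):

    eval "response quality holds up under LLM judges" do
      response = task.parsed_response        # placeholder content
      baseline = previous_release_response   # placeholder content

      expect_llm_judge_passes(response, criteria: "Directly answers the user's question")

      expect_llm_judge_score(
        response,
        scoring_rubric: "Rate clarity from 0-10 where 10 is crystal clear",
        min_passing_score: 7
      )

      expect_llm_judge_prefers(response, over: baseline, criteria: "More concise without losing accuracy")
    end
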
data/lib/raif/evals/expectation_result.rb ADDED
@@ -0,0 +1,39 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     class ExpectationResult
+       attr_reader :description, :status, :error
+       attr_accessor :metadata, :error_message
+
+       def initialize(description:, status:, error: nil, error_message: nil, metadata: nil)
+         @description = description
+         @status = status
+         @error = error
+         @error_message = error_message
+         @metadata = metadata
+       end
+
+       def passed?
+         @status == :passed
+       end
+
+       def failed?
+         @status == :failed
+       end
+
+       def error?
+         @status == :error
+       end
+
+       def to_h
+         {
+           description: description,
+           status: status,
+           error: error_message.presence || error&.message,
+           metadata: metadata
+         }.compact
+       end
+     end
+   end
+ end
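
Since to_h ends in .compact, nil fields drop out of serialized results:

    result = Raif::Evals::ExpectationResult.new(description: "completes", status: :passed)
    result.to_h
    # => { description: "completes", status: :passed }

    failed = Raif::Evals::ExpectationResult.new(
      description: "scores at least 7",
      status: :failed,
      metadata: { score: 5 }
    )
    failed.to_h
    # => { description: "scores at least 7", status: :failed, metadata: { score: 5 } }
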
data/lib/raif/evals/llm_judge.rb ADDED
@@ -0,0 +1,32 @@
+ # frozen_string_literal: true
+
+ module Raif
+   module Evals
+     class LlmJudge < Raif::Task
+       # Set default temperature for consistent judging
+       llm_temperature 0.0
+
+       # Default to JSON response format for structured output
+       llm_response_format :json
+
+       task_run_arg :content_to_judge # the content to judge
+       task_run_arg :additional_context # additional context to be provided to the judge
+
+       def default_llm_model_key
+         Raif.config.evals_default_llm_judge_model_key || super
+       end
+
+       def judgment_reasoning
+         parsed_response["reasoning"] if completed?
+       end
+
+       def judgment_confidence
+         parsed_response["confidence"] if completed?
+       end
+
+       def low_confidence?
+         judgment_confidence && judgment_confidence < 0.5
+       end
+     end
+   end
+ end
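
The concrete judges (files 56-59 above: Binary, Comparative, Scored, Summarization) subclass this base, declare their own task_run_args, and read their verdicts out of the JSON parsed_response. A hypothetical custom judge, assuming the usual Raif::Task prompt hook:

    # Hypothetical subclass; build_prompt is the assumed Raif::Task prompt hook
    class ToneJudge < Raif::Evals::LlmJudge
      task_run_arg :expected_tone

      def build_prompt
        <<~PROMPT
          Judge whether the content below matches a #{expected_tone} tone.
          Respond with JSON: {"passes": true|false, "reasoning": "...", "confidence": 0.0-1.0}

          Content:
          #{content_to_judge}
        PROMPT
      end

      def passes?
        completed? && parsed_response["passes"] == true
      end
    end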