raif 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. checksums.yaml +4 -4
  2. data/app/jobs/raif/conversation_entry_job.rb +1 -1
  3. data/app/models/raif/agents/re_act_step.rb +1 -2
  4. data/app/models/raif/concerns/has_llm.rb +1 -1
  5. data/app/models/raif/concerns/task_run_args.rb +62 -0
  6. data/app/models/raif/conversation.rb +5 -1
  7. data/app/models/raif/conversation_entry.rb +6 -8
  8. data/app/models/raif/llm.rb +1 -1
  9. data/app/models/raif/llms/open_router.rb +3 -1
  10. data/app/models/raif/task.rb +22 -9
  11. data/app/views/raif/conversation_entries/_form.html.erb +1 -1
  12. data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
  13. data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
  14. data/config/locales/en.yml +8 -0
  15. data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
  16. data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
  17. data/exe/raif +7 -0
  18. data/lib/generators/raif/agent/agent_generator.rb +22 -7
  19. data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
  20. data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
  21. data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
  22. data/lib/generators/raif/base_generator.rb +19 -0
  23. data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
  24. data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
  25. data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
  26. data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
  27. data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
  28. data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
  29. data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
  30. data/lib/generators/raif/install/install_generator.rb +15 -0
  31. data/lib/generators/raif/install/templates/initializer.rb +11 -0
  32. data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -5
  33. data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -78
  34. data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +1 -1
  35. data/lib/generators/raif/task/task_generator.rb +22 -3
  36. data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
  37. data/lib/generators/raif/task/templates/task.rb.tt +55 -59
  38. data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
  39. data/lib/raif/cli/base.rb +39 -0
  40. data/lib/raif/cli/evals.rb +47 -0
  41. data/lib/raif/cli/evals_setup.rb +27 -0
  42. data/lib/raif/cli.rb +67 -0
  43. data/lib/raif/configuration.rb +20 -6
  44. data/lib/raif/evals/eval.rb +30 -0
  45. data/lib/raif/evals/eval_set.rb +111 -0
  46. data/lib/raif/evals/eval_sets/expectations.rb +53 -0
  47. data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
  48. data/lib/raif/evals/expectation_result.rb +39 -0
  49. data/lib/raif/evals/llm_judge.rb +32 -0
  50. data/lib/raif/evals/llm_judges/binary.rb +94 -0
  51. data/lib/raif/evals/llm_judges/comparative.rb +89 -0
  52. data/lib/raif/evals/llm_judges/scored.rb +63 -0
  53. data/lib/raif/evals/llm_judges/summarization.rb +166 -0
  54. data/lib/raif/evals/run.rb +201 -0
  55. data/lib/raif/evals/scoring_rubric.rb +174 -0
  56. data/lib/raif/evals.rb +26 -0
  57. data/lib/raif/llm_registry.rb +33 -0
  58. data/lib/raif/migration_checker.rb +3 -3
  59. data/lib/raif/utils/colors.rb +23 -0
  60. data/lib/raif/utils.rb +1 -0
  61. data/lib/raif/version.rb +1 -1
  62. data/lib/raif.rb +4 -0
  63. data/spec/support/current_temperature_test_tool.rb +34 -0
  64. data/spec/support/test_conversation.rb +1 -1
  65. metadata +35 -3
@@ -1,63 +1,59 @@
1
- # frozen_string_literal: true
2
-
3
- module Raif
4
- module Tasks
5
- class <%= task_class_name %> < Raif::ApplicationTask
6
- # Set the response format for the task. Options are :html, :text, or :json.
7
- llm_response_format :<%= options[:response_format] %>
8
-
9
- # Set the temperature for the task
10
- # llm_temperature 0.7
11
-
12
- # Optional: Set the allowed tags for the task. Only relevant if response_format is :html.
13
- # Defaults to Rails::HTML5::SafeListSanitizer.allowed_tags
14
- # llm_response_allowed_tags %w[p b i div strong]
15
-
16
- # Optional: Set the allowed attributes for the task. Only relevant if response_format is :html.
17
- # Defaults to Rails::HTML5::SafeListSanitizer.allowed_attributes
18
- # llm_response_allowed_attributes %w[style]
19
-
20
- # Define any attributes that are needed for the task.
21
- # You can then pass them when running the task and they will be available in build_prompt:
22
- # Raif::Tasks::<%= task_class_name %>.run(your_attribute: "some value")
23
- # attr_accessor :your_attribute
24
-
25
- <%- if options[:response_format] == "json" -%>
26
- # Define a JSON schema that the model's response should adhere to
1
+ <% raif_module_namespacing(["Tasks"]) do -%>
2
+ class <%= class_name.demodulize %> < Raif::ApplicationTask
3
+ # Set the response format for the task. Options are :html, :text, or :json.
4
+ llm_response_format :<%= options[:response_format] %>
5
+
6
+ # Set the temperature for the task
7
+ # llm_temperature 0.7
8
+
9
+ # Optional: Set the allowed tags for the task. Only relevant if response_format is :html.
10
+ # Defaults to Rails::HTML5::SafeListSanitizer.allowed_tags
11
+ # llm_response_allowed_tags %w[p b i div strong]
12
+
13
+ # Optional: Set the allowed attributes for the task. Only relevant if response_format is :html.
14
+ # Defaults to Rails::HTML5::SafeListSanitizer.allowed_attributes
15
+ # llm_response_allowed_attributes %w[style]
16
+
17
+ # Define any attributes that are needed for the task.
18
+ # You can then pass them when running the task and they will be available in build_prompt:
19
+ # Raif::Tasks::<%= class_name %>.run(your_attribute: "some value")
20
+ # task_run_arg :your_attribute
21
+ <%- if options[:response_format] == "json" -%>
22
+
23
+ # Define a JSON schema that the model's response should adhere to
24
+ #
25
+ # All attributes will be required and additionalProperties will be set to false.
26
+ json_response_schema do
27
+ # string :title, description: "The title of the operation", minLength: 3
27
28
  #
28
- # All attributes will be required and additionalProperties will be set to false.
29
- json_response_schema do
30
- # string :title, description: "The title of the operation", minLength: 3
31
- #
32
- # object :widget, description: "A widget's description" do
33
- # boolean :is_red, description: "Whether the widget is red"
34
- # integer :rating, description: "A rating of the widget from 1 to 10", minimum: 1, maximum: 10
35
- # array :tags, description: "Associated tags" do
36
- # items type: "string"
37
- # end
38
- # end
39
- #
40
- # array :products, description: "List of products" do
41
- # object do
42
- # integer :id, description: "Product identifier"
43
- # string :name, description: "Product name"
44
- # number :price, description: "Product price", minimum: 0
45
- # end
46
- # end
47
- end
48
- <%- end -%>
49
-
50
- def build_prompt
51
- # Implement the LLM prompt for this task.
52
- raise NotImplementedError, "Implement #build_prompt in #{self.class.name}"
53
- end
54
-
55
- # Optional: Override build_system_prompt if you need custom system instructions.
56
- # The default implementation, which you'll get if you call super, will use Raif.config.task_system_prompt_intro
57
- # and append the system_prompt_language_preference if the task's requested_language_key is set.
58
- # def build_system_prompt
59
- # super + "\nAdditional system instructions..."
29
+ # object :widget, description: "A widget's description" do
30
+ # boolean :is_red, description: "Whether the widget is red"
31
+ # integer :rating, description: "A rating of the widget from 1 to 10", minimum: 1, maximum: 10
32
+ # array :tags, description: "Associated tags" do
33
+ # items type: "string"
34
+ # end
35
+ # end
36
+ #
37
+ # array :products, description: "List of products" do
38
+ # object do
39
+ # integer :id, description: "Product identifier"
40
+ # string :name, description: "Product name"
41
+ # number :price, description: "Product price", minimum: 0
42
+ # end
60
43
  # end
61
44
  end
45
+ <%- end -%>
46
+
47
+ def build_prompt
48
+ # Implement the LLM prompt for this task.
49
+ raise NotImplementedError, "Implement #build_prompt in #{self.class.name}"
50
+ end
51
+
52
+ # Optional: Override build_system_prompt if you need custom system instructions.
53
+ # The default implementation, which you'll get if you call super, will use Raif.config.task_system_prompt_intro
54
+ # and append the system_prompt_language_preference if the task's requested_language_key is set.
55
+ # def build_system_prompt
56
+ # super + "\nAdditional system instructions..."
57
+ # end
62
58
  end
63
- end
59
+ <% end -%>
@@ -0,0 +1,54 @@
1
+ <% raif_module_namespacing(["Evals", "Tasks"]) do -%>
2
+ class <%= class_name.demodulize %>EvalSet < Raif::Evals::EvalSet
3
+ # Run this eval set with:
4
+ # bundle exec raif evals ./<%= eval_set_file_path %>
5
+
6
+ # Setup method runs before each eval
7
+ setup do
8
+ # Common setup code
9
+ end
10
+
11
+ # Teardown runs after each eval
12
+ teardown do
13
+ # Cleanup code
14
+ end
15
+
16
+ eval "<%= class_name %> produces expected output" do
17
+ # task = Raif::Tasks::<%= class_name %>.run(
18
+ # Add your task parameters here that produce the expected output
19
+ # )
20
+
21
+ # The return value of the block determines if the expectation passes or fails
22
+ # expect "task completes successfully" do
23
+ # task.completed?
24
+ # end
25
+
26
+ # expect "contains the word 'hello' in the output" do
27
+ # task.parsed_response.include?("hello")
28
+ # end
29
+
30
+ # Add more specific expectations based on your task's behavior
31
+ end
32
+
33
+ eval "properly handles refusals" do
34
+ # task = Raif::Tasks::<%= class_name %>.run(
35
+ # Add your task parameters here to trigger a refusal
36
+ # )
37
+
38
+ # expect "returns exactly the text 'I'm sorry, I can't do that.'" do
39
+ # task.parsed_response == "I'm sorry, I can't do that."
40
+ # end
41
+ end
42
+
43
+ eval "<%= class_name %> uses appropriate LLM tools" do
44
+ # Test that the task uses the expected tools if applicable
45
+ # task = Raif::Tasks::<%= class_name %>.run(
46
+ # Add parameters that trigger the use of the expected tools
47
+ # )
48
+
49
+ # Example tool invocation expectations (if your task uses tools):
50
+ # expect_tool_invocation(task, "tool_name", with: { param: "value" })
51
+ # expect_no_tool_invocation(task, "unwanted_tool")
52
+ end
53
+ end
54
+ <% end -%>
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Raif
  module CLI
    # Common base for the `raif` command-line subcommands.
    #
    # Holds the raw argument list and an (initially empty) options hash, and
    # offers helpers for locating and booting the surrounding Rails application.
    class Base
      attr_reader :args, :options

      # @param args [Array<String>] command-line arguments for the subcommand
      def initialize(args = [])
        @options = {}
        @args = args
      end

      protected

      # Walks upward from the current working directory until a directory
      # containing config/environment.rb is found.
      #
      # Prints an error and exits with status 1 when the filesystem root is
      # reached without finding one.
      #
      # @return [String] path of the directory holding config/environment.rb
      def find_rails_root
        candidate = Dir.pwd

        loop do
          return candidate if File.exist?(File.join(candidate, "config", "environment.rb"))

          parent = File.dirname(candidate)

          # File.dirname returns its argument unchanged at the filesystem root,
          # which means there is nowhere left to look.
          if parent == candidate
            puts "Error: Could not find Rails application root"
            puts "Please run this command from within a Rails application directory"
            exit 1
          end

          candidate = parent
        end
      end

      # Changes into the Rails root and requires config/environment from it.
      def load_rails_application
        root = find_rails_root
        Dir.chdir(root)
        require File.join(root, "config", "environment")
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+ require_relative "base"
5
+
6
module Raif
  module CLI
    # Implements `raif evals [options] [FILE_PATHS]`.
    #
    # Defaults RAILS_ENV to "test", flags the process as running evals via
    # RAIF_RUNNING_EVALS, boots the Rails application and then hands the
    # requested eval files to Raif::Evals::Run.
    class Evals < Base
      # Parses options and file arguments, loads Rails, and executes the run.
      #
      # Each positional argument may carry an optional trailing ":<line>"
      # suffix (e.g. "raif_evals/eval_sets/foo_eval_set.rb:12").
      #
      # @return [Object] whatever Raif::Evals::Run#execute returns
      def run
        # Set test environment by default for evals
        ENV["RAILS_ENV"] ||= "test"
        ENV["RAIF_RUNNING_EVALS"] = "true"

        OptionParser.new do |opts|
          opts.banner = "Usage: raif evals [options] [FILE_PATHS]"

          opts.on("-e", "--environment ENV", "Rails environment (default: test)") do |env|
            ENV["RAILS_ENV"] = env
          end

          opts.on("-h", "--help", "Show this help message") do
            puts opts
            exit
          end
        end.parse!(args)

        # Stays nil when no paths were given so the run receives file_paths: nil.
        file_paths = args.map { |arg| parse_file_path_arg(arg) } if args.any?

        # Find and load Rails application
        load_rails_application

        require "raif/evals"

        Raif::Evals::Run.new(file_paths: file_paths).execute
      end

      private

      # Splits a CLI argument into a file path and an optional line number.
      #
      # Only a trailing ":<digits>" suffix is treated as a line number. Colons
      # elsewhere (for example a Windows drive letter such as "C:\evals\a.rb")
      # or a non-numeric suffix stay part of the path, instead of being split
      # off and coerced to line 0.
      #
      # @param arg [String] raw positional argument
      # @return [Hash] { file_path: String, line_number: Integer or nil }
      def parse_file_path_arg(arg)
        match = arg.match(/\A(.+):(\d+)\z/)
        return { file_path: arg, line_number: nil } unless match

        { file_path: match[1], line_number: match[2].to_i }
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+ require_relative "base"
5
+
6
module Raif
  module CLI
    # Implements `raif evals:setup [options]` by delegating to the
    # "raif:evals:setup" Rails generator.
    class EvalsSetup < Base
      # Parses options (only -h/--help is recognised), boots the Rails
      # application and invokes the generator with the remaining arguments.
      def run
        parser = OptionParser.new
        parser.banner = "Usage: raif evals:setup [options]"
        parser.on("-h", "--help", "Show this help message") do
          puts parser
          exit
        end
        parser.parse!(args)

        # The generator is invoked through Rails, so the app is loaded first.
        load_rails_application

        require "rails/generators"
        Rails::Generators.invoke("raif:evals:setup", args)
      end
    end
  end
end
data/lib/raif/cli.rb ADDED
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "cli/base"
4
+ require_relative "cli/evals"
5
+ require_relative "cli/evals_setup"
6
+
7
module Raif
  module CLI
    # Subcommand names mapped to the one-line summaries shown by `raif help`.
    COMMANDS = {
      "evals" => "Run Raif evaluation sets",
      "evals:setup" => "Setup Raif evals directory structure",
      "version" => "Show Raif version",
      "help" => "Show this help message"
    }.freeze

    # Entry point for the `raif` executable: takes the first argument as the
    # command name and dispatches the remaining arguments to it.
    class Runner
      # @param args [Array<String>] full argument list; the first element is
      #   removed from this array and used as the command name
      def initialize(args)
        @command = args.shift
        @args = args
      end

      # Runs the selected command. With no command the help text is shown; an
      # unrecognised command prints the help text and exits with status 1.
      def run
        case @command
        when "evals" then Evals.new(@args).run
        when "evals:setup" then EvalsSetup.new(@args).run
        when "version", "--version", "-v" then show_version
        when "help", "--help", "-h", nil then show_help
        else report_unknown_command
        end
      end

      private

      # Reports the unrecognised command, prints usage, and exits non-zero.
      def report_unknown_command
        puts "Unknown command: #{@command}"
        puts ""
        show_help
        exit 1
      end

      # Prints the gem version; the version file is only required on demand.
      def show_version
        require_relative "../raif/version"
        puts "Raif #{Raif::VERSION}"
      end

      # Prints usage, the command table, and a few example invocations.
      def show_help
        command_rows = COMMANDS.map { |name, summary| format(" %-12s %s", name, summary) }

        puts [
          "Usage: raif COMMAND [options]",
          "",
          "Commands:",
          *command_rows,
          "",
          "For help on a specific command:",
          " raif COMMAND --help",
          "",
          "Examples:",
          " raif evals:setup # Setup eval directory structure",
          " raif evals # Run all eval sets in test environment",
          " raif evals CustomerSupportEvalSet # Run specific eval set",
          " raif evals -e development # Run evals in development environment",
          " raif version # Show Raif version"
        ]
      end
    end
  end
end
@@ -18,6 +18,8 @@ module Raif
18
18
  :current_user_method,
19
19
  :default_embedding_model_key,
20
20
  :default_llm_model_key,
21
+ :evals_default_llm_judge_model_key,
22
+ :evals_verbose_output,
21
23
  :llm_api_requests_enabled,
22
24
  :llm_request_max_retries,
23
25
  :llm_request_retriable_exceptions,
@@ -30,6 +32,7 @@ module Raif
30
32
  :open_router_app_name,
31
33
  :open_router_site_url,
32
34
  :streaming_update_chunk_size_threshold,
35
+ :task_creator_optional,
33
36
  :task_system_prompt_intro,
34
37
  :user_tool_types
35
38
 
@@ -40,9 +43,8 @@ module Raif
40
43
  alias_method :aws_bedrock_titan_embedding_models_enabled=, :bedrock_embedding_models_enabled=
41
44
 
42
45
  def initialize
43
- # Set default config
44
46
  @agent_types = Set.new(["Raif::Agents::ReActAgent", "Raif::Agents::NativeToolCallingAgent"])
45
- @anthropic_api_key = ENV["ANTHROPIC_API_KEY"]
47
+ @anthropic_api_key = default_disable_llm_api_requests? ? "placeholder-anthropic-api-key" : ENV["ANTHROPIC_API_KEY"]
46
48
  @bedrock_models_enabled = false
47
49
  @anthropic_models_enabled = ENV["ANTHROPIC_API_KEY"].present?
48
50
  @authorize_admin_controller_action = ->{ false }
@@ -57,8 +59,10 @@ module Raif
57
59
  @conversations_controller = "Raif::ConversationsController"
58
60
  @current_user_method = :current_user
59
61
  @default_embedding_model_key = "open_ai_text_embedding_3_small"
60
- @default_llm_model_key = "open_ai_gpt_4o"
61
- @llm_api_requests_enabled = true
62
+ @default_llm_model_key = default_disable_llm_api_requests? ? :raif_test_llm : (ENV["RAIF_DEFAULT_LLM_MODEL_KEY"].presence || "open_ai_gpt_4o")
63
+ @evals_default_llm_judge_model_key = ENV["RAIF_EVALS_DEFAULT_LLM_JUDGE_MODEL_KEY"].presence
64
+ @evals_verbose_output = false
65
+ @llm_api_requests_enabled = !default_disable_llm_api_requests?
62
66
  @llm_request_max_retries = 2
63
67
  @llm_request_retriable_exceptions = [
64
68
  Faraday::ConnectionFailed,
@@ -66,14 +70,16 @@ module Raif
66
70
  Faraday::ServerError,
67
71
  ]
68
72
  @model_superclass = "ApplicationRecord"
69
- @open_ai_api_key = ENV["OPENAI_API_KEY"]
73
+ @open_ai_api_key = default_disable_llm_api_requests? ? "placeholder-open-ai-api-key" : ENV["OPENAI_API_KEY"]
70
74
  @open_ai_embedding_models_enabled = ENV["OPENAI_API_KEY"].present?
71
75
  @open_ai_models_enabled = ENV["OPENAI_API_KEY"].present?
72
- @open_router_api_key = ENV["OPEN_ROUTER_API_KEY"].presence || ENV["OPENROUTER_API_KEY"]
76
+ open_router_api_key = ENV["OPEN_ROUTER_API_KEY"].presence || ENV["OPENROUTER_API_KEY"]
77
+ @open_router_api_key = default_disable_llm_api_requests? ? "placeholder-open-router-api-key" : open_router_api_key
73
78
  @open_router_models_enabled = @open_router_api_key.present?
74
79
  @open_router_app_name = nil
75
80
  @open_router_site_url = nil
76
81
  @streaming_update_chunk_size_threshold = 25
82
+ @task_creator_optional = true
77
83
  @user_tool_types = []
78
84
  end
79
85
 
@@ -137,5 +143,13 @@ module Raif
137
143
  end
138
144
  end
139
145
 
146
+ private
147
+
148
# Decides whether the configuration defaults should keep Raif away from real
# LLM APIs: true only in the Rails test environment while evals are NOT
# running. Evals also run in the test environment by default but need real
# API keys, whereas ordinary tests get placeholders so that an LLM API bill
# is hard to rack up by accident.
def default_disable_llm_api_requests?
  return false unless Rails.env.test?

  !Raif.running_evals?
end
153
+
140
154
  end
141
155
  end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module Raif
  module Evals
    # A single named eval together with the expectation results recorded
    # while it ran.
    class Eval
      attr_reader :description, :expectation_results

      # @param description [String] human-readable name of the eval
      def initialize(description:)
        @expectation_results = []
        @description = description
      end

      # Appends a result to this eval.
      #
      # @param result [#passed?, #to_h] outcome of one expectation
      # @return [Array] the updated list of results
      def add_expectation_result(result)
        expectation_results.push(result)
      end

      # @return [Boolean] true when no recorded result failed to pass
      #   (and therefore also true when nothing has been recorded)
      def passed?
        expectation_results.none? { |result| !result.passed? }
      end

      # @return [Hash] description, overall pass flag, and each result as a hash
      def to_h
        {
          description: description,
          passed: passed?,
          expectation_results: expectation_results.map { |result| result.to_h }
        }
      end
    end
  end
end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "raif/evals/eval_sets/expectations"
4
+ require "raif/evals/eval_sets/llm_judge_expectations"
5
+
6
module Raif
  module Evals
    # Base class for eval sets. Subclasses declare evals with the class-level
    # `eval`, `setup` and `teardown` macros; running an instance executes each
    # declared eval and collects one Eval object per definition.
    class EvalSet
      include Raif::Evals::EvalSets::Expectations
      include Raif::Evals::EvalSets::LlmJudgeExpectations

      attr_reader :current_eval, :output, :results

      # @param output [IO] stream that progress and error lines are written to
      def initialize(output: $stdout)
        @output = output
      end

      class << self
        attr_reader :setup_block, :teardown_block

        # Gives every subclass its own, initially empty list of evals.
        def inherited(subclass)
          subclass.instance_variable_set(:@evals, [])
          super
        end

        # @return [Array<Hash>] eval definitions registered on this class
        def evals
          @evals ||= []
        end

        # Registers an eval, recording the line number of the call site.
        #
        # @param description [String] name of the eval
        def eval(description, &block)
          defined_at = caller_locations(1, 1).first.lineno
          evals << { description: description, block: block, definition_line_number: defined_at }
        end

        # Stores the block evaluated before each eval.
        def setup(&block)
          @setup_block = block
        end

        # Stores the block evaluated after each eval.
        def teardown(&block)
          @teardown_block = block
        end

        # Convenience: instantiate the set and run it.
        def run(output: $stdout)
          new(output: output).run
        end
      end

      # Runs every registered eval in definition order.
      #
      # @return [Array<Raif::Evals::Eval>] one entry per eval definition
      def run
        @results = []

        self.class.evals.each_with_object(@results) do |eval_definition, collected|
          collected << run_eval(eval_definition)
        end
      end

      # Runs one eval definition inside a database transaction that is always
      # rolled back afterwards.
      #
      # An exception raised by the eval block is printed and recorded as an
      # :error expectation result; the teardown block runs either way. An
      # exception raised by the setup block is not rescued here.
      #
      # @param eval_definition [Hash] entry produced by the class-level `eval`
      # @return [Raif::Evals::Eval]
      def run_eval(eval_definition)
        eval_set_class = self.class
        @current_eval = Eval.new(description: eval_definition[:description])

        output.puts "Running: #{eval_definition[:description]}"

        ActiveRecord::Base.transaction do
          before_block = eval_set_class.setup_block
          instance_eval(&before_block) if before_block

          begin
            instance_eval(&eval_definition[:block])
          rescue => e
            output.puts Raif::Utils::Colors.red(" Error in eval block: #{e.message}")
            output.puts Raif::Utils::Colors.red(" #{e.backtrace.join("\n ")}")

            failure = ExpectationResult.new(description: "Eval block execution", status: :error, error: e)
            @current_eval.add_expectation_result(failure)
          ensure
            after_block = eval_set_class.teardown_block
            instance_eval(&after_block) if after_block
          end

          # Discard whatever the eval wrote to the database.
          raise ActiveRecord::Rollback
        end

        @current_eval
      end

      # Reads a fixture file from raif_evals/files under the Rails root.
      #
      # @param filename [String] path relative to raif_evals/files
      # @return [String] the file's contents
      # @raise [ArgumentError] if the name is empty, contains "..", starts
      #   with "/", resolves outside raif_evals/files, or the file is missing
      def file(filename)
        # Validate filename to prevent directory traversal
        raise ArgumentError, "Invalid filename: cannot be empty" if filename.nil? || filename.empty?

        if filename.include?("..") || filename.start_with?("/")
          raise ArgumentError, "Invalid filename: cannot contain '..' or absolute paths"
        end

        # Ensure we're only accessing files within the raif_evals/files directory
        files_dir = Rails.root.join("raif_evals", "files")
        target = files_dir.join(filename)

        # Verify the resolved path is within the expected directory
        raise ArgumentError, "Invalid filename: path traversal detected" unless target.to_s.start_with?(files_dir.to_s)
        raise ArgumentError, "File #{filename} does not exist in raif_evals/files/" unless target.exist?

        target.read
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
module Raif
  module Evals
    module EvalSets
      # Expectation helpers mixed into eval sets. The including object must
      # provide `output` (something responding to #puts) and `current_eval`
      # (something responding to #add_expectation_result).
      module Expectations
        # Evaluates the block and records the outcome on the current eval.
        #
        # A truthy block value is reported as :passed, a falsy one as :failed,
        # and a raised StandardError as :error. The metadata is printed only
        # for pass/fail outcomes, and only when it is present and
        # Raif.config.evals_verbose_output is set.
        #
        # @param description [String] label printed for the expectation
        # @param result_metadata [Object, nil] extra data attached to the result
        # @return [ExpectationResult]
        def expect(description, result_metadata: nil, &block)
          outcome =
            begin
              passed = block.call
              paint = Raif::Utils::Colors.method(passed ? :green : :red)

              output.puts paint.call(passed ? " ✓ #{description}" : " ✗ #{description}")
              if result_metadata && Raif.config.evals_verbose_output
                output.puts paint.call(" ⎿ #{result_metadata.inspect}")
              end

              ExpectationResult.new(
                description: description,
                status: passed ? :passed : :failed,
                metadata: result_metadata
              )
            rescue => e
              output.puts Raif::Utils::Colors.red(" ✗ #{description} (Error: #{e.message})")
              ExpectationResult.new(description: description, status: :error, error: e, metadata: result_metadata)
            end

          current_eval.add_expectation_result(outcome)
          outcome
        end

        # Expects that the invoker has at least one invocation whose tool_type
        # equals the given type and whose arguments include every key/value
        # pair in `with` (keys are compared as strings).
        #
        # @param tool_invoker [#raif_model_tool_invocations]
        # @param tool_type [String] value compared against each invocation's tool_type
        # @param with [Hash] argument values the invocation must carry
        # @return [ExpectationResult]
        def expect_tool_invocation(tool_invoker, tool_type, with: {})
          all_invocations = tool_invoker.raif_model_tool_invocations

          # Map of tool_type => tool_arguments, attached to the result as metadata.
          invoked_tools = all_invocations.each_with_object({}) do |invocation, seen|
            seen[invocation.tool_type] = invocation.tool_arguments
          end

          matching = all_invocations.select do |invocation|
            invocation.tool_type == tool_type &&
              with.all? { |key, value| invocation.tool_arguments[key.to_s] == value }
          end

          argument_suffix = with.any? ? " with #{with.to_json}" : ""

          expect "invokes #{tool_type}#{argument_suffix}", result_metadata: { invoked_tools: invoked_tools } do
            matching.any?
          end
        end

        # Expects that none of the invoker's invocations has the given tool_name.
        #
        # @param tool_invoker [#raif_model_tool_invocations]
        # @param tool_name [String] value compared against each invocation's tool_name
        # @return [ExpectationResult]
        def expect_no_tool_invocation(tool_invoker, tool_name)
          expect "does not invoke #{tool_name}" do
            !tool_invoker.raif_model_tool_invocations.any? { |invocation| invocation.tool_name == tool_name }
          end
        end
      end
    end
  end
end
+ end