raif 1.2.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -5
- data/app/assets/builds/raif.css +4 -1
- data/app/assets/builds/raif_admin.css +13 -1
- data/app/assets/javascript/raif/controllers/conversations_controller.js +1 -1
- data/app/assets/stylesheets/raif/admin/conversation.scss +16 -0
- data/app/assets/stylesheets/raif/conversations.scss +3 -0
- data/app/assets/stylesheets/raif.scss +2 -1
- data/app/controllers/raif/admin/application_controller.rb +16 -0
- data/app/controllers/raif/admin/configs_controller.rb +94 -0
- data/app/controllers/raif/admin/model_completions_controller.rb +18 -1
- data/app/controllers/raif/admin/model_tool_invocations_controller.rb +7 -1
- data/app/controllers/raif/admin/stats/model_tool_invocations_controller.rb +21 -0
- data/app/controllers/raif/admin/stats/tasks_controller.rb +15 -6
- data/app/controllers/raif/admin/stats_controller.rb +32 -3
- data/app/controllers/raif/conversation_entries_controller.rb +1 -0
- data/app/controllers/raif/conversations_controller.rb +10 -2
- data/app/jobs/raif/conversation_entry_job.rb +8 -6
- data/app/models/raif/admin/task_stat.rb +7 -0
- data/app/models/raif/agent.rb +63 -2
- data/app/models/raif/agents/native_tool_calling_agent.rb +101 -56
- data/app/models/raif/application_record.rb +18 -0
- data/app/models/raif/concerns/agent_inference_stats.rb +35 -0
- data/app/models/raif/concerns/has_llm.rb +1 -1
- data/app/models/raif/concerns/json_schema_definition.rb +40 -5
- data/app/models/raif/concerns/llms/anthropic/message_formatting.rb +28 -0
- data/app/models/raif/concerns/llms/anthropic/response_tool_calls.rb +24 -0
- data/app/models/raif/concerns/llms/anthropic/tool_formatting.rb +4 -0
- data/app/models/raif/concerns/llms/bedrock/message_formatting.rb +36 -0
- data/app/models/raif/concerns/llms/bedrock/response_tool_calls.rb +26 -0
- data/app/models/raif/concerns/llms/bedrock/tool_formatting.rb +4 -0
- data/app/models/raif/concerns/llms/google/message_formatting.rb +109 -0
- data/app/models/raif/concerns/llms/google/response_tool_calls.rb +32 -0
- data/app/models/raif/concerns/llms/google/tool_formatting.rb +72 -0
- data/app/models/raif/concerns/llms/message_formatting.rb +11 -5
- data/app/models/raif/concerns/llms/open_ai/json_schema_validation.rb +3 -3
- data/app/models/raif/concerns/llms/open_ai_completions/message_formatting.rb +22 -0
- data/app/models/raif/concerns/llms/open_ai_completions/response_tool_calls.rb +22 -0
- data/app/models/raif/concerns/llms/open_ai_completions/tool_formatting.rb +4 -0
- data/app/models/raif/concerns/llms/open_ai_responses/message_formatting.rb +17 -0
- data/app/models/raif/concerns/llms/open_ai_responses/response_tool_calls.rb +26 -0
- data/app/models/raif/concerns/llms/open_ai_responses/tool_formatting.rb +4 -0
- data/app/models/raif/concerns/run_with.rb +127 -0
- data/app/models/raif/conversation.rb +96 -9
- data/app/models/raif/conversation_entry.rb +37 -8
- data/app/models/raif/embedding_model.rb +2 -1
- data/app/models/raif/embedding_models/open_ai.rb +1 -1
- data/app/models/raif/llm.rb +28 -3
- data/app/models/raif/llms/anthropic.rb +7 -19
- data/app/models/raif/llms/bedrock.rb +6 -20
- data/app/models/raif/llms/google.rb +140 -0
- data/app/models/raif/llms/open_ai_base.rb +19 -5
- data/app/models/raif/llms/open_ai_completions.rb +6 -11
- data/app/models/raif/llms/open_ai_responses.rb +6 -16
- data/app/models/raif/llms/open_router.rb +10 -14
- data/app/models/raif/model_completion.rb +61 -0
- data/app/models/raif/model_tool.rb +10 -2
- data/app/models/raif/model_tool_invocation.rb +38 -6
- data/app/models/raif/model_tools/agent_final_answer.rb +2 -7
- data/app/models/raif/model_tools/provider_managed/code_execution.rb +4 -0
- data/app/models/raif/model_tools/provider_managed/image_generation.rb +4 -0
- data/app/models/raif/model_tools/provider_managed/web_search.rb +4 -0
- data/app/models/raif/streaming_responses/google.rb +71 -0
- data/app/models/raif/task.rb +74 -18
- data/app/models/raif/user_tool_invocation.rb +19 -0
- data/app/views/layouts/raif/admin.html.erb +12 -1
- data/app/views/raif/admin/agents/_agent.html.erb +8 -0
- data/app/views/raif/admin/agents/_conversation_message.html.erb +28 -6
- data/app/views/raif/admin/agents/index.html.erb +2 -0
- data/app/views/raif/admin/agents/show.html.erb +46 -1
- data/app/views/raif/admin/configs/show.html.erb +117 -0
- data/app/views/raif/admin/conversations/_conversation_entry.html.erb +29 -34
- data/app/views/raif/admin/conversations/show.html.erb +2 -0
- data/app/views/raif/admin/model_completions/_model_completion.html.erb +9 -0
- data/app/views/raif/admin/model_completions/index.html.erb +26 -0
- data/app/views/raif/admin/model_completions/show.html.erb +124 -61
- data/app/views/raif/admin/model_tool_invocations/index.html.erb +22 -1
- data/app/views/raif/admin/model_tools/_list.html.erb +16 -0
- data/app/views/raif/admin/model_tools/_model_tool.html.erb +36 -0
- data/app/views/raif/admin/stats/_stats_tile.html.erb +34 -0
- data/app/views/raif/admin/stats/index.html.erb +71 -88
- data/app/views/raif/admin/stats/model_tool_invocations/index.html.erb +43 -0
- data/app/views/raif/admin/stats/tasks/index.html.erb +20 -6
- data/app/views/raif/admin/tasks/index.html.erb +6 -1
- data/app/views/raif/admin/tasks/show.html.erb +36 -3
- data/app/views/raif/conversation_entries/_form.html.erb +4 -1
- data/app/views/raif/conversations/_conversation.html.erb +10 -0
- data/app/views/raif/conversations/_entry_processed.turbo_stream.erb +12 -0
- data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
- data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
- data/app/views/raif/conversations/index.html.erb +23 -0
- data/config/locales/admin.en.yml +33 -1
- data/config/locales/en.yml +41 -4
- data/config/routes.rb +2 -0
- data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
- data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
- data/db/migrate/20250904194456_add_generating_entry_response_to_raif_conversations.rb +7 -0
- data/db/migrate/20250911125234_add_source_to_raif_tasks.rb +7 -0
- data/db/migrate/20251020005853_add_source_to_raif_agents.rb +7 -0
- data/db/migrate/20251020011346_rename_task_run_args_to_run_with.rb +7 -0
- data/db/migrate/20251020011405_add_run_with_to_raif_agents.rb +13 -0
- data/db/migrate/20251024160119_add_llm_messages_max_length_to_raif_conversations.rb +14 -0
- data/db/migrate/20251124185033_add_provider_tool_call_id_to_raif_model_tool_invocations.rb +7 -0
- data/db/migrate/20251128202941_add_tool_choice_to_raif_model_completions.rb +7 -0
- data/db/migrate/20260118144846_add_source_to_raif_conversations.rb +7 -0
- data/db/migrate/20260119000000_add_failure_tracking_to_raif_model_completions.rb +10 -0
- data/db/migrate/20260119000001_add_completed_at_to_raif_model_completions.rb +8 -0
- data/db/migrate/20260119000002_add_started_at_to_raif_model_completions.rb +8 -0
- data/exe/raif +7 -0
- data/lib/generators/raif/agent/agent_generator.rb +22 -7
- data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
- data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
- data/lib/generators/raif/agent/templates/application_agent.rb.tt +1 -3
- data/lib/generators/raif/base_generator.rb +19 -0
- data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
- data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
- data/lib/generators/raif/conversation/templates/conversation.rb.tt +34 -32
- data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
- data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
- data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
- data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
- data/lib/generators/raif/install/install_generator.rb +15 -0
- data/lib/generators/raif/install/templates/initializer.rb +89 -10
- data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -5
- data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -78
- data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +1 -1
- data/lib/generators/raif/task/task_generator.rb +22 -3
- data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
- data/lib/generators/raif/task/templates/task.rb.tt +55 -59
- data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
- data/lib/raif/cli/base.rb +39 -0
- data/lib/raif/cli/evals.rb +47 -0
- data/lib/raif/cli/evals_setup.rb +27 -0
- data/lib/raif/cli.rb +67 -0
- data/lib/raif/configuration.rb +57 -8
- data/lib/raif/engine.rb +8 -0
- data/lib/raif/errors/instance_dependent_schema_error.rb +8 -0
- data/lib/raif/errors/streaming_error.rb +6 -3
- data/lib/raif/errors.rb +1 -0
- data/lib/raif/evals/eval.rb +30 -0
- data/lib/raif/evals/eval_set.rb +111 -0
- data/lib/raif/evals/eval_sets/expectations.rb +53 -0
- data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
- data/lib/raif/evals/expectation_result.rb +39 -0
- data/lib/raif/evals/llm_judge.rb +32 -0
- data/lib/raif/evals/llm_judges/binary.rb +94 -0
- data/lib/raif/evals/llm_judges/comparative.rb +89 -0
- data/lib/raif/evals/llm_judges/scored.rb +63 -0
- data/lib/raif/evals/llm_judges/summarization.rb +166 -0
- data/lib/raif/evals/run.rb +202 -0
- data/lib/raif/evals/scoring_rubric.rb +174 -0
- data/lib/raif/evals.rb +26 -0
- data/lib/raif/json_schema_builder.rb +14 -0
- data/lib/raif/llm_registry.rb +218 -15
- data/lib/raif/messages.rb +180 -0
- data/lib/raif/migration_checker.rb +3 -3
- data/lib/raif/utils/colors.rb +23 -0
- data/lib/raif/utils.rb +1 -0
- data/lib/raif/version.rb +1 -1
- data/lib/raif.rb +13 -0
- data/lib/tasks/annotate_rb.rake +10 -0
- data/spec/support/current_temperature_test_tool.rb +34 -0
- data/spec/support/rspec_helpers.rb +8 -8
- data/spec/support/test_conversation.rb +1 -1
- metadata +77 -10
- data/app/models/raif/agents/re_act_agent.rb +0 -127
- data/app/models/raif/agents/re_act_step.rb +0 -33
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
<% raif_module_namespacing(["Evals", "Tasks"]) do -%>
|
|
2
|
+
class <%= class_name.demodulize %>EvalSet < Raif::Evals::EvalSet
|
|
3
|
+
# Run this eval set with:
|
|
4
|
+
# bundle exec raif evals ./<%= eval_set_file_path %>
|
|
5
|
+
|
|
6
|
+
# Setup method runs before each eval
|
|
7
|
+
setup do
|
|
8
|
+
# Common setup code
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
# Teardown runs after each eval
|
|
12
|
+
teardown do
|
|
13
|
+
# Cleanup code
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
eval "<%= class_name %> produces expected output" do
|
|
17
|
+
# task = Raif::Tasks::<%= class_name %>.run(
|
|
18
|
+
# Add your task parameters here that produce the expected output
|
|
19
|
+
# )
|
|
20
|
+
|
|
21
|
+
# The return value of the block determines if the expectation passes or fails
|
|
22
|
+
# expect "task completes successfully" do
|
|
23
|
+
# task.completed?
|
|
24
|
+
# end
|
|
25
|
+
|
|
26
|
+
# expect "contains the word 'hello' in the output" do
|
|
27
|
+
# task.parsed_response.include?("hello")
|
|
28
|
+
# end
|
|
29
|
+
|
|
30
|
+
# Add more specific expectations based on your task's behavior
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
eval "properly handles refusals" do
|
|
34
|
+
# task = Raif::Tasks::<%= class_name %>.run(
|
|
35
|
+
# Add your task parameters here to trigger a refusal
|
|
36
|
+
# )
|
|
37
|
+
|
|
38
|
+
# expect "returns exactly the text 'I'm sorry, I can't do that.'" do
|
|
39
|
+
# task.parsed_response == "I'm sorry, I can't do that."
|
|
40
|
+
# end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
eval "<%= class_name %> uses appropriate LLM tools" do
|
|
44
|
+
# Test that the task uses the expected tools if applicable
|
|
45
|
+
# task = Raif::Tasks::<%= class_name %>.run(
|
|
46
|
+
# Add parameters that trigger the use of the expected tools
|
|
47
|
+
# )
|
|
48
|
+
|
|
49
|
+
# Example tool invocation expectations (if your task uses tools):
|
|
50
|
+
# expect_tool_invocation(task, "tool_name", with: { param: "value" })
|
|
51
|
+
# expect_no_tool_invocation(task, "unwanted_tool")
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
<% end -%>
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Raif
|
|
4
|
+
module CLI
|
|
5
|
+
# Common behaviour for `raif` command-line command classes: holds the raw
# argument list and an options hash, and knows how to locate and boot the
# host Rails application.
class Base
  attr_reader :args, :options

  # @param args [Array<String>] command-line arguments for the command
  def initialize(args = [])
    @options = {}
    @args = args
  end

  protected

  # Walks upward from the current working directory and returns the first
  # directory that contains config/environment.rb.
  #
  # When no such directory is found, prints an error message and exits the
  # process with status 1.
  #
  # @return [String] path of the directory containing config/environment.rb
  def find_rails_root
    candidate = Dir.pwd

    loop do
      return candidate if File.exist?(File.join(candidate, "config", "environment.rb"))

      parent_dir = File.dirname(candidate)

      # No parent left to try: the walk has reached the top of the tree.
      if parent_dir == candidate
        puts "Error: Could not find Rails application root"
        puts "Please run this command from within a Rails application directory"
        exit 1
      end

      candidate = parent_dir
    end
  end

  # Changes into the Rails root and requires its config/environment file.
  def load_rails_application
    app_root = find_rails_root
    Dir.chdir(app_root)
    require File.join(app_root, "config", "environment")
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "optparse"
|
|
4
|
+
require_relative "base"
|
|
5
|
+
|
|
6
|
+
module Raif
|
|
7
|
+
module CLI
|
|
8
|
+
# `raif evals` command: parses command-line options, boots the host Rails
# application and runs the requested eval sets.
class Evals < Base
  # A path followed by a trailing numeric line reference, e.g. "evals/foo.rb:42".
  FILE_WITH_LINE_PATTERN = /\A(?<path>.+):(?<line>\d+)\z/

  # Parses options out of `args`, treats the remaining arguments as eval
  # file paths (optionally suffixed with ":LINE"), loads the Rails
  # application and executes a Raif::Evals::Run.
  def run
    # Set test environment by default for evals. Both variables are set
    # before the Rails application is loaded below.
    ENV["RAILS_ENV"] ||= "test"
    ENV["RAIF_RUNNING_EVALS"] = "true"

    OptionParser.new do |opts|
      opts.banner = "Usage: raif evals [options] [FILE_PATHS]"

      opts.on("-e", "--environment ENV", "Rails environment (default: test)") do |env|
        ENV["RAILS_ENV"] = env
      end

      opts.on("-h", "--help", "Show this help message") do
        puts opts
        exit
      end
    end.parse!(args)

    # Parse file paths with optional line numbers. Stays nil when no paths
    # were given, and that nil is what gets handed to the run.
    file_paths = args.map { |arg| parse_file_path_arg(arg) } if args.any?

    # Find and load Rails application
    load_rails_application

    require "raif/evals"

    Raif::Evals::Run.new(file_paths: file_paths).execute
  end

  private

  # Splits "path:LINE" into the path and an integer line number.
  #
  # Only a trailing ":<digits>" suffix counts as a line reference, so colons
  # elsewhere in the path (for example a drive letter such as "C:/app/foo.rb")
  # stay part of the path instead of truncating it.
  #
  # @param arg [String] a command-line file argument
  # @return [Hash] { file_path: String, line_number: Integer or nil }
  def parse_file_path_arg(arg)
    match = FILE_WITH_LINE_PATTERN.match(arg)
    return { file_path: arg, line_number: nil } unless match

    { file_path: match[:path], line_number: match[:line].to_i }
  end
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "optparse"
|
|
4
|
+
require_relative "base"
|
|
5
|
+
|
|
6
|
+
module Raif
|
|
7
|
+
module CLI
|
|
8
|
+
# `raif evals:setup` command: boots the host Rails application and invokes
# the "raif:evals:setup" Rails generator.
class EvalsSetup < Base
  def run
    parser = OptionParser.new do |opts|
      opts.banner = "Usage: raif evals:setup [options]"

      opts.on("-h", "--help", "Show this help message") do
        puts opts
        exit
      end
    end
    parser.parse!(args)

    # The Rails application has to be loaded before its generators can be used.
    load_rails_application

    require "rails/generators"
    Rails::Generators.invoke("raif:evals:setup", args)
  end
end
|
|
26
|
+
end
|
|
27
|
+
end
|
data/lib/raif/cli.rb
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "cli/base"
|
|
4
|
+
require_relative "cli/evals"
|
|
5
|
+
require_relative "cli/evals_setup"
|
|
6
|
+
|
|
7
|
+
module Raif
  module CLI
    # Command name => one-line description, printed by the help output.
    COMMANDS = {
      "evals" => "Run Raif evaluation sets",
      "evals:setup" => "Setup Raif evals directory structure",
      "version" => "Show Raif version",
      "help" => "Show this help message"
    }.freeze

    # Dispatches the first command-line argument to the matching command.
    class Runner
      # @param args [Array<String>] raw argument list. Its first element is
      #   removed from the array (the array is mutated) and used as the
      #   command name; the remainder is passed on to the command.
      def initialize(args)
        @command = args.shift
        @args = args
      end

      # Runs the selected command. A missing command shows the help text; an
      # unrecognised one reports it, shows the help text and exits with status 1.
      def run
        case @command
        when nil, "help", "--help", "-h"
          show_help
        when "version", "--version", "-v"
          show_version
        when "evals:setup"
          EvalsSetup.new(@args).run
        when "evals"
          Evals.new(@args).run
        else
          puts "Unknown command: #{@command}"
          puts ""
          show_help
          exit 1
        end
      end

      private

      # Prints the gem version; the version file is only required when asked for.
      def show_version
        require_relative "../raif/version"
        puts "Raif #{Raif::VERSION}"
      end

      # Prints usage, the command table and a few example invocations.
      def show_help
        puts "Usage: raif COMMAND [options]", ""
        puts "Commands:"
        COMMANDS.each_pair do |name, summary|
          puts format(" %-12s %s", name, summary)
        end
        puts "", "For help on a specific command:"
        puts " raif COMMAND --help"
        puts "", "Examples:"
        puts " raif evals:setup # Setup eval directory structure"
        puts " raif evals # Run all eval sets in test environment"
        puts " raif evals CustomerSupportEvalSet # Run specific eval set"
        puts " raif evals -e development # Run evals in development environment"
        puts " raif version # Show Raif version"
      end
    end
  end
end
|
data/lib/raif/configuration.rb
CHANGED
|
@@ -12,24 +12,37 @@ module Raif
|
|
|
12
12
|
:aws_bedrock_region,
|
|
13
13
|
:bedrock_embedding_models_enabled,
|
|
14
14
|
:conversation_entries_controller,
|
|
15
|
+
:conversation_llm_messages_max_length_default,
|
|
15
16
|
:conversation_system_prompt_intro,
|
|
16
17
|
:conversation_types,
|
|
17
18
|
:conversations_controller,
|
|
18
19
|
:current_user_method,
|
|
19
20
|
:default_embedding_model_key,
|
|
20
21
|
:default_llm_model_key,
|
|
22
|
+
:evals_default_llm_judge_model_key,
|
|
23
|
+
:evals_verbose_output,
|
|
24
|
+
:google_api_key,
|
|
25
|
+
:google_models_enabled,
|
|
21
26
|
:llm_api_requests_enabled,
|
|
22
27
|
:llm_request_max_retries,
|
|
23
28
|
:llm_request_retriable_exceptions,
|
|
24
29
|
:model_superclass,
|
|
25
30
|
:open_ai_api_key,
|
|
31
|
+
:open_ai_api_version,
|
|
32
|
+
:open_ai_auth_header_style,
|
|
33
|
+
:open_ai_base_url,
|
|
34
|
+
:open_ai_embedding_base_url,
|
|
26
35
|
:open_ai_embedding_models_enabled,
|
|
27
36
|
:open_ai_models_enabled,
|
|
28
37
|
:open_router_api_key,
|
|
29
38
|
:open_router_models_enabled,
|
|
30
39
|
:open_router_app_name,
|
|
31
40
|
:open_router_site_url,
|
|
41
|
+
:request_open_timeout,
|
|
42
|
+
:request_read_timeout,
|
|
43
|
+
:request_write_timeout,
|
|
32
44
|
:streaming_update_chunk_size_threshold,
|
|
45
|
+
:task_creator_optional,
|
|
33
46
|
:task_system_prompt_intro,
|
|
34
47
|
:user_tool_types
|
|
35
48
|
|
|
@@ -40,9 +53,8 @@ module Raif
|
|
|
40
53
|
alias_method :aws_bedrock_titan_embedding_models_enabled=, :bedrock_embedding_models_enabled=
|
|
41
54
|
|
|
42
55
|
def initialize
|
|
43
|
-
|
|
44
|
-
@
|
|
45
|
-
@anthropic_api_key = ENV["ANTHROPIC_API_KEY"]
|
|
56
|
+
@agent_types = Set.new(["Raif::Agents::NativeToolCallingAgent"])
|
|
57
|
+
@anthropic_api_key = default_disable_llm_api_requests? ? "placeholder-anthropic-api-key" : ENV["ANTHROPIC_API_KEY"]
|
|
46
58
|
@bedrock_models_enabled = false
|
|
47
59
|
@anthropic_models_enabled = ENV["ANTHROPIC_API_KEY"].present?
|
|
48
60
|
@authorize_admin_controller_action = ->{ false }
|
|
@@ -52,28 +64,45 @@ module Raif
|
|
|
52
64
|
@bedrock_embedding_models_enabled = false
|
|
53
65
|
@task_system_prompt_intro = "You are a helpful assistant."
|
|
54
66
|
@conversation_entries_controller = "Raif::ConversationEntriesController"
|
|
67
|
+
@conversation_llm_messages_max_length_default = 50
|
|
55
68
|
@conversation_system_prompt_intro = "You are a helpful assistant who is collaborating with a teammate."
|
|
56
69
|
@conversation_types = Set.new(["Raif::Conversation"])
|
|
57
70
|
@conversations_controller = "Raif::ConversationsController"
|
|
58
71
|
@current_user_method = :current_user
|
|
59
72
|
@default_embedding_model_key = "open_ai_text_embedding_3_small"
|
|
60
|
-
@default_llm_model_key = "open_ai_gpt_4o"
|
|
61
|
-
@
|
|
73
|
+
@default_llm_model_key = default_disable_llm_api_requests? ? :raif_test_llm : (ENV["RAIF_DEFAULT_LLM_MODEL_KEY"].presence || "open_ai_gpt_4o")
|
|
74
|
+
@evals_default_llm_judge_model_key = ENV["RAIF_EVALS_DEFAULT_LLM_JUDGE_MODEL_KEY"].presence
|
|
75
|
+
@evals_verbose_output = false
|
|
76
|
+
google_api_key = ENV["GOOGLE_AI_API_KEY"].presence || ENV["GOOGLE_API_KEY"]
|
|
77
|
+
@google_api_key = default_disable_llm_api_requests? ? "placeholder-google-api-key" : google_api_key
|
|
78
|
+
@google_models_enabled = @google_api_key.present?
|
|
79
|
+
@llm_api_requests_enabled = !default_disable_llm_api_requests?
|
|
62
80
|
@llm_request_max_retries = 2
|
|
63
81
|
@llm_request_retriable_exceptions = [
|
|
64
82
|
Faraday::ConnectionFailed,
|
|
65
83
|
Faraday::TimeoutError,
|
|
66
84
|
Faraday::ServerError,
|
|
85
|
+
Net::ReadTimeout,
|
|
86
|
+
Net::OpenTimeout,
|
|
67
87
|
]
|
|
68
88
|
@model_superclass = "ApplicationRecord"
|
|
69
|
-
@open_ai_api_key = ENV["OPENAI_API_KEY"]
|
|
89
|
+
@open_ai_api_key = default_disable_llm_api_requests? ? "placeholder-open-ai-api-key" : ENV["OPENAI_API_KEY"]
|
|
90
|
+
@open_ai_api_version = nil
|
|
91
|
+
@open_ai_auth_header_style = :bearer
|
|
92
|
+
@open_ai_base_url = "https://api.openai.com/v1"
|
|
93
|
+
@open_ai_embedding_base_url = "https://api.openai.com/v1"
|
|
70
94
|
@open_ai_embedding_models_enabled = ENV["OPENAI_API_KEY"].present?
|
|
71
95
|
@open_ai_models_enabled = ENV["OPENAI_API_KEY"].present?
|
|
72
|
-
|
|
96
|
+
open_router_api_key = ENV["OPEN_ROUTER_API_KEY"].presence || ENV["OPENROUTER_API_KEY"]
|
|
97
|
+
@open_router_api_key = default_disable_llm_api_requests? ? "placeholder-open-router-api-key" : open_router_api_key
|
|
73
98
|
@open_router_models_enabled = @open_router_api_key.present?
|
|
74
99
|
@open_router_app_name = nil
|
|
75
100
|
@open_router_site_url = nil
|
|
101
|
+
@request_open_timeout = nil
|
|
102
|
+
@request_read_timeout = nil
|
|
103
|
+
@request_write_timeout = nil
|
|
76
104
|
@streaming_update_chunk_size_threshold = 25
|
|
105
|
+
@task_creator_optional = true
|
|
77
106
|
@user_tool_types = []
|
|
78
107
|
end
|
|
79
108
|
|
|
@@ -97,7 +126,9 @@ module Raif
|
|
|
97
126
|
"Raif.config.default_llm_model_key was set to #{default_llm_model_key}, but must be one of: #{Raif.available_llm_keys.join(", ")}"
|
|
98
127
|
end
|
|
99
128
|
|
|
100
|
-
if
|
|
129
|
+
if default_embedding_model_key.present? &&
|
|
130
|
+
Raif.embedding_model_registry.present? &&
|
|
131
|
+
!Raif.available_embedding_model_keys.include?(default_embedding_model_key.to_sym)
|
|
101
132
|
raise Raif::Errors::InvalidConfigError,
|
|
102
133
|
"Raif.config.default_embedding_model_key was set to #{default_embedding_model_key}, but must be one of: #{Raif.available_embedding_model_keys.join(", ")}" # rubocop:disable Layout/LineLength
|
|
103
134
|
end
|
|
@@ -121,6 +152,11 @@ module Raif
|
|
|
121
152
|
"Raif.config.open_ai_api_key is required when Raif.config.open_ai_models_enabled is true. Set it via Raif.config.open_ai_api_key or ENV[\"OPENAI_API_KEY\"]" # rubocop:disable Layout/LineLength
|
|
122
153
|
end
|
|
123
154
|
|
|
155
|
+
if open_ai_models_enabled && ![:bearer, :api_key].include?(open_ai_auth_header_style)
|
|
156
|
+
raise Raif::Errors::InvalidConfigError,
|
|
157
|
+
"Raif.config.open_ai_auth_header_style must be either :bearer or :api_key"
|
|
158
|
+
end
|
|
159
|
+
|
|
124
160
|
if open_ai_embedding_models_enabled && open_ai_api_key.blank?
|
|
125
161
|
raise Raif::Errors::InvalidConfigError,
|
|
126
162
|
"Raif.config.open_ai_api_key is required when Raif.config.open_ai_embedding_models_enabled is true. Set it via Raif.config.open_ai_api_key or ENV[\"OPENAI_API_KEY\"]" # rubocop:disable Layout/LineLength
|
|
@@ -135,6 +171,19 @@ module Raif
|
|
|
135
171
|
raise Raif::Errors::InvalidConfigError,
|
|
136
172
|
"Raif.config.open_router_api_key is required when Raif.config.open_router_models_enabled is true. Set it via Raif.config.open_router_api_key or ENV['OPEN_ROUTER_API_KEY']" # rubocop:disable Layout/LineLength
|
|
137
173
|
end
|
|
174
|
+
|
|
175
|
+
if google_models_enabled && google_api_key.blank?
|
|
176
|
+
raise Raif::Errors::InvalidConfigError,
|
|
177
|
+
"Raif.config.google_api_key is required when Raif.config.google_models_enabled is true. Set it via Raif.config.google_api_key or ENV['GOOGLE_API_KEY']" # rubocop:disable Layout/LineLength
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
private
|
|
182
|
+
|
|
183
|
+
# By default, evals run in the test environment, but need real API keys.
|
|
184
|
+
# In normal tests, we insert placeholders to make it hard to accidentally rack up an LLM API bill.
|
|
185
|
+
def default_disable_llm_api_requests?
|
|
186
|
+
Rails.env.test? && !Raif.running_evals?
|
|
138
187
|
end
|
|
139
188
|
|
|
140
189
|
end
|
data/lib/raif/engine.rb
CHANGED
|
@@ -72,6 +72,14 @@ module Raif
|
|
|
72
72
|
end
|
|
73
73
|
end
|
|
74
74
|
|
|
75
|
+
config.after_initialize do
|
|
76
|
+
next unless Raif.config.google_models_enabled
|
|
77
|
+
|
|
78
|
+
Raif.default_llms[Raif::Llms::Google].each do |llm_config|
|
|
79
|
+
Raif.register_llm(Raif::Llms::Google, **llm_config)
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
75
83
|
config.after_initialize do
|
|
76
84
|
next unless Raif.config.bedrock_embedding_models_enabled
|
|
77
85
|
|
|
@@ -3,16 +3,19 @@
|
|
|
3
3
|
module Raif
|
|
4
4
|
module Errors
|
|
5
5
|
class StreamingError < StandardError
|
|
6
|
-
attr_reader :
|
|
6
|
+
attr_reader :type, :code, :event
|
|
7
7
|
|
|
8
8
|
def initialize(message:, type:, event:, code: nil)
|
|
9
|
-
super
|
|
9
|
+
super(message)
|
|
10
10
|
|
|
11
|
-
@message = message
|
|
12
11
|
@type = type
|
|
13
12
|
@code = code
|
|
14
13
|
@event = event
|
|
15
14
|
end
|
|
15
|
+
|
|
16
|
+
def to_s
|
|
17
|
+
"[#{type}] #{super} (code=#{code}, event=#{event})"
|
|
18
|
+
end
|
|
16
19
|
end
|
|
17
20
|
end
|
|
18
21
|
end
|
data/lib/raif/errors.rb
CHANGED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Raif
|
|
4
|
+
module Evals
|
|
5
|
+
# Result holder for a single eval: its description plus the expectation
# results recorded while it ran.
class Eval
  attr_reader :description, :expectation_results

  # @param description [String] human-readable name of the eval
  def initialize(description:)
    @expectation_results = []
    @description = description
  end

  # Appends one expectation result to this eval.
  def add_expectation_result(result)
    expectation_results.push(result)
  end

  # True when no recorded expectation result reports a non-passing state.
  # An eval with no recorded results therefore counts as passed.
  def passed?
    expectation_results.none? { |result| !result.passed? }
  end

  # @return [Hash] description, overall pass flag and each result as a hash
  def to_h
    serialized_results = expectation_results.map { |result| result.to_h }

    {
      description: description,
      passed: passed?,
      expectation_results: serialized_results
    }
  end
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "raif/evals/eval_sets/expectations"
|
|
4
|
+
require "raif/evals/eval_sets/llm_judge_expectations"
|
|
5
|
+
|
|
6
|
+
module Raif
|
|
7
|
+
module Evals
|
|
8
|
+
# Base class for eval sets. Subclasses declare evals with the class-level
# `eval`, `setup` and `teardown` methods; `run` executes every declared eval
# and collects one Eval result per definition.
class EvalSet
  include Raif::Evals::EvalSets::Expectations
  include Raif::Evals::EvalSets::LlmJudgeExpectations

  attr_reader :current_eval, :output, :results

  # @param output [IO] stream that progress and error messages are written to
  def initialize(output: $stdout)
    @output = output
  end

  class << self
    attr_reader :setup_block
    attr_reader :teardown_block

    # Gives each subclass its own, initially empty, list of eval definitions.
    def inherited(subclass)
      subclass.instance_variable_set(:@evals, [])
      super
    end

    # @return [Array<Hash>] eval definitions registered on this class
    def evals
      @evals ||= []
    end

    # Registers an eval. The line number of the calling code is stored with
    # the definition as :definition_line_number.
    def eval(description, &block)
      evals << { description: description, block: block, definition_line_number: caller_locations(1, 1).first.lineno }
    end

    # Stores a block that is run (via instance_eval) before each eval block.
    def setup(&block)
      @setup_block = block
    end

    # Stores a block that is run (via instance_eval) after each eval block.
    def teardown(&block)
      @teardown_block = block
    end

    # Convenience wrapper: builds an instance and runs all of its evals.
    def run(output: $stdout)
      new(output: output).run
    end
  end

  # Runs every eval registered on the class, in definition order.
  #
  # @return [Array<Eval>] one result object per eval definition
  def run
    @results = []

    self.class.evals.each do |eval_definition|
      @results << run_eval(eval_definition)
    end

    @results
  end

  # Runs a single eval definition and returns its Eval result.
  #
  # Setup, the eval block and teardown all execute inside a database
  # transaction that is rolled back at the end.
  def run_eval(eval_definition)
    @current_eval = Eval.new(description: eval_definition[:description])

    output.puts "Running: #{eval_definition[:description]}"

    ActiveRecord::Base.transaction do
      instance_eval(&self.class.setup_block) if self.class.setup_block

      begin
        instance_eval(&eval_definition[:block])
      rescue => e
        # An error raised by the eval block is printed and recorded as an
        # :error expectation result instead of propagating.
        output.puts Raif::Utils::Colors.red(" Error in eval block: #{e.message}")
        output.puts Raif::Utils::Colors.red(" #{e.backtrace.join("\n ")}")
        @current_eval.add_expectation_result(
          ExpectationResult.new(
            description: "Eval block execution",
            status: :error,
            error: e
          )
        )
      ensure
        # Teardown runs whether or not the eval block raised.
        instance_eval(&self.class.teardown_block) if self.class.teardown_block
      end

      raise ActiveRecord::Rollback
    end

    @current_eval
  end

  # Reads a fixture file from raif_evals/files under the Rails root.
  #
  # @param filename [String] path relative to raif_evals/files
  # @return [String] the file contents
  # @raise [ArgumentError] if the name is empty, contains "..", is absolute,
  #   resolves outside raif_evals/files, or the file does not exist
  def file(filename)
    # Validate filename to prevent directory traversal
    raise ArgumentError, "Invalid filename: cannot be empty" if filename.nil? || filename.empty?
    raise ArgumentError, "Invalid filename: cannot contain '..' or absolute paths" if filename.include?("..") || filename.start_with?("/")

    # Ensure we're only accessing files within the raif_evals/files directory
    base_path = Rails.root.join("raif_evals", "files")
    full_path = base_path.join(filename).cleanpath

    # Verify the normalised path is within the expected directory. The
    # separator is part of the prefix so that a sibling directory whose name
    # merely starts with "files" is not accepted.
    inside_base = full_path == base_path || full_path.to_s.start_with?("#{base_path}#{File::SEPARATOR}")
    raise ArgumentError, "Invalid filename: path traversal detected" unless inside_base

    raise ArgumentError, "File #{filename} does not exist in raif_evals/files/" unless full_path.exist?

    full_path.read
  end
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Raif
|
|
4
|
+
module Evals
|
|
5
|
+
module EvalSets
|
|
6
|
+
# Expectation helpers mixed into eval sets. Each helper prints a coloured
# line to `output`, records an ExpectationResult on `current_eval`, and
# returns that result.
module Expectations
  # Evaluates the block and records whether it returned a truthy value.
  #
  # @param description [String] label printed and stored with the result
  # @param result_metadata [Object, nil] extra data stored on the result; it
  #   is also printed when Raif.config.evals_verbose_output is set
  # @return [ExpectationResult] status :passed, :failed, or :error when
  #   evaluating the expectation raised
  def expect(description, result_metadata: nil, &block)
    outcome =
      begin
        verdict = block.call

        if !verdict
          output.puts Raif::Utils::Colors.red(" ✗ #{description}")
          if result_metadata && Raif.config.evals_verbose_output
            output.puts Raif::Utils::Colors.red(" ⎿ #{result_metadata.inspect}")
          end
          ExpectationResult.new(description: description, status: :failed, metadata: result_metadata)
        else
          output.puts Raif::Utils::Colors.green(" ✓ #{description}")
          if result_metadata && Raif.config.evals_verbose_output
            output.puts Raif::Utils::Colors.green(" ⎿ #{result_metadata.inspect}")
          end
          ExpectationResult.new(description: description, status: :passed, metadata: result_metadata)
        end
      rescue StandardError => error
        output.puts Raif::Utils::Colors.red(" ✗ #{description} (Error: #{error.message})")
        ExpectationResult.new(description: description, status: :error, error: error, metadata: result_metadata)
      end

    current_eval.add_expectation_result(outcome)
    outcome
  end

  # Expects that `tool_invoker` has at least one tool invocation whose
  # tool_type equals `tool_type` and, when `with` is non-empty, whose
  # tool_arguments match every given key (looked up by string key).
  #
  # The recorded metadata maps each invoked tool_type to its tool_arguments;
  # when a type was invoked more than once, the last invocation's arguments
  # are the ones kept.
  def expect_tool_invocation(tool_invoker, tool_type, with: {})
    matching = tool_invoker.raif_model_tool_invocations.select do |invocation|
      invocation.tool_type == tool_type &&
        with.all? { |key, value| invocation.tool_arguments[key.to_s] == value }
    end

    invoked_tools = tool_invoker.raif_model_tool_invocations.each_with_object({}) do |invocation, by_type|
      by_type[invocation.tool_type] = invocation.tool_arguments
    end

    label = "invokes #{tool_type}"
    label += " with #{with.to_json}" if with.any?

    expect(label, result_metadata: { invoked_tools: invoked_tools }) { matching.any? }
  end

  # Expects that none of `tool_invoker`'s tool invocations has a tool_name
  # equal to `tool_name`.
  #
  # NOTE(review): this compares `tool_name`, whereas expect_tool_invocation
  # compares `tool_type` — confirm that the two helpers are meant to take
  # different identifiers.
  def expect_no_tool_invocation(tool_invoker, tool_name)
    expect("does not invoke #{tool_name}") do
      tool_invoker.raif_model_tool_invocations.none? do |invocation|
        invocation.tool_name == tool_name
      end
    end
  end
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|