raif 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -7
- data/app/assets/builds/raif.css +4 -1
- data/app/assets/builds/raif_admin.css +52 -2
- data/app/assets/builds/raif_admin_sprockets.js +2709 -0
- data/app/assets/javascript/raif/admin/copy_to_clipboard_controller.js +132 -0
- data/app/assets/javascript/raif/admin/cost_estimate_controller.js +80 -0
- data/app/assets/javascript/raif/admin/judge_config_controller.js +23 -0
- data/app/assets/javascript/raif/admin/select_all_checkboxes_controller.js +33 -0
- data/app/assets/javascript/raif/admin/sortable_table_controller.js +51 -0
- data/app/assets/javascript/raif/admin/table_search_controller.js +15 -0
- data/app/assets/javascript/raif/admin/tom_select_controller.js +33 -0
- data/app/assets/javascript/raif/controllers/conversations_controller.js +1 -1
- data/app/assets/javascript/raif_admin.js +23 -0
- data/app/assets/javascript/raif_admin_sprockets.js +24 -0
- data/app/assets/stylesheets/raif/admin/conversation.scss +16 -0
- data/app/assets/stylesheets/raif/conversations.scss +3 -0
- data/app/assets/stylesheets/raif.scss +2 -1
- data/app/assets/stylesheets/raif_admin.scss +50 -1
- data/app/controllers/raif/admin/agents_controller.rb +27 -1
- data/app/controllers/raif/admin/application_controller.rb +16 -0
- data/app/controllers/raif/admin/configs_controller.rb +95 -0
- data/app/controllers/raif/admin/llms_controller.rb +27 -0
- data/app/controllers/raif/admin/model_completions_controller.rb +24 -1
- data/app/controllers/raif/admin/model_tool_invocations_controller.rb +7 -1
- data/app/controllers/raif/admin/prompt_studio/agents_controller.rb +25 -0
- data/app/controllers/raif/admin/prompt_studio/base_controller.rb +32 -0
- data/app/controllers/raif/admin/prompt_studio/batch_runs_controller.rb +102 -0
- data/app/controllers/raif/admin/prompt_studio/conversations_controller.rb +25 -0
- data/app/controllers/raif/admin/prompt_studio/tasks_controller.rb +64 -0
- data/app/controllers/raif/admin/stats/model_tool_invocations_controller.rb +21 -0
- data/app/controllers/raif/admin/stats/tasks_controller.rb +15 -6
- data/app/controllers/raif/admin/stats_controller.rb +32 -3
- data/app/controllers/raif/admin/tasks_controller.rb +5 -0
- data/app/controllers/raif/conversation_entries_controller.rb +1 -0
- data/app/controllers/raif/conversations_controller.rb +10 -2
- data/app/helpers/raif/application_helper.rb +40 -0
- data/app/jobs/raif/conversation_entry_job.rb +8 -6
- data/app/jobs/raif/prompt_studio_batch_run_item_job.rb +11 -0
- data/app/jobs/raif/prompt_studio_batch_run_job.rb +15 -0
- data/app/jobs/raif/prompt_studio_task_run_job.rb +36 -0
- data/app/models/raif/admin/task_stat.rb +7 -0
- data/app/models/raif/agent.rb +98 -6
- data/app/models/raif/agents/native_tool_calling_agent.rb +179 -52
- data/app/models/raif/application_record.rb +18 -0
- data/app/models/raif/concerns/agent_inference_stats.rb +35 -0
- data/app/models/raif/concerns/has_prompt_templates.rb +88 -0
- data/app/models/raif/concerns/has_runtime_duration.rb +41 -0
- data/app/models/raif/concerns/json_schema_definition.rb +54 -6
- data/app/models/raif/concerns/llm_prompt_caching.rb +20 -0
- data/app/models/raif/concerns/llms/anthropic/message_formatting.rb +34 -0
- data/app/models/raif/concerns/llms/anthropic/response_tool_calls.rb +24 -0
- data/app/models/raif/concerns/llms/anthropic/tool_formatting.rb +8 -0
- data/app/models/raif/concerns/llms/bedrock/message_formatting.rb +43 -0
- data/app/models/raif/concerns/llms/bedrock/response_tool_calls.rb +26 -0
- data/app/models/raif/concerns/llms/bedrock/tool_formatting.rb +8 -0
- data/app/models/raif/concerns/llms/google/message_formatting.rb +112 -0
- data/app/models/raif/concerns/llms/google/response_tool_calls.rb +32 -0
- data/app/models/raif/concerns/llms/google/tool_formatting.rb +76 -0
- data/app/models/raif/concerns/llms/message_formatting.rb +41 -5
- data/app/models/raif/concerns/llms/open_ai/json_schema_validation.rb +3 -3
- data/app/models/raif/concerns/llms/open_ai_completions/message_formatting.rb +22 -0
- data/app/models/raif/concerns/llms/open_ai_completions/response_tool_calls.rb +22 -0
- data/app/models/raif/concerns/llms/open_ai_completions/tool_formatting.rb +8 -0
- data/app/models/raif/concerns/llms/open_ai_responses/message_formatting.rb +17 -0
- data/app/models/raif/concerns/llms/open_ai_responses/response_tool_calls.rb +26 -0
- data/app/models/raif/concerns/llms/open_ai_responses/tool_formatting.rb +8 -0
- data/app/models/raif/concerns/provider_managed_tool_calls.rb +162 -0
- data/app/models/raif/concerns/run_with.rb +127 -0
- data/app/models/raif/conversation.rb +112 -8
- data/app/models/raif/conversation_entry.rb +38 -4
- data/app/models/raif/embedding_model.rb +2 -1
- data/app/models/raif/embedding_models/bedrock.rb +10 -1
- data/app/models/raif/embedding_models/google.rb +37 -0
- data/app/models/raif/embedding_models/open_ai.rb +1 -1
- data/app/models/raif/evals/llm_judge.rb +70 -0
- data/{lib → app/models}/raif/evals/llm_judges/binary.rb +41 -3
- data/{lib → app/models}/raif/evals/llm_judges/comparative.rb +41 -3
- data/{lib → app/models}/raif/evals/llm_judges/scored.rb +39 -1
- data/{lib → app/models}/raif/evals/llm_judges/summarization.rb +40 -2
- data/app/models/raif/llm.rb +104 -4
- data/app/models/raif/llms/anthropic.rb +32 -22
- data/app/models/raif/llms/bedrock.rb +64 -24
- data/app/models/raif/llms/google.rb +166 -0
- data/app/models/raif/llms/open_ai_base.rb +23 -5
- data/app/models/raif/llms/open_ai_completions.rb +14 -12
- data/app/models/raif/llms/open_ai_responses.rb +14 -17
- data/app/models/raif/llms/open_router.rb +16 -15
- data/app/models/raif/model_completion.rb +103 -1
- data/app/models/raif/model_tool.rb +55 -5
- data/app/models/raif/model_tool_invocation.rb +68 -6
- data/app/models/raif/model_tools/agent_final_answer.rb +2 -7
- data/app/models/raif/model_tools/provider_managed/code_execution.rb +4 -0
- data/app/models/raif/model_tools/provider_managed/image_generation.rb +4 -0
- data/app/models/raif/model_tools/provider_managed/web_search.rb +4 -0
- data/app/models/raif/prompt_studio_batch_run.rb +155 -0
- data/app/models/raif/prompt_studio_batch_run_item.rb +220 -0
- data/app/models/raif/streaming_responses/bedrock.rb +60 -1
- data/app/models/raif/streaming_responses/google.rb +71 -0
- data/app/models/raif/task.rb +85 -18
- data/app/models/raif/user_tool_invocation.rb +19 -0
- data/app/views/layouts/raif/admin.html.erb +43 -2
- data/app/views/raif/admin/agents/_agent.html.erb +9 -0
- data/app/views/raif/admin/agents/_conversation_message.html.erb +28 -6
- data/app/views/raif/admin/agents/index.html.erb +50 -0
- data/app/views/raif/admin/agents/show.html.erb +50 -1
- data/app/views/raif/admin/configs/show.html.erb +117 -0
- data/app/views/raif/admin/conversations/_conversation_entry.html.erb +29 -34
- data/app/views/raif/admin/conversations/show.html.erb +2 -0
- data/app/views/raif/admin/llms/index.html.erb +110 -0
- data/app/views/raif/admin/model_completions/_model_completion.html.erb +10 -5
- data/app/views/raif/admin/model_completions/index.html.erb +40 -1
- data/app/views/raif/admin/model_completions/show.html.erb +256 -84
- data/app/views/raif/admin/model_tool_invocations/index.html.erb +22 -1
- data/app/views/raif/admin/model_tool_invocations/show.html.erb +18 -0
- data/app/views/raif/admin/model_tools/_list.html.erb +16 -0
- data/app/views/raif/admin/model_tools/_model_tool.html.erb +36 -0
- data/app/views/raif/admin/prompt_studio/agents/index.html.erb +56 -0
- data/app/views/raif/admin/prompt_studio/agents/show.html.erb +57 -0
- data/app/views/raif/admin/prompt_studio/batch_runs/_batch_run_item.html.erb +54 -0
- data/app/views/raif/admin/prompt_studio/batch_runs/_judge_config_fields.html.erb +76 -0
- data/app/views/raif/admin/prompt_studio/batch_runs/_judge_detail_modal.html.erb +27 -0
- data/app/views/raif/admin/prompt_studio/batch_runs/_modal.html.erb +35 -0
- data/app/views/raif/admin/prompt_studio/batch_runs/_progress.html.erb +78 -0
- data/app/views/raif/admin/prompt_studio/batch_runs/show.html.erb +49 -0
- data/app/views/raif/admin/prompt_studio/conversations/index.html.erb +48 -0
- data/app/views/raif/admin/prompt_studio/conversations/show.html.erb +36 -0
- data/app/views/raif/admin/prompt_studio/shared/_nav_tabs.html.erb +17 -0
- data/app/views/raif/admin/prompt_studio/shared/_prompt_comparison.html.erb +87 -0
- data/app/views/raif/admin/prompt_studio/shared/_type_filter.html.erb +54 -0
- data/app/views/raif/admin/prompt_studio/tasks/_task_result.html.erb +145 -0
- data/app/views/raif/admin/prompt_studio/tasks/_task_row.html.erb +12 -0
- data/app/views/raif/admin/prompt_studio/tasks/_task_type_filter.html.erb +58 -0
- data/app/views/raif/admin/prompt_studio/tasks/_tasks_table.html.erb +22 -0
- data/app/views/raif/admin/prompt_studio/tasks/index.html.erb +35 -0
- data/app/views/raif/admin/prompt_studio/tasks/show.html.erb +19 -0
- data/app/views/raif/admin/stats/_stats_tile.html.erb +34 -0
- data/app/views/raif/admin/stats/index.html.erb +71 -88
- data/app/views/raif/admin/stats/model_tool_invocations/index.html.erb +43 -0
- data/app/views/raif/admin/stats/tasks/index.html.erb +20 -6
- data/app/views/raif/admin/tasks/_task.html.erb +1 -0
- data/app/views/raif/admin/tasks/index.html.erb +23 -6
- data/app/views/raif/admin/tasks/show.html.erb +56 -3
- data/app/views/raif/conversation_entries/_form.html.erb +3 -0
- data/app/views/raif/conversation_entries/_message.html.erb +10 -6
- data/app/views/raif/conversations/_conversation.html.erb +10 -0
- data/app/views/raif/conversations/_entry_processed.turbo_stream.erb +12 -0
- data/app/views/raif/conversations/index.html.erb +23 -0
- data/config/importmap.rb +8 -0
- data/config/locales/admin.en.yml +161 -1
- data/config/locales/en.yml +67 -4
- data/config/routes.rb +10 -0
- data/db/migrate/20250904194456_add_generating_entry_response_to_raif_conversations.rb +7 -0
- data/db/migrate/20250911125234_add_source_to_raif_tasks.rb +7 -0
- data/db/migrate/20251020005853_add_source_to_raif_agents.rb +7 -0
- data/db/migrate/20251020011346_rename_task_run_args_to_run_with.rb +7 -0
- data/db/migrate/20251020011405_add_run_with_to_raif_agents.rb +13 -0
- data/db/migrate/20251024160119_add_llm_messages_max_length_to_raif_conversations.rb +14 -0
- data/db/migrate/20251124185033_add_provider_tool_call_id_to_raif_model_tool_invocations.rb +7 -0
- data/db/migrate/20251128202941_add_tool_choice_to_raif_model_completions.rb +7 -0
- data/db/migrate/20260118144846_add_source_to_raif_conversations.rb +7 -0
- data/db/migrate/20260119000000_add_failure_tracking_to_raif_model_completions.rb +10 -0
- data/db/migrate/20260119000001_add_completed_at_to_raif_model_completions.rb +8 -0
- data/db/migrate/20260119000002_add_started_at_to_raif_model_completions.rb +8 -0
- data/db/migrate/20260307000000_add_prompt_studio_run_to_raif_tasks.rb +7 -0
- data/db/migrate/20260308000000_create_raif_prompt_studio_batch_runs.rb +27 -0
- data/db/migrate/20260308000001_create_raif_prompt_studio_batch_run_items.rb +24 -0
- data/db/migrate/20260407000000_add_cache_token_columns_to_raif_model_completions.rb +8 -0
- data/lib/generators/raif/agent/agent_generator.rb +18 -0
- data/lib/generators/raif/agent/templates/agent.rb.tt +7 -5
- data/lib/generators/raif/agent/templates/application_agent.rb.tt +1 -1
- data/lib/generators/raif/agent/templates/system_prompt.erb.tt +3 -0
- data/lib/generators/raif/conversation/conversation_generator.rb +19 -1
- data/lib/generators/raif/conversation/templates/conversation.rb.tt +6 -0
- data/lib/generators/raif/conversation/templates/system_prompt.erb.tt +4 -0
- data/lib/generators/raif/install/templates/initializer.rb +117 -8
- data/lib/generators/raif/task/task_generator.rb +18 -0
- data/lib/generators/raif/task/templates/prompt.erb.tt +4 -0
- data/lib/generators/raif/task/templates/task.rb.tt +10 -9
- data/lib/raif/configuration.rb +47 -2
- data/lib/raif/embedding_model_registry.rb +8 -0
- data/lib/raif/engine.rb +24 -1
- data/lib/raif/errors/blank_response_error.rb +8 -0
- data/lib/raif/errors/instance_dependent_schema_error.rb +8 -0
- data/lib/raif/errors/prompt_template_error.rb +15 -0
- data/lib/raif/errors/streaming_error.rb +6 -3
- data/lib/raif/errors.rb +3 -0
- data/lib/raif/evals/run.rb +1 -0
- data/lib/raif/evals.rb +0 -6
- data/lib/raif/json_schema_builder.rb +14 -0
- data/lib/raif/llm_registry.rb +433 -42
- data/lib/raif/messages.rb +180 -0
- data/lib/raif/prompt_studio_comparison_builder.rb +138 -0
- data/lib/raif/token_estimator.rb +28 -0
- data/lib/raif/version.rb +1 -1
- data/lib/raif.rb +11 -0
- data/lib/tasks/annotate_rb.rake +10 -0
- data/spec/support/rspec_helpers.rb +15 -9
- data/spec/support/test_task.rb +9 -0
- data/spec/support/test_template_task.rb +41 -0
- metadata +108 -15
- data/app/models/raif/agents/re_act_agent.rb +0 -127
- data/app/models/raif/agents/re_act_step.rb +0 -32
- data/app/models/raif/concerns/task_run_args.rb +0 -62
- data/lib/raif/evals/llm_judge.rb +0 -32
- /data/{lib → app/models}/raif/evals/scoring_rubric.rb +0 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# == Schema Information
|
|
4
|
+
#
|
|
5
|
+
# Table name: raif_tasks
|
|
6
|
+
#
|
|
7
|
+
# id :bigint not null, primary key
|
|
8
|
+
# available_model_tools :jsonb not null
|
|
9
|
+
# completed_at :datetime
|
|
10
|
+
# creator_type :string
|
|
11
|
+
# failed_at :datetime
|
|
12
|
+
# llm_model_key :string not null
|
|
13
|
+
# prompt :text
|
|
14
|
+
# prompt_studio_run :boolean default(FALSE), not null
|
|
15
|
+
# raw_response :text
|
|
16
|
+
# requested_language_key :string
|
|
17
|
+
# response_format :integer default("text"), not null
|
|
18
|
+
# run_with :jsonb
|
|
19
|
+
# source_type :string
|
|
20
|
+
# started_at :datetime
|
|
21
|
+
# system_prompt :text
|
|
22
|
+
# type :string not null
|
|
23
|
+
# created_at :datetime not null
|
|
24
|
+
# updated_at :datetime not null
|
|
25
|
+
# creator_id :bigint
|
|
26
|
+
# source_id :bigint
|
|
27
|
+
#
|
|
28
|
+
# Indexes
|
|
29
|
+
#
|
|
30
|
+
# index_raif_tasks_on_completed_at (completed_at)
|
|
31
|
+
# index_raif_tasks_on_created_at (created_at)
|
|
32
|
+
# index_raif_tasks_on_creator (creator_type,creator_id)
|
|
33
|
+
# index_raif_tasks_on_failed_at (failed_at)
|
|
34
|
+
# index_raif_tasks_on_source (source_type,source_id)
|
|
35
|
+
# index_raif_tasks_on_started_at (started_at)
|
|
36
|
+
# index_raif_tasks_on_type (type)
|
|
37
|
+
# index_raif_tasks_on_type_and_completed_at (type,completed_at)
|
|
38
|
+
# index_raif_tasks_on_type_and_failed_at (type,failed_at)
|
|
39
|
+
# index_raif_tasks_on_type_and_started_at (type,started_at)
|
|
40
|
+
#
|
|
41
|
+
module Raif
|
|
42
|
+
module Evals
|
|
43
|
+
class LlmJudge < Raif::Task
|
|
44
|
+
# Set default temperature for consistent judging
|
|
45
|
+
llm_temperature 0.0
|
|
46
|
+
|
|
47
|
+
# Default to JSON response format for structured output
|
|
48
|
+
llm_response_format :json
|
|
49
|
+
|
|
50
|
+
run_with :content_to_judge # the content to judge
|
|
51
|
+
run_with :additional_context # additional context to be provided to the judge
|
|
52
|
+
|
|
53
|
+
def default_llm_model_key
|
|
54
|
+
Raif.config.evals_default_llm_judge_model_key || super
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def judgment_reasoning
|
|
58
|
+
parsed_response["reasoning"] if completed?
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def judgment_confidence
|
|
62
|
+
parsed_response["confidence"] if completed?
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def low_confidence?
|
|
66
|
+
judgment_confidence && judgment_confidence < 0.5
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -1,12 +1,50 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# == Schema Information
|
|
4
|
+
#
|
|
5
|
+
# Table name: raif_tasks
|
|
6
|
+
#
|
|
7
|
+
# id :bigint not null, primary key
|
|
8
|
+
# available_model_tools :jsonb not null
|
|
9
|
+
# completed_at :datetime
|
|
10
|
+
# creator_type :string
|
|
11
|
+
# failed_at :datetime
|
|
12
|
+
# llm_model_key :string not null
|
|
13
|
+
# prompt :text
|
|
14
|
+
# prompt_studio_run :boolean default(FALSE), not null
|
|
15
|
+
# raw_response :text
|
|
16
|
+
# requested_language_key :string
|
|
17
|
+
# response_format :integer default("text"), not null
|
|
18
|
+
# run_with :jsonb
|
|
19
|
+
# source_type :string
|
|
20
|
+
# started_at :datetime
|
|
21
|
+
# system_prompt :text
|
|
22
|
+
# type :string not null
|
|
23
|
+
# created_at :datetime not null
|
|
24
|
+
# updated_at :datetime not null
|
|
25
|
+
# creator_id :bigint
|
|
26
|
+
# source_id :bigint
|
|
27
|
+
#
|
|
28
|
+
# Indexes
|
|
29
|
+
#
|
|
30
|
+
# index_raif_tasks_on_completed_at (completed_at)
|
|
31
|
+
# index_raif_tasks_on_created_at (created_at)
|
|
32
|
+
# index_raif_tasks_on_creator (creator_type,creator_id)
|
|
33
|
+
# index_raif_tasks_on_failed_at (failed_at)
|
|
34
|
+
# index_raif_tasks_on_source (source_type,source_id)
|
|
35
|
+
# index_raif_tasks_on_started_at (started_at)
|
|
36
|
+
# index_raif_tasks_on_type (type)
|
|
37
|
+
# index_raif_tasks_on_type_and_completed_at (type,completed_at)
|
|
38
|
+
# index_raif_tasks_on_type_and_failed_at (type,failed_at)
|
|
39
|
+
# index_raif_tasks_on_type_and_started_at (type,started_at)
|
|
40
|
+
#
|
|
3
41
|
module Raif
|
|
4
42
|
module Evals
|
|
5
43
|
module LlmJudges
|
|
6
44
|
class Binary < Raif::Evals::LlmJudge
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
45
|
+
run_with :criteria
|
|
46
|
+
run_with :examples
|
|
47
|
+
run_with :strict_mode
|
|
10
48
|
|
|
11
49
|
json_response_schema do
|
|
12
50
|
boolean :passes, description: "Whether the content passes the criteria"
|
|
@@ -1,12 +1,50 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# == Schema Information
|
|
4
|
+
#
|
|
5
|
+
# Table name: raif_tasks
|
|
6
|
+
#
|
|
7
|
+
# id :bigint not null, primary key
|
|
8
|
+
# available_model_tools :jsonb not null
|
|
9
|
+
# completed_at :datetime
|
|
10
|
+
# creator_type :string
|
|
11
|
+
# failed_at :datetime
|
|
12
|
+
# llm_model_key :string not null
|
|
13
|
+
# prompt :text
|
|
14
|
+
# prompt_studio_run :boolean default(FALSE), not null
|
|
15
|
+
# raw_response :text
|
|
16
|
+
# requested_language_key :string
|
|
17
|
+
# response_format :integer default("text"), not null
|
|
18
|
+
# run_with :jsonb
|
|
19
|
+
# source_type :string
|
|
20
|
+
# started_at :datetime
|
|
21
|
+
# system_prompt :text
|
|
22
|
+
# type :string not null
|
|
23
|
+
# created_at :datetime not null
|
|
24
|
+
# updated_at :datetime not null
|
|
25
|
+
# creator_id :bigint
|
|
26
|
+
# source_id :bigint
|
|
27
|
+
#
|
|
28
|
+
# Indexes
|
|
29
|
+
#
|
|
30
|
+
# index_raif_tasks_on_completed_at (completed_at)
|
|
31
|
+
# index_raif_tasks_on_created_at (created_at)
|
|
32
|
+
# index_raif_tasks_on_creator (creator_type,creator_id)
|
|
33
|
+
# index_raif_tasks_on_failed_at (failed_at)
|
|
34
|
+
# index_raif_tasks_on_source (source_type,source_id)
|
|
35
|
+
# index_raif_tasks_on_started_at (started_at)
|
|
36
|
+
# index_raif_tasks_on_type (type)
|
|
37
|
+
# index_raif_tasks_on_type_and_completed_at (type,completed_at)
|
|
38
|
+
# index_raif_tasks_on_type_and_failed_at (type,failed_at)
|
|
39
|
+
# index_raif_tasks_on_type_and_started_at (type,started_at)
|
|
40
|
+
#
|
|
3
41
|
module Raif
|
|
4
42
|
module Evals
|
|
5
43
|
module LlmJudges
|
|
6
44
|
class Comparative < Raif::Evals::LlmJudge
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
45
|
+
run_with :over_content # the content to compare against
|
|
46
|
+
run_with :comparison_criteria # the criteria to use when comparing content_to_judge to over_content
|
|
47
|
+
run_with :allow_ties # whether to allow ties in the comparison
|
|
10
48
|
|
|
11
49
|
attr_accessor :content_a, :content_b, :expected_winner
|
|
12
50
|
|
|
@@ -1,10 +1,48 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# == Schema Information
|
|
4
|
+
#
|
|
5
|
+
# Table name: raif_tasks
|
|
6
|
+
#
|
|
7
|
+
# id :bigint not null, primary key
|
|
8
|
+
# available_model_tools :jsonb not null
|
|
9
|
+
# completed_at :datetime
|
|
10
|
+
# creator_type :string
|
|
11
|
+
# failed_at :datetime
|
|
12
|
+
# llm_model_key :string not null
|
|
13
|
+
# prompt :text
|
|
14
|
+
# prompt_studio_run :boolean default(FALSE), not null
|
|
15
|
+
# raw_response :text
|
|
16
|
+
# requested_language_key :string
|
|
17
|
+
# response_format :integer default("text"), not null
|
|
18
|
+
# run_with :jsonb
|
|
19
|
+
# source_type :string
|
|
20
|
+
# started_at :datetime
|
|
21
|
+
# system_prompt :text
|
|
22
|
+
# type :string not null
|
|
23
|
+
# created_at :datetime not null
|
|
24
|
+
# updated_at :datetime not null
|
|
25
|
+
# creator_id :bigint
|
|
26
|
+
# source_id :bigint
|
|
27
|
+
#
|
|
28
|
+
# Indexes
|
|
29
|
+
#
|
|
30
|
+
# index_raif_tasks_on_completed_at (completed_at)
|
|
31
|
+
# index_raif_tasks_on_created_at (created_at)
|
|
32
|
+
# index_raif_tasks_on_creator (creator_type,creator_id)
|
|
33
|
+
# index_raif_tasks_on_failed_at (failed_at)
|
|
34
|
+
# index_raif_tasks_on_source (source_type,source_id)
|
|
35
|
+
# index_raif_tasks_on_started_at (started_at)
|
|
36
|
+
# index_raif_tasks_on_type (type)
|
|
37
|
+
# index_raif_tasks_on_type_and_completed_at (type,completed_at)
|
|
38
|
+
# index_raif_tasks_on_type_and_failed_at (type,failed_at)
|
|
39
|
+
# index_raif_tasks_on_type_and_started_at (type,started_at)
|
|
40
|
+
#
|
|
3
41
|
module Raif
|
|
4
42
|
module Evals
|
|
5
43
|
module LlmJudges
|
|
6
44
|
class Scored < Raif::Evals::LlmJudge
|
|
7
|
-
|
|
45
|
+
run_with :scoring_rubric # the scoring rubric to use when evaluating the content
|
|
8
46
|
|
|
9
47
|
json_response_schema do
|
|
10
48
|
number :score, description: "Numerical score based on the rubric"
|
|
@@ -1,11 +1,49 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# == Schema Information
|
|
4
|
+
#
|
|
5
|
+
# Table name: raif_tasks
|
|
6
|
+
#
|
|
7
|
+
# id :bigint not null, primary key
|
|
8
|
+
# available_model_tools :jsonb not null
|
|
9
|
+
# completed_at :datetime
|
|
10
|
+
# creator_type :string
|
|
11
|
+
# failed_at :datetime
|
|
12
|
+
# llm_model_key :string not null
|
|
13
|
+
# prompt :text
|
|
14
|
+
# prompt_studio_run :boolean default(FALSE), not null
|
|
15
|
+
# raw_response :text
|
|
16
|
+
# requested_language_key :string
|
|
17
|
+
# response_format :integer default("text"), not null
|
|
18
|
+
# run_with :jsonb
|
|
19
|
+
# source_type :string
|
|
20
|
+
# started_at :datetime
|
|
21
|
+
# system_prompt :text
|
|
22
|
+
# type :string not null
|
|
23
|
+
# created_at :datetime not null
|
|
24
|
+
# updated_at :datetime not null
|
|
25
|
+
# creator_id :bigint
|
|
26
|
+
# source_id :bigint
|
|
27
|
+
#
|
|
28
|
+
# Indexes
|
|
29
|
+
#
|
|
30
|
+
# index_raif_tasks_on_completed_at (completed_at)
|
|
31
|
+
# index_raif_tasks_on_created_at (created_at)
|
|
32
|
+
# index_raif_tasks_on_creator (creator_type,creator_id)
|
|
33
|
+
# index_raif_tasks_on_failed_at (failed_at)
|
|
34
|
+
# index_raif_tasks_on_source (source_type,source_id)
|
|
35
|
+
# index_raif_tasks_on_started_at (started_at)
|
|
36
|
+
# index_raif_tasks_on_type (type)
|
|
37
|
+
# index_raif_tasks_on_type_and_completed_at (type,completed_at)
|
|
38
|
+
# index_raif_tasks_on_type_and_failed_at (type,failed_at)
|
|
39
|
+
# index_raif_tasks_on_type_and_started_at (type,started_at)
|
|
40
|
+
#
|
|
3
41
|
module Raif
|
|
4
42
|
module Evals
|
|
5
43
|
module LlmJudges
|
|
6
44
|
class Summarization < Raif::Evals::LlmJudge
|
|
7
|
-
|
|
8
|
-
|
|
45
|
+
run_with :original_content # the original content to evaluate the summary against
|
|
46
|
+
run_with :summary # the summary to evaluate against the original content
|
|
9
47
|
|
|
10
48
|
json_response_schema do
|
|
11
49
|
object :coverage do
|
data/app/models/raif/llm.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Raif
|
|
|
7
7
|
|
|
8
8
|
attr_accessor :key,
|
|
9
9
|
:api_name,
|
|
10
|
+
:display_name,
|
|
10
11
|
:default_temperature,
|
|
11
12
|
:default_max_completion_tokens,
|
|
12
13
|
:supports_native_tool_use,
|
|
@@ -25,6 +26,7 @@ module Raif
|
|
|
25
26
|
def initialize(
|
|
26
27
|
key:,
|
|
27
28
|
api_name:,
|
|
29
|
+
display_name: nil,
|
|
28
30
|
model_provider_settings: {},
|
|
29
31
|
supported_provider_managed_tools: [],
|
|
30
32
|
supports_native_tool_use: true,
|
|
@@ -35,6 +37,7 @@ module Raif
|
|
|
35
37
|
)
|
|
36
38
|
@key = key
|
|
37
39
|
@api_name = api_name
|
|
40
|
+
@display_name = display_name
|
|
38
41
|
@provider_settings = model_provider_settings
|
|
39
42
|
@supports_native_tool_use = supports_native_tool_use
|
|
40
43
|
@default_temperature = temperature || 0.7
|
|
@@ -45,11 +48,11 @@ module Raif
|
|
|
45
48
|
end
|
|
46
49
|
|
|
47
50
|
def name
|
|
48
|
-
I18n.t("raif.model_names.#{key}")
|
|
51
|
+
I18n.t("raif.model_names.#{key}", default: display_name || key.to_s.humanize)
|
|
49
52
|
end
|
|
50
53
|
|
|
51
54
|
def chat(message: nil, messages: nil, response_format: :text, available_model_tools: [], source: nil, system_prompt: nil, temperature: nil,
|
|
52
|
-
max_completion_tokens: nil, &block)
|
|
55
|
+
max_completion_tokens: nil, tool_choice: nil, anthropic_prompt_caching_enabled: false, bedrock_prompt_caching_enabled: false, &block)
|
|
53
56
|
unless response_format.is_a?(Symbol)
|
|
54
57
|
raise ArgumentError,
|
|
55
58
|
"Raif::Llm#chat - Invalid response format: #{response_format}. Must be a symbol (you passed #{response_format.class}) and be one of: #{VALID_RESPONSE_FORMATS.join(", ")}" # rubocop:disable Layout/LineLength
|
|
@@ -67,6 +70,19 @@ module Raif
|
|
|
67
70
|
raise ArgumentError, "Raif::Llm#chat - You must provide either a message: or messages: argument, not both"
|
|
68
71
|
end
|
|
69
72
|
|
|
73
|
+
# Normalize :required / "required" to the symbol form for validation
|
|
74
|
+
tool_choice = :required if tool_choice.to_s == "required"
|
|
75
|
+
|
|
76
|
+
if tool_choice == :required
|
|
77
|
+
if available_model_tools.blank?
|
|
78
|
+
raise ArgumentError,
|
|
79
|
+
"Raif::Llm#chat - tool_choice: :required requires at least one available model tool"
|
|
80
|
+
end
|
|
81
|
+
elsif tool_choice.present? && !available_model_tools.map(&:to_s).include?(tool_choice.to_s)
|
|
82
|
+
raise ArgumentError,
|
|
83
|
+
"Raif::Llm#chat - Invalid tool choice: #{tool_choice} is not included in the available model tools: #{available_model_tools.join(", ")}"
|
|
84
|
+
end
|
|
85
|
+
|
|
70
86
|
unless Raif.config.llm_api_requests_enabled
|
|
71
87
|
Raif.logger.warn("LLM API requests are disabled. Skipping request to #{api_name}.")
|
|
72
88
|
return
|
|
@@ -87,20 +103,33 @@ module Raif
|
|
|
87
103
|
temperature: temperature,
|
|
88
104
|
max_completion_tokens: max_completion_tokens,
|
|
89
105
|
available_model_tools: available_model_tools,
|
|
106
|
+
tool_choice: tool_choice&.to_s,
|
|
90
107
|
stream_response: block_given?
|
|
91
108
|
)
|
|
92
109
|
|
|
110
|
+
model_completion.anthropic_prompt_caching_enabled = anthropic_prompt_caching_enabled
|
|
111
|
+
model_completion.bedrock_prompt_caching_enabled = bedrock_prompt_caching_enabled
|
|
112
|
+
|
|
113
|
+
model_completion.started!
|
|
114
|
+
|
|
93
115
|
retry_with_backoff(model_completion) do
|
|
94
116
|
perform_model_completion!(model_completion, &block)
|
|
117
|
+
ensure_model_completion_present!(model_completion)
|
|
95
118
|
end
|
|
96
119
|
|
|
120
|
+
model_completion.completed!
|
|
97
121
|
model_completion
|
|
98
122
|
rescue Raif::Errors::StreamingError => e
|
|
99
123
|
Rails.logger.error("Raif streaming error -- code: #{e.code} -- type: #{e.type} -- message: #{e.message} -- event: #{e.event}")
|
|
124
|
+
model_completion&.record_failure!(e) unless model_completion&.failed?
|
|
100
125
|
raise e
|
|
101
126
|
rescue Faraday::Error => e
|
|
102
127
|
Raif.logger.error("LLM API request failed (status: #{e.response_status}): #{e.message}")
|
|
103
128
|
Raif.logger.error(e.response_body)
|
|
129
|
+
model_completion&.record_failure!(e) unless model_completion&.failed?
|
|
130
|
+
raise e
|
|
131
|
+
rescue StandardError => e
|
|
132
|
+
model_completion&.record_failure!(e) unless model_completion&.failed?
|
|
104
133
|
raise e
|
|
105
134
|
end
|
|
106
135
|
|
|
@@ -112,10 +141,52 @@ module Raif
|
|
|
112
141
|
VALID_RESPONSE_FORMATS
|
|
113
142
|
end
|
|
114
143
|
|
|
144
|
+
# Override in subclasses to indicate whether prompt_tokens reported by the
|
|
145
|
+
# provider already include cached tokens as a subset (OpenAI, Google,
|
|
146
|
+
# OpenRouter) or whether cached tokens are reported separately and are
|
|
147
|
+
# additive to prompt_tokens (Anthropic, Bedrock).
|
|
148
|
+
def self.prompt_tokens_include_cached_tokens?
|
|
149
|
+
true
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
# Multiplier applied to the base input_token_cost to derive the per-token
|
|
153
|
+
# cost for cache reads. Return nil when the provider has no cache pricing.
|
|
154
|
+
def self.cache_read_input_token_cost_multiplier
|
|
155
|
+
nil
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
# Multiplier applied to the base input_token_cost to derive the per-token
|
|
159
|
+
# cost for cache creation writes. Return nil when there is no write surcharge.
|
|
160
|
+
def self.cache_creation_input_token_cost_multiplier
|
|
161
|
+
nil
|
|
162
|
+
end
|
|
163
|
+
|
|
115
164
|
def supports_provider_managed_tool?(tool_klass)
|
|
116
165
|
supported_provider_managed_tools&.include?(tool_klass.to_s)
|
|
117
166
|
end
|
|
118
167
|
|
|
168
|
+
# Build the tool_choice parameter to force a specific tool to be called.
|
|
169
|
+
# Each provider implements this to return the correct format.
|
|
170
|
+
# @param tool_name [String] The name of the tool to force
|
|
171
|
+
# @return [Hash] The tool_choice parameter for the provider's API
|
|
172
|
+
def build_forced_tool_choice(tool_name)
|
|
173
|
+
raise NotImplementedError, "#{self.class.name} must implement #build_forced_tool_choice"
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
# Build the tool_choice parameter to require the model to call any tool (but not a specific one).
|
|
177
|
+
# Each provider implements this to return the correct format.
|
|
178
|
+
# @return [Hash, String] The tool_choice parameter for the provider's API
|
|
179
|
+
def build_required_tool_choice
|
|
180
|
+
raise NotImplementedError, "#{self.class.name} must implement #build_required_tool_choice"
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Whether the provider can faithfully enforce tool_choice: :required for
|
|
184
|
+
# the given tool set. Override in subclasses when a provider can only
|
|
185
|
+
# enforce required tool use for some tool types.
|
|
186
|
+
def supports_faithful_required_tool_choice?(available_model_tools)
|
|
187
|
+
available_model_tools.present?
|
|
188
|
+
end
|
|
189
|
+
|
|
119
190
|
def validate_provider_managed_tool_support!(tool)
|
|
120
191
|
unless supports_provider_managed_tool?(tool)
|
|
121
192
|
raise Raif::Errors::UnsupportedFeatureError,
|
|
@@ -125,6 +196,10 @@ module Raif
|
|
|
125
196
|
|
|
126
197
|
private
|
|
127
198
|
|
|
199
|
+
def retriable_exceptions
|
|
200
|
+
Raif.config.llm_request_retriable_exceptions
|
|
201
|
+
end
|
|
202
|
+
|
|
128
203
|
def retry_with_backoff(model_completion)
|
|
129
204
|
retries = 0
|
|
130
205
|
max_retries = Raif.config.llm_request_max_retries
|
|
@@ -133,11 +208,11 @@ module Raif
|
|
|
133
208
|
|
|
134
209
|
begin
|
|
135
210
|
yield
|
|
136
|
-
rescue *
|
|
211
|
+
rescue *retriable_exceptions => e
|
|
137
212
|
retries += 1
|
|
138
213
|
if retries <= max_retries
|
|
139
214
|
delay = [base_delay * (2**(retries - 1)), max_delay].min
|
|
140
|
-
|
|
215
|
+
log_retry(e, model_completion, retries, max_retries, delay)
|
|
141
216
|
model_completion.increment!(:retry_count)
|
|
142
217
|
sleep delay
|
|
143
218
|
retry
|
|
@@ -148,10 +223,35 @@ module Raif
|
|
|
148
223
|
end
|
|
149
224
|
end
|
|
150
225
|
|
|
226
|
+
def log_retry(error, model_completion, attempt, max_retries, delay)
|
|
227
|
+
if error.is_a?(Raif::Errors::BlankResponseError)
|
|
228
|
+
has_reasoning = model_completion.response_array&.any? do |block|
|
|
229
|
+
block.is_a?(Hash) ? block.key?("reasoning_content") : block.respond_to?(:reasoning_content)
|
|
230
|
+
end
|
|
231
|
+
Raif.logger.warn(
|
|
232
|
+
"Blank response retry #{attempt}/#{max_retries} for #{api_name} " \
|
|
233
|
+
"(ModelCompletion##{model_completion.id}, source: #{model_completion.source_type}##{model_completion.source_id}, " \
|
|
234
|
+
"completion_tokens: #{model_completion.completion_tokens}, reasoning_content_present: #{has_reasoning}). " \
|
|
235
|
+
"Waiting #{delay} seconds..."
|
|
236
|
+
)
|
|
237
|
+
else
|
|
238
|
+
Raif.logger.warn("Retrying LLM API request after error: #{error.message}. Attempt #{attempt}/#{max_retries}. Waiting #{delay} seconds...")
|
|
239
|
+
end
|
|
240
|
+
end
|
|
241
|
+
|
|
151
242
|
def streaming_response_type
|
|
152
243
|
raise NotImplementedError, "#{self.class.name} must implement #streaming_response_type"
|
|
153
244
|
end
|
|
154
245
|
|
|
246
|
+
def ensure_model_completion_present!(model_completion)
|
|
247
|
+
# response_array/raw provider data may still be present for debugging even when
|
|
248
|
+
# the normalized response has no text or tool calls.
|
|
249
|
+
return if model_completion.raw_response.present? || model_completion.response_tool_calls.present?
|
|
250
|
+
|
|
251
|
+
raise Raif::Errors::BlankResponseError,
|
|
252
|
+
"Model completion #{model_completion.id} returned no text response and no tool calls"
|
|
253
|
+
end
|
|
254
|
+
|
|
155
255
|
def streaming_chunk_handler(model_completion, &block)
|
|
156
256
|
return unless model_completion.stream_response?
|
|
157
257
|
|
|
@@ -3,6 +3,19 @@
|
|
|
3
3
|
class Raif::Llms::Anthropic < Raif::Llm
|
|
4
4
|
include Raif::Concerns::Llms::Anthropic::MessageFormatting
|
|
5
5
|
include Raif::Concerns::Llms::Anthropic::ToolFormatting
|
|
6
|
+
include Raif::Concerns::Llms::Anthropic::ResponseToolCalls
|
|
7
|
+
|
|
8
|
+
def self.prompt_tokens_include_cached_tokens?
|
|
9
|
+
false
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def self.cache_read_input_token_cost_multiplier
|
|
13
|
+
0.1
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def self.cache_creation_input_token_cost_multiplier
|
|
17
|
+
1.25
|
|
18
|
+
end
|
|
6
19
|
|
|
7
20
|
def perform_model_completion!(model_completion, &block)
|
|
8
21
|
params = build_request_parameters(model_completion)
|
|
@@ -21,7 +34,7 @@ class Raif::Llms::Anthropic < Raif::Llm
|
|
|
21
34
|
private
|
|
22
35
|
|
|
23
36
|
def connection
|
|
24
|
-
@connection ||= Faraday.new(url: "https://api.anthropic.com/v1") do |f|
|
|
37
|
+
@connection ||= Faraday.new(url: "https://api.anthropic.com/v1", request: Raif.default_request_options) do |f|
|
|
25
38
|
f.headers["x-api-key"] = Raif.config.anthropic_api_key
|
|
26
39
|
f.headers["anthropic-version"] = "2023-06-01"
|
|
27
40
|
f.request :json
|
|
@@ -48,22 +61,33 @@ private
|
|
|
48
61
|
model_completion.completion_tokens = response_json&.dig("usage", "output_tokens")
|
|
49
62
|
model_completion.prompt_tokens = response_json&.dig("usage", "input_tokens")
|
|
50
63
|
model_completion.total_tokens = model_completion.completion_tokens.to_i + model_completion.prompt_tokens.to_i
|
|
64
|
+
model_completion.cache_read_input_tokens = response_json&.dig("usage", "cache_read_input_tokens")
|
|
65
|
+
model_completion.cache_creation_input_tokens = response_json&.dig("usage", "cache_creation_input_tokens")
|
|
51
66
|
model_completion.save!
|
|
52
67
|
end
|
|
53
68
|
|
|
54
69
|
def build_request_parameters(model_completion)
|
|
55
70
|
params = {
|
|
56
71
|
model: model_completion.model_api_name,
|
|
57
|
-
messages: model_completion.messages
|
|
58
|
-
temperature: (model_completion.temperature || default_temperature).to_f,
|
|
59
|
-
max_tokens: model_completion.max_completion_tokens || default_max_completion_tokens
|
|
72
|
+
messages: model_completion.messages
|
|
60
73
|
}
|
|
61
74
|
|
|
75
|
+
params[:temperature] = (model_completion.temperature || default_temperature).to_f if supports_temperature?
|
|
76
|
+
params[:max_tokens] = model_completion.max_completion_tokens || default_max_completion_tokens
|
|
77
|
+
|
|
62
78
|
params[:system] = model_completion.system_prompt if model_completion.system_prompt.present?
|
|
79
|
+
params[:cache_control] = { type: "ephemeral" } if model_completion.anthropic_prompt_caching_enabled
|
|
63
80
|
|
|
64
81
|
if supports_native_tool_use?
|
|
65
82
|
tools = build_tools_parameter(model_completion)
|
|
66
83
|
params[:tools] = tools unless tools.blank?
|
|
84
|
+
|
|
85
|
+
if model_completion.tool_choice == "required"
|
|
86
|
+
params[:tool_choice] = build_required_tool_choice
|
|
87
|
+
elsif model_completion.tool_choice.present?
|
|
88
|
+
tool_klass = model_completion.tool_choice.constantize
|
|
89
|
+
params[:tool_choice] = build_forced_tool_choice(tool_klass.tool_name)
|
|
90
|
+
end
|
|
67
91
|
end
|
|
68
92
|
|
|
69
93
|
params[:stream] = true if model_completion.stream_response?
|
|
@@ -71,6 +95,10 @@ private
|
|
|
71
95
|
params
|
|
72
96
|
end
|
|
73
97
|
|
|
98
|
+
def supports_temperature?
|
|
99
|
+
provider_settings.key?(:supports_temperature) ? provider_settings[:supports_temperature] : true
|
|
100
|
+
end
|
|
101
|
+
|
|
74
102
|
def extract_text_response(resp)
|
|
75
103
|
return if resp&.dig("content").blank?
|
|
76
104
|
|
|
@@ -92,24 +120,6 @@ private
|
|
|
92
120
|
end
|
|
93
121
|
end
|
|
94
122
|
|
|
95
|
-
def extract_response_tool_calls(resp)
|
|
96
|
-
return if resp&.dig("content").nil?
|
|
97
|
-
|
|
98
|
-
# Find any tool_use content blocks
|
|
99
|
-
tool_uses = resp&.dig("content")&.select do |content|
|
|
100
|
-
content["type"] == "tool_use"
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
return if tool_uses.blank?
|
|
104
|
-
|
|
105
|
-
tool_uses.map do |tool_use|
|
|
106
|
-
{
|
|
107
|
-
"name" => tool_use["name"],
|
|
108
|
-
"arguments" => tool_use["input"]
|
|
109
|
-
}
|
|
110
|
-
end
|
|
111
|
-
end
|
|
112
|
-
|
|
113
123
|
def extract_citations(resp)
|
|
114
124
|
return [] if resp&.dig("content").nil?
|
|
115
125
|
|