raif 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. checksums.yaml +4 -4
  2. data/README.md +8 -7
  3. data/app/assets/builds/raif.css +4 -1
  4. data/app/assets/builds/raif_admin.css +52 -2
  5. data/app/assets/builds/raif_admin_sprockets.js +2709 -0
  6. data/app/assets/javascript/raif/admin/copy_to_clipboard_controller.js +132 -0
  7. data/app/assets/javascript/raif/admin/cost_estimate_controller.js +80 -0
  8. data/app/assets/javascript/raif/admin/judge_config_controller.js +23 -0
  9. data/app/assets/javascript/raif/admin/select_all_checkboxes_controller.js +33 -0
  10. data/app/assets/javascript/raif/admin/sortable_table_controller.js +51 -0
  11. data/app/assets/javascript/raif/admin/table_search_controller.js +15 -0
  12. data/app/assets/javascript/raif/admin/tom_select_controller.js +33 -0
  13. data/app/assets/javascript/raif/controllers/conversations_controller.js +1 -1
  14. data/app/assets/javascript/raif_admin.js +23 -0
  15. data/app/assets/javascript/raif_admin_sprockets.js +24 -0
  16. data/app/assets/stylesheets/raif/admin/conversation.scss +16 -0
  17. data/app/assets/stylesheets/raif/conversations.scss +3 -0
  18. data/app/assets/stylesheets/raif.scss +2 -1
  19. data/app/assets/stylesheets/raif_admin.scss +50 -1
  20. data/app/controllers/raif/admin/agents_controller.rb +27 -1
  21. data/app/controllers/raif/admin/application_controller.rb +16 -0
  22. data/app/controllers/raif/admin/configs_controller.rb +95 -0
  23. data/app/controllers/raif/admin/llms_controller.rb +27 -0
  24. data/app/controllers/raif/admin/model_completions_controller.rb +24 -1
  25. data/app/controllers/raif/admin/model_tool_invocations_controller.rb +7 -1
  26. data/app/controllers/raif/admin/prompt_studio/agents_controller.rb +25 -0
  27. data/app/controllers/raif/admin/prompt_studio/base_controller.rb +32 -0
  28. data/app/controllers/raif/admin/prompt_studio/batch_runs_controller.rb +102 -0
  29. data/app/controllers/raif/admin/prompt_studio/conversations_controller.rb +25 -0
  30. data/app/controllers/raif/admin/prompt_studio/tasks_controller.rb +64 -0
  31. data/app/controllers/raif/admin/stats/model_tool_invocations_controller.rb +21 -0
  32. data/app/controllers/raif/admin/stats/tasks_controller.rb +15 -6
  33. data/app/controllers/raif/admin/stats_controller.rb +32 -3
  34. data/app/controllers/raif/admin/tasks_controller.rb +5 -0
  35. data/app/controllers/raif/conversation_entries_controller.rb +1 -0
  36. data/app/controllers/raif/conversations_controller.rb +10 -2
  37. data/app/helpers/raif/application_helper.rb +40 -0
  38. data/app/jobs/raif/conversation_entry_job.rb +8 -6
  39. data/app/jobs/raif/prompt_studio_batch_run_item_job.rb +11 -0
  40. data/app/jobs/raif/prompt_studio_batch_run_job.rb +15 -0
  41. data/app/jobs/raif/prompt_studio_task_run_job.rb +36 -0
  42. data/app/models/raif/admin/task_stat.rb +7 -0
  43. data/app/models/raif/agent.rb +98 -6
  44. data/app/models/raif/agents/native_tool_calling_agent.rb +179 -52
  45. data/app/models/raif/application_record.rb +18 -0
  46. data/app/models/raif/concerns/agent_inference_stats.rb +35 -0
  47. data/app/models/raif/concerns/has_prompt_templates.rb +88 -0
  48. data/app/models/raif/concerns/has_runtime_duration.rb +41 -0
  49. data/app/models/raif/concerns/json_schema_definition.rb +54 -6
  50. data/app/models/raif/concerns/llm_prompt_caching.rb +20 -0
  51. data/app/models/raif/concerns/llms/anthropic/message_formatting.rb +34 -0
  52. data/app/models/raif/concerns/llms/anthropic/response_tool_calls.rb +24 -0
  53. data/app/models/raif/concerns/llms/anthropic/tool_formatting.rb +8 -0
  54. data/app/models/raif/concerns/llms/bedrock/message_formatting.rb +43 -0
  55. data/app/models/raif/concerns/llms/bedrock/response_tool_calls.rb +26 -0
  56. data/app/models/raif/concerns/llms/bedrock/tool_formatting.rb +8 -0
  57. data/app/models/raif/concerns/llms/google/message_formatting.rb +112 -0
  58. data/app/models/raif/concerns/llms/google/response_tool_calls.rb +32 -0
  59. data/app/models/raif/concerns/llms/google/tool_formatting.rb +76 -0
  60. data/app/models/raif/concerns/llms/message_formatting.rb +41 -5
  61. data/app/models/raif/concerns/llms/open_ai/json_schema_validation.rb +3 -3
  62. data/app/models/raif/concerns/llms/open_ai_completions/message_formatting.rb +22 -0
  63. data/app/models/raif/concerns/llms/open_ai_completions/response_tool_calls.rb +22 -0
  64. data/app/models/raif/concerns/llms/open_ai_completions/tool_formatting.rb +8 -0
  65. data/app/models/raif/concerns/llms/open_ai_responses/message_formatting.rb +17 -0
  66. data/app/models/raif/concerns/llms/open_ai_responses/response_tool_calls.rb +26 -0
  67. data/app/models/raif/concerns/llms/open_ai_responses/tool_formatting.rb +8 -0
  68. data/app/models/raif/concerns/provider_managed_tool_calls.rb +162 -0
  69. data/app/models/raif/concerns/run_with.rb +127 -0
  70. data/app/models/raif/conversation.rb +112 -8
  71. data/app/models/raif/conversation_entry.rb +38 -4
  72. data/app/models/raif/embedding_model.rb +2 -1
  73. data/app/models/raif/embedding_models/bedrock.rb +10 -1
  74. data/app/models/raif/embedding_models/google.rb +37 -0
  75. data/app/models/raif/embedding_models/open_ai.rb +1 -1
  76. data/app/models/raif/evals/llm_judge.rb +70 -0
  77. data/{lib → app/models}/raif/evals/llm_judges/binary.rb +41 -3
  78. data/{lib → app/models}/raif/evals/llm_judges/comparative.rb +41 -3
  79. data/{lib → app/models}/raif/evals/llm_judges/scored.rb +39 -1
  80. data/{lib → app/models}/raif/evals/llm_judges/summarization.rb +40 -2
  81. data/app/models/raif/llm.rb +104 -4
  82. data/app/models/raif/llms/anthropic.rb +32 -22
  83. data/app/models/raif/llms/bedrock.rb +64 -24
  84. data/app/models/raif/llms/google.rb +166 -0
  85. data/app/models/raif/llms/open_ai_base.rb +23 -5
  86. data/app/models/raif/llms/open_ai_completions.rb +14 -12
  87. data/app/models/raif/llms/open_ai_responses.rb +14 -17
  88. data/app/models/raif/llms/open_router.rb +16 -15
  89. data/app/models/raif/model_completion.rb +103 -1
  90. data/app/models/raif/model_tool.rb +55 -5
  91. data/app/models/raif/model_tool_invocation.rb +68 -6
  92. data/app/models/raif/model_tools/agent_final_answer.rb +2 -7
  93. data/app/models/raif/model_tools/provider_managed/code_execution.rb +4 -0
  94. data/app/models/raif/model_tools/provider_managed/image_generation.rb +4 -0
  95. data/app/models/raif/model_tools/provider_managed/web_search.rb +4 -0
  96. data/app/models/raif/prompt_studio_batch_run.rb +155 -0
  97. data/app/models/raif/prompt_studio_batch_run_item.rb +220 -0
  98. data/app/models/raif/streaming_responses/bedrock.rb +60 -1
  99. data/app/models/raif/streaming_responses/google.rb +71 -0
  100. data/app/models/raif/task.rb +85 -18
  101. data/app/models/raif/user_tool_invocation.rb +19 -0
  102. data/app/views/layouts/raif/admin.html.erb +43 -2
  103. data/app/views/raif/admin/agents/_agent.html.erb +9 -0
  104. data/app/views/raif/admin/agents/_conversation_message.html.erb +28 -6
  105. data/app/views/raif/admin/agents/index.html.erb +50 -0
  106. data/app/views/raif/admin/agents/show.html.erb +50 -1
  107. data/app/views/raif/admin/configs/show.html.erb +117 -0
  108. data/app/views/raif/admin/conversations/_conversation_entry.html.erb +29 -34
  109. data/app/views/raif/admin/conversations/show.html.erb +2 -0
  110. data/app/views/raif/admin/llms/index.html.erb +110 -0
  111. data/app/views/raif/admin/model_completions/_model_completion.html.erb +10 -5
  112. data/app/views/raif/admin/model_completions/index.html.erb +40 -1
  113. data/app/views/raif/admin/model_completions/show.html.erb +256 -84
  114. data/app/views/raif/admin/model_tool_invocations/index.html.erb +22 -1
  115. data/app/views/raif/admin/model_tool_invocations/show.html.erb +18 -0
  116. data/app/views/raif/admin/model_tools/_list.html.erb +16 -0
  117. data/app/views/raif/admin/model_tools/_model_tool.html.erb +36 -0
  118. data/app/views/raif/admin/prompt_studio/agents/index.html.erb +56 -0
  119. data/app/views/raif/admin/prompt_studio/agents/show.html.erb +57 -0
  120. data/app/views/raif/admin/prompt_studio/batch_runs/_batch_run_item.html.erb +54 -0
  121. data/app/views/raif/admin/prompt_studio/batch_runs/_judge_config_fields.html.erb +76 -0
  122. data/app/views/raif/admin/prompt_studio/batch_runs/_judge_detail_modal.html.erb +27 -0
  123. data/app/views/raif/admin/prompt_studio/batch_runs/_modal.html.erb +35 -0
  124. data/app/views/raif/admin/prompt_studio/batch_runs/_progress.html.erb +78 -0
  125. data/app/views/raif/admin/prompt_studio/batch_runs/show.html.erb +49 -0
  126. data/app/views/raif/admin/prompt_studio/conversations/index.html.erb +48 -0
  127. data/app/views/raif/admin/prompt_studio/conversations/show.html.erb +36 -0
  128. data/app/views/raif/admin/prompt_studio/shared/_nav_tabs.html.erb +17 -0
  129. data/app/views/raif/admin/prompt_studio/shared/_prompt_comparison.html.erb +87 -0
  130. data/app/views/raif/admin/prompt_studio/shared/_type_filter.html.erb +54 -0
  131. data/app/views/raif/admin/prompt_studio/tasks/_task_result.html.erb +145 -0
  132. data/app/views/raif/admin/prompt_studio/tasks/_task_row.html.erb +12 -0
  133. data/app/views/raif/admin/prompt_studio/tasks/_task_type_filter.html.erb +58 -0
  134. data/app/views/raif/admin/prompt_studio/tasks/_tasks_table.html.erb +22 -0
  135. data/app/views/raif/admin/prompt_studio/tasks/index.html.erb +35 -0
  136. data/app/views/raif/admin/prompt_studio/tasks/show.html.erb +19 -0
  137. data/app/views/raif/admin/stats/_stats_tile.html.erb +34 -0
  138. data/app/views/raif/admin/stats/index.html.erb +71 -88
  139. data/app/views/raif/admin/stats/model_tool_invocations/index.html.erb +43 -0
  140. data/app/views/raif/admin/stats/tasks/index.html.erb +20 -6
  141. data/app/views/raif/admin/tasks/_task.html.erb +1 -0
  142. data/app/views/raif/admin/tasks/index.html.erb +23 -6
  143. data/app/views/raif/admin/tasks/show.html.erb +56 -3
  144. data/app/views/raif/conversation_entries/_form.html.erb +3 -0
  145. data/app/views/raif/conversation_entries/_message.html.erb +10 -6
  146. data/app/views/raif/conversations/_conversation.html.erb +10 -0
  147. data/app/views/raif/conversations/_entry_processed.turbo_stream.erb +12 -0
  148. data/app/views/raif/conversations/index.html.erb +23 -0
  149. data/config/importmap.rb +8 -0
  150. data/config/locales/admin.en.yml +161 -1
  151. data/config/locales/en.yml +67 -4
  152. data/config/routes.rb +10 -0
  153. data/db/migrate/20250904194456_add_generating_entry_response_to_raif_conversations.rb +7 -0
  154. data/db/migrate/20250911125234_add_source_to_raif_tasks.rb +7 -0
  155. data/db/migrate/20251020005853_add_source_to_raif_agents.rb +7 -0
  156. data/db/migrate/20251020011346_rename_task_run_args_to_run_with.rb +7 -0
  157. data/db/migrate/20251020011405_add_run_with_to_raif_agents.rb +13 -0
  158. data/db/migrate/20251024160119_add_llm_messages_max_length_to_raif_conversations.rb +14 -0
  159. data/db/migrate/20251124185033_add_provider_tool_call_id_to_raif_model_tool_invocations.rb +7 -0
  160. data/db/migrate/20251128202941_add_tool_choice_to_raif_model_completions.rb +7 -0
  161. data/db/migrate/20260118144846_add_source_to_raif_conversations.rb +7 -0
  162. data/db/migrate/20260119000000_add_failure_tracking_to_raif_model_completions.rb +10 -0
  163. data/db/migrate/20260119000001_add_completed_at_to_raif_model_completions.rb +8 -0
  164. data/db/migrate/20260119000002_add_started_at_to_raif_model_completions.rb +8 -0
  165. data/db/migrate/20260307000000_add_prompt_studio_run_to_raif_tasks.rb +7 -0
  166. data/db/migrate/20260308000000_create_raif_prompt_studio_batch_runs.rb +27 -0
  167. data/db/migrate/20260308000001_create_raif_prompt_studio_batch_run_items.rb +24 -0
  168. data/db/migrate/20260407000000_add_cache_token_columns_to_raif_model_completions.rb +8 -0
  169. data/lib/generators/raif/agent/agent_generator.rb +18 -0
  170. data/lib/generators/raif/agent/templates/agent.rb.tt +7 -5
  171. data/lib/generators/raif/agent/templates/application_agent.rb.tt +1 -1
  172. data/lib/generators/raif/agent/templates/system_prompt.erb.tt +3 -0
  173. data/lib/generators/raif/conversation/conversation_generator.rb +19 -1
  174. data/lib/generators/raif/conversation/templates/conversation.rb.tt +6 -0
  175. data/lib/generators/raif/conversation/templates/system_prompt.erb.tt +4 -0
  176. data/lib/generators/raif/install/templates/initializer.rb +117 -8
  177. data/lib/generators/raif/task/task_generator.rb +18 -0
  178. data/lib/generators/raif/task/templates/prompt.erb.tt +4 -0
  179. data/lib/generators/raif/task/templates/task.rb.tt +10 -9
  180. data/lib/raif/configuration.rb +47 -2
  181. data/lib/raif/embedding_model_registry.rb +8 -0
  182. data/lib/raif/engine.rb +24 -1
  183. data/lib/raif/errors/blank_response_error.rb +8 -0
  184. data/lib/raif/errors/instance_dependent_schema_error.rb +8 -0
  185. data/lib/raif/errors/prompt_template_error.rb +15 -0
  186. data/lib/raif/errors/streaming_error.rb +6 -3
  187. data/lib/raif/errors.rb +3 -0
  188. data/lib/raif/evals/run.rb +1 -0
  189. data/lib/raif/evals.rb +0 -6
  190. data/lib/raif/json_schema_builder.rb +14 -0
  191. data/lib/raif/llm_registry.rb +433 -42
  192. data/lib/raif/messages.rb +180 -0
  193. data/lib/raif/prompt_studio_comparison_builder.rb +138 -0
  194. data/lib/raif/token_estimator.rb +28 -0
  195. data/lib/raif/version.rb +1 -1
  196. data/lib/raif.rb +11 -0
  197. data/lib/tasks/annotate_rb.rake +10 -0
  198. data/spec/support/rspec_helpers.rb +15 -9
  199. data/spec/support/test_task.rb +9 -0
  200. data/spec/support/test_template_task.rb +41 -0
  201. metadata +108 -15
  202. data/app/models/raif/agents/re_act_agent.rb +0 -127
  203. data/app/models/raif/agents/re_act_step.rb +0 -32
  204. data/app/models/raif/concerns/task_run_args.rb +0 -62
  205. data/lib/raif/evals/llm_judge.rb +0 -32
  206. data/{lib → app/models}/raif/evals/scoring_rubric.rb +0 -0
@@ -3,7 +3,7 @@
3
3
  class Raif::ModelTool
4
4
  include Raif::Concerns::JsonSchemaDefinition
5
5
 
6
- delegate :tool_name, :tool_description, :tool_arguments_schema, :example_model_invocation, to: :class
6
+ delegate :tool_name, :tool_description, :example_model_invocation, to: :class
7
7
 
8
8
  class << self
9
9
  # The description of the tool that will be provided to the model
@@ -53,9 +53,9 @@ class Raif::ModelTool
53
53
  name.gsub("Raif::ModelTools::", "").underscore
54
54
  end
55
55
 
56
- def tool_arguments_schema(&block)
56
+ def tool_arguments_schema(dynamic: false, &block)
57
57
  if block_given?
58
- json_schema_definition(:tool_arguments, &block)
58
+ json_schema_definition(:tool_arguments, dynamic: dynamic, &block)
59
59
  elsif schema_defined?(:tool_arguments)
60
60
  schema_for(:tool_arguments)
61
61
  else
@@ -76,11 +76,14 @@ class Raif::ModelTool
76
76
  false
77
77
  end
78
78
 
79
- def invoke_tool(tool_arguments:, source:)
79
+ def invoke_tool(provider_tool_call_id:, tool_arguments:, source:)
80
+ prepared_arguments = prepare_tool_arguments(tool_arguments)
81
+
80
82
  tool_invocation = Raif::ModelToolInvocation.new(
83
+ provider_tool_call_id: provider_tool_call_id,
81
84
  source: source,
82
85
  tool_type: name,
83
- tool_arguments: tool_arguments
86
+ tool_arguments: prepared_arguments
84
87
  )
85
88
 
86
89
  ActiveRecord::Base.transaction do
@@ -94,6 +97,53 @@ class Raif::ModelTool
94
97
  tool_invocation.failed!
95
98
  raise e
96
99
  end
100
+
101
+ # Prepares tool arguments before validation and invocation. Override in subclasses
102
+ # to add tool-specific argument processing (e.g. type coercion, default injection).
103
+ # The base implementation strips keys not declared in the tool's argument schema,
104
+ # which handles LLMs that hallucinate extra parameters.
105
+ #
106
+ # @param arguments [Hash] The raw tool arguments from the LLM response
107
+ # @return [Hash] The prepared arguments ready for validation and processing
108
def prepare_tool_arguments(arguments)
  # Extension point: subclasses may override this to coerce or augment the
  # arguments. The default behavior only discards keys the schema does not declare.
  strip_unknown_tool_arguments(arguments)
end

private

# Keeps only the argument keys that appear under the tool argument schema's
# properties. Keys are compared as strings, so the returned hash is
# string-keyed whenever filtering takes place. Non-hash input, or a schema
# without properties, is returned untouched. Any discarded keys are reported
# through Rails.logger.warn.
#
# @param arguments [Hash] The raw tool arguments
# @return [Hash] The arguments restricted to schema-declared keys
def strip_unknown_tool_arguments(arguments)
  return arguments unless arguments.is_a?(Hash)

  declared_properties = tool_arguments_schema[:properties] || tool_arguments_schema["properties"]
  return arguments if declared_properties.blank?

  permitted_keys = declared_properties.keys.map(&:to_s)
  stringified = arguments.deep_stringify_keys
  unexpected_keys = stringified.keys - permitted_keys

  unless unexpected_keys.empty?
    Rails.logger.warn(
      "[Raif::ModelTool] Stripped unexpected tool arguments for #{name}: #{unexpected_keys.join(", ")}"
    )
  end

  stringified.slice(*permitted_keys)
end
140
+ end
141
+
142
+ # Instance method to get the tool arguments schema
143
+ # For instance-dependent schemas, builds the schema with this instance as context
144
+ # For class-level schemas, returns the class-level schema
145
def tool_arguments_schema
  # Resolve the :tool_arguments schema through schema_for_instance so this
  # instance is available as context when the schema is built.
  schema_for_instance(:tool_arguments)
end
98
148
 
99
149
  end
@@ -1,5 +1,25 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # == Schema Information
4
+ #
5
+ # Table name: raif_model_tool_invocations
6
+ #
7
+ # id :bigint not null, primary key
8
+ # completed_at :datetime
9
+ # failed_at :datetime
10
+ # result :jsonb not null
11
+ # source_type :string not null
12
+ # tool_arguments :jsonb not null
13
+ # tool_type :string not null
14
+ # created_at :datetime not null
15
+ # updated_at :datetime not null
16
+ # provider_tool_call_id :string
17
+ # source_id :bigint not null
18
+ #
19
+ # Indexes
20
+ #
21
+ # index_raif_model_tool_invocations_on_source (source_type,source_id)
22
+ #
3
23
  class Raif::ModelToolInvocation < Raif::ApplicationRecord
4
24
  belongs_to :source, polymorphic: true
5
25
 
@@ -22,24 +42,66 @@ class Raif::ModelToolInvocation < Raif::ApplicationRecord
22
42
  @tool ||= tool_type.constantize
23
43
  end
24
44
 
25
- def as_llm_message
26
- "Invoking tool: #{tool_name} with arguments: #{tool_arguments.to_json}"
45
+ # Returns tool call in the format expected by LLM message formatting
46
+ # @param assistant_message [String, nil] Optional assistant message accompanying the tool call
47
+ # @return [Hash] Hash representation for JSONB storage and LLM APIs
48
def as_tool_call_message(assistant_message: nil)
  # Wrap this invocation's identifying data in a ToolCall message object,
  # then hand back its hash form.
  tool_call = Raif::Messages::ToolCall.new(
    provider_tool_call_id: provider_tool_call_id,
    name: tool_name,
    arguments: tool_arguments,
    assistant_message: assistant_message
  )

  tool_call.to_h
end
28
56
 
29
- def result_llm_message
30
- return unless tool.respond_to?(:observation_for_invocation)
31
-
32
- tool.observation_for_invocation(self)
57
+ # Returns tool result in the format expected by LLM message formatting
58
+ # @return [Hash] Hash representation for JSONB storage and LLM APIs
59
def as_tool_call_result_message(result: self.result)
  # `result` defaults to the persisted result but may be overridden by the caller.
  tool_call_result = Raif::Messages::ToolCallResult.new(
    provider_tool_call_id: provider_tool_call_id,
    name: tool_name,
    result: result
  )

  tool_call_result.to_h
end
34
66
 
35
67
  def to_partial_path
36
68
  "raif/model_tool_invocations/#{tool.invocation_partial_name}"
37
69
  end
38
70
 
71
def admin_observation
  # Observation text reconstructed for the admin UI (nil when unavailable).
  admin_observation_result.fetch(:observation)
end

def admin_observation_error
  # Message of the error raised while reconstructing the observation, if any.
  admin_observation_result.fetch(:error)
end

def admin_observation_available?
  # True when there is either an observation or a reconstruction error to show.
  [admin_observation, admin_observation_error].any?(&:present?)
end
82
+
39
83
  def ensure_valid_tool_argument_schema
40
84
  unless JSON::Validator.validate(tool_arguments_schema, tool_arguments)
41
85
  errors.add(:tool_arguments, "does not match schema")
42
86
  end
43
87
  end
44
88
 
89
+ private
90
+
91
+ # Best-effort reconstruction of the observation shown in admin. This uses the
92
+ # current formatter code against persisted invocation data, so failures are
93
+ # captured for display instead of breaking the page render.
94
def admin_observation_result
  # Memoized: the result hash is always truthy, so it is computed at most once.
  return @admin_observation_result if @admin_observation_result

  @admin_observation_result =
    if !completed? || !triggers_observation_to_model?
      # Nothing to reconstruct for this invocation.
      { observation: nil, error: nil }
    else
      begin
        rendered = tool.observation_for_invocation(self)
        { observation: rendered.presence, error: nil }
      rescue StandardError => failure
        # Capture formatter failures so the caller can display them instead of raising.
        { observation: nil, error: failure.message }
      end
    end
end
106
+
45
107
  end
@@ -20,16 +20,11 @@ class Raif::ModelTools::AgentFinalAnswer < Raif::ModelTool
20
20
  def observation_for_invocation(tool_invocation)
21
21
  return "No answer provided" unless tool_invocation.result.present?
22
22
 
23
- tool_invocation.result["final_answer"]
23
+ tool_invocation.result
24
24
  end
25
25
 
26
26
  def process_invocation(tool_invocation)
27
- tool_invocation.update!(
28
- result: {
29
- final_answer: tool_invocation.tool_arguments["final_answer"]
30
- }
31
- )
32
-
27
+ tool_invocation.update!(result: tool_invocation.tool_arguments["final_answer"])
33
28
  tool_invocation.result
34
29
  end
35
30
  end
@@ -2,4 +2,8 @@
2
2
 
3
3
  class Raif::ModelTools::ProviderManaged::CodeExecution < Raif::ModelTools::ProviderManaged::Base
4
4
 
5
+ tool_description do
6
+ "Utilizes the model provider's built-in code execution capabilities."
7
+ end
8
+
5
9
  end
@@ -2,4 +2,8 @@
2
2
 
3
3
  class Raif::ModelTools::ProviderManaged::ImageGeneration < Raif::ModelTools::ProviderManaged::Base
4
4
 
5
+ tool_description do
6
+ "Utilizes the model provider's built-in image generation capabilities."
7
+ end
8
+
5
9
  end
@@ -2,4 +2,8 @@
2
2
 
3
3
  class Raif::ModelTools::ProviderManaged::WebSearch < Raif::ModelTools::ProviderManaged::Base
4
4
 
5
+ tool_description do
6
+ "Utilizes the model provider's built-in web search capabilities."
7
+ end
8
+
5
9
  end
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ # == Schema Information
4
+ #
5
+ # Table name: raif_prompt_studio_batch_runs
6
+ #
7
+ # id :bigint not null, primary key
8
+ # completed_at :datetime
9
+ # completed_count :integer default(0), not null
10
+ # failed_at :datetime
11
+ # failed_count :integer default(0), not null
12
+ # judge_config :jsonb not null
13
+ # judge_llm_model_key :string
14
+ # judge_type :string
15
+ # llm_model_key :string not null
16
+ # started_at :datetime
17
+ # task_type :string not null
18
+ # total_count :integer default(0), not null
19
+ # created_at :datetime not null
20
+ # updated_at :datetime not null
21
+ #
22
+
23
module Raif
  # One Prompt Studio batch run: a collection of items executed against
  # `llm_model_key` and, optionally, graded by one of ALLOWED_JUDGE_TYPES.
  # Holds the progress counters and the started/completed/failed timestamps
  # for the batch as a whole.
  class PromptStudioBatchRun < Raif::ApplicationRecord
    # Judge class names a batch may be configured with (enforced by validation).
    ALLOWED_JUDGE_TYPES = [
      "Raif::Evals::LlmJudges::Binary",
      "Raif::Evals::LlmJudges::Scored",
      "Raif::Evals::LlmJudges::Comparative",
      "Raif::Evals::LlmJudges::Summarization"
    ].freeze

    # Default judge_config to an empty hash so it can be read without nil checks.
    after_initialize -> { self.judge_config ||= {} }

    has_many :items,
      class_name: "Raif::PromptStudioBatchRunItem",
      foreign_key: :batch_run_id,
      dependent: :destroy,
      inverse_of: :batch_run

    boolean_timestamp :started_at
    boolean_timestamp :completed_at
    boolean_timestamp :failed_at

    validates :task_type, presence: true
    validates :llm_model_key, presence: true
    validates :judge_type, inclusion: { in: ALLOWED_JUDGE_TYPES }, allow_nil: true

    # Lifecycle state derived from the timestamps; completed wins over failed,
    # which wins over started.
    #
    # @return [Symbol] :completed, :failed, :in_progress or :pending
    def status
      if completed_at?
        :completed
      elsif failed_at?
        :failed
      elsif started_at?
        :in_progress
      else
        :pending
      end
    end

    # Share of items that have finished (completed or failed), as a rounded percentage.
    #
    # @return [Integer] 0 when the batch has no items
    def progress_percentage
      return 0 if total_count.zero?

      ((completed_count + failed_count).to_f / total_count * 100).round
    end

    # @return [Boolean] whether a judge type has been configured
    def has_judge?
      judge_type.present?
    end

    # @return [Class, nil] the configured judge class, or nil if unset/unresolvable
    def judge_class
      judge_type&.safe_constantize
    end

    # Pass rate across completed judge tasks, formatted like "75% (3/4)".
    #
    # @return [String, nil] nil when no judge task has completed
    def judge_pass_rate
      # Load the tasks once; the array is reused for the emptiness check,
      # the pass count and the total.
      judge_tasks = completed_judge_tasks.to_a
      return if judge_tasks.empty?

      pass_count = judge_tasks.count(&:passes?)
      percentage = ((pass_count.to_f / judge_tasks.size) * 100).round
      "#{percentage}% (#{pass_count}/#{judge_tasks.size})"
    end

    # Mean judgment score across completed judge tasks that reported a score.
    #
    # @return [Float, nil] rounded to one decimal; nil when there are no scores
    def judge_average_score
      scores = completed_judge_tasks.filter_map(&:judgment_score)
      return if scores.empty?

      (scores.sum.to_f / scores.size).round(1)
    end

    # Tallies comparative-judge outcomes (new response vs. original vs. tie).
    # An item is only counted when its judge task completed with a Hash
    # response naming a winner, and (for non-tie outcomes) when the item's
    # metadata records which letter was assigned to the new response. Items
    # whose outcome cannot be attributed are left out of the totals instead
    # of being credited to either side.
    #
    # @return [Hash, nil] counts and rounded percentages, or nil when nothing is countable
    def judge_comparative_summary
      completed_items = items.where.not(judge_task_id: nil).includes(:judge_task)
      return if completed_items.empty?

      new_wins = 0
      original_wins = 0
      ties = 0

      completed_items.each do |item|
        next unless item.judge_task&.completed?

        parsed = item.judge_task.parsed_response
        next unless parsed.is_a?(Hash)

        winner = parsed["winner"]
        # A response without a winner says nothing about either side.
        next if winner.blank?

        if winner == "tie"
          ties += 1
          next
        end

        new_response_letter = item.metadata&.dig("new_response_letter")
        # Without the recorded letter the winner cannot be mapped to new/original.
        next if new_response_letter.blank?

        if winner == new_response_letter
          new_wins += 1
        else
          original_wins += 1
        end
      end

      total = new_wins + original_wins + ties
      return if total.zero?

      {
        new_wins: new_wins,
        original_wins: original_wins,
        ties: ties,
        total: total,
        new_win_pct: ((new_wins.to_f / total) * 100).round,
        original_win_pct: ((original_wins.to_f / total) * 100).round,
        tie_pct: ((ties.to_f / total) * 100).round
      }
    end

    # Recomputes the completed/failed counters from item statuses and, once no
    # item is pending, running or judging, stamps the batch as finished:
    # failed_at when items failed and none completed, completed_at otherwise.
    # Always saves the record.
    #
    # @return [true] result of save!
    # @raise [ActiveRecord::RecordInvalid] if the record cannot be saved
    def check_completion!
      reload
      remaining = items.where(status: %w[pending running judging]).count
      self.completed_count = items.where(status: "completed").count
      self.failed_count = items.where(status: "failed").count

      if remaining.zero?
        if failed_count > 0 && completed_count == 0
          self.failed_at = Time.current
        else
          self.completed_at = Time.current
        end
      end

      save!
    end

    private

    # Judge tasks referenced by this batch's items that have a completed_at timestamp.
    def completed_judge_tasks
      Raif::Task.where(
        id: items.where.not(judge_task_id: nil).select(:judge_task_id)
      ).where.not(completed_at: nil)
    end
  end
end
@@ -0,0 +1,220 @@
1
+ # frozen_string_literal: true
2
+
3
+ # == Schema Information
4
+ #
5
+ # Table name: raif_prompt_studio_batch_run_items
6
+ #
7
+ # id :bigint not null, primary key
8
+ # metadata :jsonb
9
+ # status :string default("pending"), not null
10
+ # created_at :datetime not null
11
+ # updated_at :datetime not null
12
+ # batch_run_id :bigint not null
13
+ # judge_task_id :bigint
14
+ # result_task_id :bigint
15
+ # source_task_id :bigint not null
16
+ #
17
+ # Indexes
18
+ #
19
+ # index_raif_prompt_studio_batch_run_items_on_batch_run_id (batch_run_id)
20
+ # index_raif_prompt_studio_batch_run_items_on_judge_task_id (judge_task_id)
21
+ # index_raif_prompt_studio_batch_run_items_on_result_task_id (result_task_id)
22
+ # index_raif_prompt_studio_batch_run_items_on_source_task_id (source_task_id)
23
+ # index_raif_prompt_studio_batch_run_items_on_status (status)
24
+ #
25
+ # Foreign Keys
26
+ #
27
+ # fk_rails_... (batch_run_id => raif_prompt_studio_batch_runs.id)
28
+ # fk_rails_... (judge_task_id => raif_tasks.id)
29
+ # fk_rails_... (result_task_id => raif_tasks.id)
30
+ # fk_rails_... (source_task_id => raif_tasks.id)
31
+ #
32
+
33
module Raif
  # A single unit of work inside a Prompt Studio batch run: re-runs one source
  # task with the batch's LLM, optionally has the result graded by the batch's
  # judge, and broadcasts its own row and the batch progress over Turbo Streams.
  class PromptStudioBatchRunItem < Raif::ApplicationRecord
    include ActionView::RecordIdentifier

    STATUSES = %w[pending running judging completed failed].freeze

    # Default metadata to an empty hash so it can be merged into without nil checks.
    after_initialize -> { self.metadata ||= {} }

    belongs_to :batch_run,
      class_name: "Raif::PromptStudioBatchRun",
      inverse_of: :items

    belongs_to :source_task,
      class_name: "Raif::Task"

    belongs_to :result_task,
      class_name: "Raif::Task",
      optional: true

    belongs_to :judge_task,
      class_name: "Raif::Task",
      optional: true

    validates :status, inclusion: { in: STATUSES }

    # Runs the item end to end: marks it running, re-runs the source task,
    # runs the judge when configured, and marks it completed. Any StandardError
    # raised along the way is logged and the item is marked failed instead of
    # the error propagating. The batch's completion state is re-checked afterwards.
    def execute!
      update!(status: "running")
      broadcast_item

      new_task = create_and_run_task
      run_judge_if_configured(new_task)

      update!(status: "completed")
    rescue StandardError => e
      Rails.logger.error "Error running batch run item ##{id}: #{e.message}"
      Rails.logger.error e.backtrace&.join("\n")

      update!(status: "failed")
    ensure
      begin
        broadcast_item
      ensure
        # Batch bookkeeping must run even if broadcasting the row raised;
        # otherwise the batch could stay "in progress" after its last item.
        batch_run.check_completion!
        broadcast_progress
      end
    end

    # Short human-readable verdict from the judge, shaped by the batch's judge type.
    #
    # @return [String, nil] nil unless the judge task completed with a Hash response
    def judge_summary
      return unless judge_task&.completed?

      parsed = judge_task.parsed_response
      return unless parsed.is_a?(Hash)

      case batch_run.judge_type
      when "Raif::Evals::LlmJudges::Binary"
        parsed["passes"] ? "PASS" : "FAIL"
      when "Raif::Evals::LlmJudges::Scored"
        "Score: #{parsed["score"]}"
      when "Raif::Evals::LlmJudges::Comparative"
        if parsed["winner"] == "tie"
          I18n.t("raif.admin.prompt_studio.batch_runs.judge.tie")
        else
          winner_label = comparative_winner_label(parsed["winner"])
          I18n.t("raif.admin.prompt_studio.batch_runs.judge.winner", name: winner_label)
        end
      when "Raif::Evals::LlmJudges::Summarization"
        "Overall: #{parsed.dig("overall", "score")}/5"
      end
    end

    # @return [Object, nil] the "reasoning" entry of the judge's parsed response,
    #   or nil unless the judge task completed with a Hash response
    def judge_reasoning
      return unless judge_task&.completed?

      parsed = judge_task.parsed_response
      return unless parsed.is_a?(Hash)

      parsed["reasoning"]
    end

    # Translates the judge's winning letter into a "new"/"original" response label.
    # Falls back to the raw letter when the item never recorded which letter
    # the new response was assigned.
    #
    # @param winner_letter [String] letter reported by the comparative judge
    # @return [String]
    def comparative_winner_label(winner_letter)
      new_response_letter = metadata&.dig("new_response_letter")
      return winner_letter unless new_response_letter

      if winner_letter == new_response_letter
        I18n.t("raif.admin.prompt_studio.batch_runs.judge.new_response")
      else
        I18n.t("raif.admin.prompt_studio.batch_runs.judge.original_response")
      end
    end

    private

    # Builds a new task of the source task's class, carrying over its creator,
    # tools and run_with but using the batch's LLM, then saves it, links it as
    # this item's result task, and runs it.
    #
    # @return [Raif::Task] the task that was run
    def create_and_run_task
      new_task = source_task.class.new(
        creator: source_task.creator,
        source: source_task,
        llm_model_key: batch_run.llm_model_key,
        available_model_tools: source_task.available_model_tools,
        run_with: source_task.run_with,
        prompt_studio_run: true,
        started_at: Time.current
      )
      new_task.assign_attributes(source_task.prompt_studio_task_attributes)
      new_task.save!

      update!(result_task_id: new_task.id)
      new_task.run
      new_task
    end

    # Runs the judge against the new task, but only when the batch has a judge
    # configured and the new task completed.
    def run_judge_if_configured(new_task)
      return unless batch_run.has_judge? && new_task.completed?

      update!(status: "judging")
      broadcast_item

      judge_result = invoke_judge(new_task)
      update!(judge_task_id: judge_result.id)
    end

    # Dispatches to the configured judge class with the arguments that judge
    # type expects, taken from the batch's judge_config.
    #
    # @param new_task [Raif::Task] the freshly run task whose response is judged
    # @return [Object] whatever the judge class's `run` returns
    # @raise [ArgumentError] if the configured scoring rubric is not a known rubric
    def invoke_judge(new_task)
      judge_class = batch_run.judge_class
      config = batch_run.judge_config
      judge_args = {
        creator: source_task.creator,
        prompt_studio_run: true,
        llm_model_key: batch_run.judge_llm_model_key
      }
      judge_args.merge!(source_task.prompt_studio_task_attributes)

      if config["include_original_prompt_as_context"]
        judge_args[:additional_context] =
          "The content being evaluated was generated in response to the following prompt:\n\n#{source_task.prompt}"
      end

      case batch_run.judge_type
      when "Raif::Evals::LlmJudges::Binary"
        judge_class.run(
          content_to_judge: new_task.raw_response,
          criteria: config["criteria"],
          strict_mode: config["strict_mode"],
          **judge_args
        )
      when "Raif::Evals::LlmJudges::Scored"
        judge_class.run(
          content_to_judge: new_task.raw_response,
          scoring_rubric: scoring_rubric_for(config["scoring_rubric"]),
          **judge_args
        )
      when "Raif::Evals::LlmJudges::Comparative"
        result = judge_class.run(
          content_to_judge: new_task.raw_response,
          over_content: source_task.raw_response,
          comparison_criteria: config["comparison_criteria"],
          **judge_args
        )
        # Store which letter was assigned to the new response so we can display
        # "Winner: New Response" / "Winner: Original Response" instead of "A"/"B"
        update!(metadata: metadata.merge("new_response_letter" => result.expected_winner))
        result
      when "Raif::Evals::LlmJudges::Summarization"
        judge_class.run(
          original_content: source_task.prompt,
          summary: new_task.raw_response,
          **judge_args
        )
      end
    end

    # Resolves a configured rubric name to a rubric by calling the matching
    # class-level method on Raif::Evals::ScoringRubric. The name comes from
    # stored configuration, so it is checked against the rubric class's own
    # public/protected singleton methods before dispatch rather than being
    # passed to `send` unchecked.
    # NOTE(review): assumes the built-in rubrics are defined as class-level
    # methods on ScoringRubric - confirm against that class.
    #
    # @param rubric_name [String, nil] the configured rubric name
    # @return [Object] the rubric returned by the matching class-level method
    # @raise [ArgumentError] if no such rubric method exists
    def scoring_rubric_for(rubric_name)
      rubric_key = rubric_name.to_s
      known_rubrics = Raif::Evals::ScoringRubric.singleton_methods.map(&:to_s)

      unless known_rubrics.include?(rubric_key)
        raise ArgumentError, "Unknown scoring rubric: #{rubric_name.inspect}"
      end

      Raif::Evals::ScoringRubric.public_send(rubric_key)
    end

    # Replaces this item's row on the batch run's Turbo Stream.
    def broadcast_item
      Turbo::StreamsChannel.broadcast_replace_to(
        batch_run,
        target: dom_id(self),
        partial: "raif/admin/prompt_studio/batch_runs/batch_run_item",
        locals: { item: self }
      )
    end

    # Reloads the batch and replaces its progress element on the Turbo Stream.
    def broadcast_progress
      batch_run.reload
      Turbo::StreamsChannel.broadcast_replace_to(
        batch_run,
        target: dom_id(batch_run, :progress),
        partial: "raif/admin/prompt_studio/batch_runs/progress",
        locals: { batch_run: batch_run }
      )
    end
  end
end
@@ -3,6 +3,8 @@
3
3
  class Raif::StreamingResponses::Bedrock
4
4
 
5
5
  def initialize_new_message
6
+ @reasoning_content_blocks = {}
7
+
6
8
  # Initialize empty AWS response object
7
9
  @message = Aws::BedrockRuntime::Types::Message.new(
8
10
  role: "assistant",
@@ -62,9 +64,12 @@ class Raif::StreamingResponses::Bedrock
62
64
  )
63
65
 
64
66
  @message.content[index].tool_use.input += event.delta.tool_use.input
67
+ elsif event.delta.is_a?(Aws::BedrockRuntime::Types::ContentBlockDelta::ReasoningContent)
68
+ accumulate_reasoning_content(index, event.delta.reasoning_content)
65
69
  end
66
70
  when :content_block_stop
67
- content_block = @message.content[event.content_block_index]
71
+ index = event.content_block_index
72
+ content_block = @message.content[index]
68
73
 
69
74
  if content_block&.tool_use&.input.is_a?(String)
70
75
  begin
@@ -73,6 +78,8 @@ class Raif::StreamingResponses::Bedrock
73
78
  # If parsing fails, leave as a string
74
79
  end
75
80
  end
81
+
82
+ finalize_reasoning_content(index)
76
83
  when :message_stop
77
84
  @response.stop_reason = event.stop_reason
78
85
  when :metadata
@@ -86,4 +93,56 @@ class Raif::StreamingResponses::Bedrock
86
93
  @response
87
94
  end
88
95
 
96
+ private
97
+
98
# Folds one streamed reasoning delta into the per-block accumulator for
# the content block at +index+.
#
# - Text deltas are appended to :text.
# - Signature deltas overwrite :signature with the latest value.
# - Redacted-content deltas are appended to :redacted_content.
# - Any other delta type only sets the :unknown flag.
#
# The accumulator is always marked :seen, regardless of the delta type.
def accumulate_reasoning_content(index, reasoning_delta)
  block_state = reasoning_content_for(index)
  block_state[:seen] = true

  if reasoning_delta.is_a?(Aws::BedrockRuntime::Types::ReasoningContentBlockDelta::Text)
    block_state[:text] << reasoning_delta.text.to_s
  elsif reasoning_delta.is_a?(Aws::BedrockRuntime::Types::ReasoningContentBlockDelta::Signature)
    block_state[:signature] = reasoning_delta.signature
  elsif reasoning_delta.is_a?(Aws::BedrockRuntime::Types::ReasoningContentBlockDelta::RedactedContent)
    block_state[:redacted_content] << reasoning_delta.redacted_content.to_s
  else
    block_state[:unknown] = true
  end
end
113
+
114
# Called when the content block at +index+ stops. Removes any accumulated
# reasoning state for that index and, if at least one reasoning delta was
# seen, replaces @message.content[index] with a ReasoningContent block
# built from the accumulated values. Does nothing when no reasoning state
# exists for the index.
def finalize_reasoning_content(index)
  accumulated = @reasoning_content_blocks.delete(index)
  return unless accumulated && accumulated[:seen]

  reasoning_block = build_reasoning_content(accumulated)
  @message.content[index] =
    Aws::BedrockRuntime::Types::ContentBlock::ReasoningContent.new(reasoning_content: reasoning_block)
end
122
+
123
# Converts an accumulator hash (keys :text, :signature, :redacted_content)
# into an AWS reasoning content block.
#
# Returns a RedactedContent block when redacted content is the only thing
# that was accumulated (no text and no signature); otherwise returns a
# ReasoningText block wrapping the accumulated text and signature.
def build_reasoning_content(reasoning_content)
  text, signature, redacted = reasoning_content.values_at(:text, :signature, :redacted_content)
  redacted_only = text.blank? && signature.blank? && redacted.present?

  if redacted_only
    Aws::BedrockRuntime::Types::ReasoningContentBlock::RedactedContent.new(
      redacted_content: redacted
    )
  else
    text_block = Aws::BedrockRuntime::Types::ReasoningTextBlock.new(
      text: text,
      signature: signature
    )
    Aws::BedrockRuntime::Types::ReasoningContentBlock::ReasoningText.new(reasoning_text: text_block)
  end
end
137
+
138
# Returns the reasoning accumulator hash for the content block at +index+,
# creating and storing a fresh one on first access. A fresh accumulator has
# empty, mutable :text and :redacted_content buffers, a nil :signature, and
# the :seen / :unknown flags set to false.
def reasoning_content_for(index)
  existing = @reasoning_content_blocks[index]
  return existing if existing

  # Unary plus yields unfrozen strings so callers can append with <<.
  @reasoning_content_blocks[index] = {
    seen: false,
    text: +"",
    signature: nil,
    redacted_content: +"",
    unknown: false
  }
end
147
+
89
148
  end