RubyGems - raif - Versions diffs - 1.4.0 → 1.5.0 - Mend

raif 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (137)

data/app/models/raif/llms/open_router.rb CHANGED

@@ -39,6 +39,8 @@ private
   end
   def update_model_completion(model_completion, response_json)
+    return if response_json.nil?
     raw_response = if model_completion.response_format_json?
       extract_json_response(response_json)
     else
@@ -52,7 +54,8 @@ private
       response_array: response_json["choices"],
       completion_tokens: response_json.dig("usage", "completion_tokens"),
       prompt_tokens: response_json.dig("usage", "prompt_tokens"),
-      total_tokens: response_json.dig("usage", "total_tokens")
+      total_tokens: response_json.dig("usage", "total_tokens"),
+      cache_read_input_tokens: response_json.dig("usage", "prompt_tokens_details", "cached_tokens")
     )
   end
@@ -87,9 +90,13 @@ private
       params[:tools] = tools unless tools.blank?
-      if model_completion.tool_choice.present?
+      if model_completion.tool_choice == "required"
+        params[:tool_choice] = build_required_tool_choice
+        params[:parallel_tool_calls] = false unless tools.blank?
+      elsif model_completion.tool_choice.present?
         tool_klass = model_completion.tool_choice.constantize
         params[:tool_choice] = build_forced_tool_choice(tool_klass.tool_name)
+        params[:parallel_tool_calls] = false unless tools.blank?
       end
     end
@@ -114,7 +121,7 @@ private
   end
   def extract_json_response(resp)
-    tool_calls = resp.dig("choices", 0, "message", "tool_calls")
+    tool_calls = resp&.dig("choices", 0, "message", "tool_calls")
     return extract_text_response(resp) if tool_calls.blank?
     tool_response = tool_calls.find do |tool_call|

data/app/models/raif/model_completion.rb CHANGED

@@ -4,39 +4,41 @@
 #
 # Table name: raif_model_completions
 #
-#  id                        :bigint           not null, primary key
-#  available_model_tools     :jsonb            not null
-#  citations                 :jsonb
-#  completed_at              :datetime
-#  completion_tokens         :integer
-#  failed_at                 :datetime
-#  failure_error             :string
-#  failure_reason            :text
-#  llm_model_key             :string           not null
-#  max_completion_tokens     :integer
-#  messages                  :jsonb            not null
-#  model_api_name            :string           not null
-#  output_token_cost         :decimal(10, 6)
-#  prompt_token_cost         :decimal(10, 6)
-#  prompt_tokens             :integer
-#  raw_response              :text
-#  response_array            :jsonb
-#  response_format           :integer          default("text"), not null
-#  response_format_parameter :string
-#  response_tool_calls       :jsonb
-#  retry_count               :integer          default(0), not null
-#  source_type               :string
-#  started_at                :datetime
-#  stream_response           :boolean          default(FALSE), not null
-#  system_prompt             :text
-#  temperature               :decimal(5, 3)
-#  tool_choice               :string
-#  total_cost                :decimal(10, 6)
-#  total_tokens              :integer
-#  created_at                :datetime         not null
-#  updated_at                :datetime         not null
-#  response_id               :string
-#  source_id                 :bigint
+#  id                          :bigint           not null, primary key
+#  available_model_tools       :jsonb            not null
+#  cache_creation_input_tokens :integer
+#  cache_read_input_tokens     :integer
+#  citations                   :jsonb
+#  completed_at                :datetime
+#  completion_tokens           :integer
+#  failed_at                   :datetime
+#  failure_error               :string
+#  failure_reason              :text
+#  llm_model_key               :string           not null
+#  max_completion_tokens       :integer
+#  messages                    :jsonb            not null
+#  model_api_name              :string           not null
+#  output_token_cost           :decimal(10, 6)
+#  prompt_token_cost           :decimal(10, 6)
+#  prompt_tokens               :integer
+#  raw_response                :text
+#  response_array              :jsonb
+#  response_format             :integer          default("text"), not null
+#  response_format_parameter   :string
+#  response_tool_calls         :jsonb
+#  retry_count                 :integer          default(0), not null
+#  source_type                 :string
+#  started_at                  :datetime
+#  stream_response             :boolean          default(FALSE), not null
+#  system_prompt               :text
+#  temperature                 :decimal(5, 3)
+#  tool_choice                 :string
+#  total_cost                  :decimal(10, 6)
+#  total_tokens                :integer
+#  created_at                  :datetime         not null
+#  updated_at                  :datetime         not null
+#  response_id                 :string
+#  source_id                   :bigint
 #
 # Indexes
 #
@@ -49,8 +51,12 @@
 class Raif::ModelCompletion < Raif::ApplicationRecord
   include Raif::Concerns::LlmResponseParsing
   include Raif::Concerns::HasAvailableModelTools
+  include Raif::Concerns::HasRuntimeDuration
+  include Raif::Concerns::ProviderManagedToolCalls
   include Raif::Concerns::BooleanTimestamp
+  attr_accessor :anthropic_prompt_caching_enabled, :bedrock_prompt_caching_enabled
   boolean_timestamp :started_at
   boolean_timestamp :completed_at
   boolean_timestamp :failed_at
@@ -82,8 +88,12 @@ class Raif::ModelCompletion < Raif::ApplicationRecord
   end
   def calculate_costs
+    # Each retry resends the same prompt, so the provider charges input tokens
+    # for every attempt. Factor in retry_count to reflect actual billing.
+    total_attempts = (retry_count || 0) + 1
     if prompt_tokens.present? && llm_config[:input_token_cost].present?
-      self.prompt_token_cost = llm_config[:input_token_cost] * prompt_tokens
+      self.prompt_token_cost = calculate_prompt_token_cost(total_attempts)
     end
     if completion_tokens.present? && llm_config[:output_token_cost].present?
@@ -104,6 +114,37 @@ class Raif::ModelCompletion < Raif::ApplicationRecord
 private
+  def calculate_prompt_token_cost(total_attempts)
+    input_cost = llm_config[:input_token_cost]
+    llm_class = llm_config[:llm_class]
+    cache_read_multiplier = llm_class&.cache_read_input_token_cost_multiplier
+    cache_creation_multiplier = llm_class&.cache_creation_input_token_cost_multiplier
+    cached_reads = cache_read_input_tokens.to_i
+    cached_writes = cache_creation_input_tokens.to_i
+    if cached_reads > 0 && cache_read_multiplier.present?
+      cache_read_cost = input_cost * cache_read_multiplier
+      if llm_class.prompt_tokens_include_cached_tokens?
+        # OpenAI / Google / OpenRouter: cached tokens are a subset of prompt_tokens
+        non_cached = prompt_tokens - cached_reads
+        cost = (non_cached * input_cost) + (cached_reads * cache_read_cost)
+      else
+        # Anthropic / Bedrock: cached tokens are separate from prompt_tokens
+        cost = (prompt_tokens * input_cost) + (cached_reads * cache_read_cost)
+      end
+    else
+      cost = prompt_tokens * input_cost
+    end
+    # Cache creation surcharge (Anthropic / Bedrock)
+    if cached_writes > 0 && cache_creation_multiplier.present?
+      cost += cached_writes * input_cost * cache_creation_multiplier
+    end
+    cost * total_attempts
+  end
   def llm_config
     @llm_config ||= Raif.llm_config(llm_model_key.to_sym)
   end

data/app/models/raif/model_tool.rb CHANGED

@@ -53,9 +53,9 @@ class Raif::ModelTool
       name.gsub("Raif::ModelTools::", "").underscore
     end
-    def tool_arguments_schema(&block)
+    def tool_arguments_schema(dynamic: false, &block)
       if block_given?
-        json_schema_definition(:tool_arguments, &block)
+        json_schema_definition(:tool_arguments, dynamic: dynamic, &block)
       elsif schema_defined?(:tool_arguments)
         schema_for(:tool_arguments)
       else
@@ -77,11 +77,13 @@ class Raif::ModelTool
     end
     def invoke_tool(provider_tool_call_id:, tool_arguments:, source:)
+      prepared_arguments = prepare_tool_arguments(tool_arguments)
       tool_invocation = Raif::ModelToolInvocation.new(
         provider_tool_call_id: provider_tool_call_id,
         source: source,
         tool_type: name,
-        tool_arguments: tool_arguments
+        tool_arguments: prepared_arguments
       )
       ActiveRecord::Base.transaction do
@@ -95,6 +97,46 @@ class Raif::ModelTool
       tool_invocation.failed!
       raise e
     end
+    # Prepares tool arguments before validation and invocation. Override in subclasses
+    # to add tool-specific argument processing (e.g. type coercion, default injection).
+    # The base implementation strips keys not declared in the tool's argument schema,
+    # which handles LLMs that hallucinate extra parameters.
+    #
+    # @param arguments [Hash] The raw tool arguments from the LLM response
+    # @return [Hash] The prepared arguments ready for validation and processing
+    def prepare_tool_arguments(arguments)
+      strip_unknown_tool_arguments(arguments)
+    end
+  private
+    # Removes keys from the arguments hash that are not declared in the tool's
+    # argument schema. Logs a warning when keys are stripped so hallucination
+    # patterns can be monitored. Normalizes all keys to strings for consistent
+    # comparison since the schema builder uses symbol keys and LLM responses
+    # use string keys.
+    #
+    # @param arguments [Hash] The raw tool arguments
+    # @return [Hash] The arguments with only schema-declared keys
+    def strip_unknown_tool_arguments(arguments)
+      return arguments unless arguments.is_a?(Hash)
+      schema_properties = tool_arguments_schema[:properties] || tool_arguments_schema["properties"]
+      return arguments if schema_properties.blank?
+      normalized_arguments = arguments.deep_stringify_keys
+      allowed_keys = schema_properties.keys.map(&:to_s)
+      dropped_keys = normalized_arguments.keys - allowed_keys
+      if dropped_keys.any?
+        Rails.logger.warn(
+          "[Raif::ModelTool] Stripped unexpected tool arguments for #{name}: #{dropped_keys.join(", ")}"
+        )
+      end
+      normalized_arguments.slice(*allowed_keys)
+    end
   end
   # Instance method to get the tool arguments schema

data/app/models/raif/model_tool_invocation.rb CHANGED

@@ -56,7 +56,7 @@ class Raif::ModelToolInvocation < Raif::ApplicationRecord
   # Returns tool result in the format expected by LLM message formatting
   # @return [Hash] Hash representation for JSONB storage and LLM APIs
-  def as_tool_call_result_message
+  def as_tool_call_result_message(result: self.result)
     Raif::Messages::ToolCallResult.new(
       provider_tool_call_id: provider_tool_call_id,
       name: tool_name,
@@ -68,10 +68,40 @@ class Raif::ModelToolInvocation < Raif::ApplicationRecord
     "raif/model_tool_invocations/#{tool.invocation_partial_name}"
   end
+  def admin_observation
+    admin_observation_result[:observation]
+  end
+  def admin_observation_error
+    admin_observation_result[:error]
+  end
+  def admin_observation_available?
+    admin_observation.present? || admin_observation_error.present?
+  end
   def ensure_valid_tool_argument_schema
     unless JSON::Validator.validate(tool_arguments_schema, tool_arguments)
       errors.add(:tool_arguments, "does not match schema")
     end
   end
+private
+  # Best-effort reconstruction of the observation shown in admin. This uses the
+  # current formatter code against persisted invocation data, so failures are
+  # captured for display instead of breaking the page render.
+  def admin_observation_result
+    @admin_observation_result ||= if completed? && triggers_observation_to_model?
+      begin
+        observation = tool.observation_for_invocation(self)
+        { observation: observation.presence, error: nil }
+      rescue StandardError => e
+        { observation: nil, error: e.message }
+      end
+    else
+      { observation: nil, error: nil }
+    end
+  end
 end

data/app/models/raif/prompt_studio_batch_run.rb ADDED

@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+# == Schema Information
+#
+# Table name: raif_prompt_studio_batch_runs
+#
+#  id                  :bigint           not null, primary key
+#  completed_at        :datetime
+#  completed_count     :integer          default(0), not null
+#  failed_at           :datetime
+#  failed_count        :integer          default(0), not null
+#  judge_config        :jsonb            not null
+#  judge_llm_model_key :string
+#  judge_type          :string
+#  llm_model_key       :string           not null
+#  started_at          :datetime
+#  task_type           :string           not null
+#  total_count         :integer          default(0), not null
+#  created_at          :datetime         not null
+#  updated_at          :datetime         not null
+#
+module Raif
+  class PromptStudioBatchRun < Raif::ApplicationRecord
+    ALLOWED_JUDGE_TYPES = [
+      "Raif::Evals::LlmJudges::Binary",
+      "Raif::Evals::LlmJudges::Scored",
+      "Raif::Evals::LlmJudges::Comparative",
+      "Raif::Evals::LlmJudges::Summarization"
+    ].freeze
+    after_initialize -> { self.judge_config ||= {} }
+    has_many :items,
+      class_name: "Raif::PromptStudioBatchRunItem",
+      foreign_key: :batch_run_id,
+      dependent: :destroy,
+      inverse_of: :batch_run
+    boolean_timestamp :started_at
+    boolean_timestamp :completed_at
+    boolean_timestamp :failed_at
+    validates :task_type, presence: true
+    validates :llm_model_key, presence: true
+    validates :judge_type, inclusion: { in: ALLOWED_JUDGE_TYPES }, allow_nil: true
+    def status
+      if completed_at?
+        :completed
+      elsif failed_at?
+        :failed
+      elsif started_at?
+        :in_progress
+      else
+        :pending
+      end
+    end
+    def progress_percentage
+      return 0 if total_count.zero?
+      ((completed_count + failed_count).to_f / total_count * 100).round
+    end
+    def has_judge?
+      judge_type.present?
+    end
+    def judge_class
+      judge_type&.safe_constantize
+    end
+    def judge_pass_rate
+      judge_tasks = completed_judge_tasks
+      return if judge_tasks.empty?
+      pass_count = judge_tasks.count(&:passes?)
+      percentage = ((pass_count.to_f / judge_tasks.size) * 100).round
+      "#{percentage}% (#{pass_count}/#{judge_tasks.size})"
+    end
+    def judge_average_score
+      scores = completed_judge_tasks.filter_map(&:judgment_score)
+      return if scores.empty?
+      (scores.sum.to_f / scores.size).round(1)
+    end
+    def judge_comparative_summary
+      completed_items = items.where.not(judge_task_id: nil).includes(:judge_task)
+      return if completed_items.empty?
+      new_wins = 0
+      original_wins = 0
+      ties = 0
+      completed_items.each do |item|
+        next unless item.judge_task&.completed?
+        parsed = item.judge_task.parsed_response
+        next unless parsed.is_a?(Hash)
+        winner = parsed["winner"]
+        if winner == "tie"
+          ties += 1
+        elsif winner == item.metadata&.dig("new_response_letter")
+          new_wins += 1
+        else
+          original_wins += 1
+        end
+      end
+      total = new_wins + original_wins + ties
+      return if total.zero?
+      {
+        new_wins: new_wins,
+        original_wins: original_wins,
+        ties: ties,
+        total: total,
+        new_win_pct: ((new_wins.to_f / total) * 100).round,
+        original_win_pct: ((original_wins.to_f / total) * 100).round,
+        tie_pct: ((ties.to_f / total) * 100).round
+      }
+    end
+  private
+    def completed_judge_tasks
+      Raif::Task.where(
+        id: items.where.not(judge_task_id: nil).select(:judge_task_id)
+      ).where.not(completed_at: nil)
+    end
+  public
+    def check_completion!
+      reload
+      remaining = items.where(status: %w[pending running judging]).count
+      self.completed_count = items.where(status: "completed").count
+      self.failed_count = items.where(status: "failed").count
+      if remaining.zero?
+        if failed_count > 0 && completed_count == 0
+          self.failed_at = Time.current
+        else
+          self.completed_at = Time.current
+        end
+      end
+      save!
+    end
+  end
+end

data/app/models/raif/prompt_studio_batch_run_item.rb ADDED

@@ -0,0 +1,220 @@
+# frozen_string_literal: true
+# == Schema Information
+#
+# Table name: raif_prompt_studio_batch_run_items
+#
+#  id             :bigint           not null, primary key
+#  metadata       :jsonb
+#  status         :string           default("pending"), not null
+#  created_at     :datetime         not null
+#  updated_at     :datetime         not null
+#  batch_run_id   :bigint           not null
+#  judge_task_id  :bigint
+#  result_task_id :bigint
+#  source_task_id :bigint           not null
+#
+# Indexes
+#
+#  index_raif_prompt_studio_batch_run_items_on_batch_run_id    (batch_run_id)
+#  index_raif_prompt_studio_batch_run_items_on_judge_task_id   (judge_task_id)
+#  index_raif_prompt_studio_batch_run_items_on_result_task_id  (result_task_id)
+#  index_raif_prompt_studio_batch_run_items_on_source_task_id  (source_task_id)
+#  index_raif_prompt_studio_batch_run_items_on_status          (status)
+#
+# Foreign Keys
+#
+#  fk_rails_...  (batch_run_id => raif_prompt_studio_batch_runs.id)
+#  fk_rails_...  (judge_task_id => raif_tasks.id)
+#  fk_rails_...  (result_task_id => raif_tasks.id)
+#  fk_rails_...  (source_task_id => raif_tasks.id)
+#
+module Raif
+  class PromptStudioBatchRunItem < Raif::ApplicationRecord
+    include ActionView::RecordIdentifier
+    STATUSES = %w[pending running judging completed failed].freeze
+    after_initialize -> { self.metadata ||= {} }
+    belongs_to :batch_run,
+      class_name: "Raif::PromptStudioBatchRun",
+      inverse_of: :items
+    belongs_to :source_task,
+      class_name: "Raif::Task"
+    belongs_to :result_task,
+      class_name: "Raif::Task",
+      optional: true
+    belongs_to :judge_task,
+      class_name: "Raif::Task",
+      optional: true
+    validates :status, inclusion: { in: STATUSES }
+    def execute!
+      update!(status: "running")
+      broadcast_item
+      new_task = create_and_run_task
+      run_judge_if_configured(new_task)
+      update!(status: "completed")
+    rescue StandardError => e
+      Rails.logger.error "Error running batch run item ##{id}: #{e.message}"
+      Rails.logger.error e.backtrace&.join("\n")
+      update!(status: "failed")
+    ensure
+      broadcast_item
+      batch_run.check_completion!
+      broadcast_progress
+    end
+    def judge_summary
+      return unless judge_task&.completed?
+      parsed = judge_task.parsed_response
+      return unless parsed.is_a?(Hash)
+      case batch_run.judge_type
+      when "Raif::Evals::LlmJudges::Binary"
+        parsed["passes"] ? "PASS" : "FAIL"
+      when "Raif::Evals::LlmJudges::Scored"
+        "Score: #{parsed["score"]}"
+      when "Raif::Evals::LlmJudges::Comparative"
+        if parsed["winner"] == "tie"
+          I18n.t("raif.admin.prompt_studio.batch_runs.judge.tie")
+        else
+          winner_label = comparative_winner_label(parsed["winner"])
+          I18n.t("raif.admin.prompt_studio.batch_runs.judge.winner", name: winner_label)
+        end
+      when "Raif::Evals::LlmJudges::Summarization"
+        "Overall: #{parsed.dig("overall", "score")}/5"
+      end
+    end
+    def judge_reasoning
+      return unless judge_task&.completed?
+      parsed = judge_task.parsed_response
+      return unless parsed.is_a?(Hash)
+      parsed["reasoning"]
+    end
+    def comparative_winner_label(winner_letter)
+      new_response_letter = metadata&.dig("new_response_letter")
+      return winner_letter unless new_response_letter
+      if winner_letter == new_response_letter
+        I18n.t("raif.admin.prompt_studio.batch_runs.judge.new_response")
+      else
+        I18n.t("raif.admin.prompt_studio.batch_runs.judge.original_response")
+      end
+    end
+  private
+    def create_and_run_task
+      new_task = source_task.class.new(
+        creator: source_task.creator,
+        source: source_task,
+        llm_model_key: batch_run.llm_model_key,
+        available_model_tools: source_task.available_model_tools,
+        run_with: source_task.run_with,
+        prompt_studio_run: true,
+        started_at: Time.current
+      )
+      new_task.assign_attributes(source_task.prompt_studio_task_attributes)
+      new_task.save!
+      update!(result_task_id: new_task.id)
+      new_task.run
+      new_task
+    end
+    def run_judge_if_configured(new_task)
+      return unless batch_run.has_judge? && new_task.completed?
+      update!(status: "judging")
+      broadcast_item
+      judge_result = invoke_judge(new_task)
+      update!(judge_task_id: judge_result.id)
+    end
+    def invoke_judge(new_task)
+      judge_class = batch_run.judge_class
+      config = batch_run.judge_config
+      judge_args = {
+        creator: source_task.creator,
+        prompt_studio_run: true,
+        llm_model_key: batch_run.judge_llm_model_key
+      }
+      judge_args.merge!(source_task.prompt_studio_task_attributes)
+      if config["include_original_prompt_as_context"]
+        judge_args[:additional_context] =
+          "The content being evaluated was generated in response to the following prompt:\n\n#{source_task.prompt}"
+      end
+      case batch_run.judge_type
+      when "Raif::Evals::LlmJudges::Binary"
+        judge_class.run(
+          content_to_judge: new_task.raw_response,
+          criteria: config["criteria"],
+          strict_mode: config["strict_mode"],
+          **judge_args
+        )
+      when "Raif::Evals::LlmJudges::Scored"
+        rubric = Raif::Evals::ScoringRubric.send(config["scoring_rubric"])
+        judge_class.run(
+          content_to_judge: new_task.raw_response,
+          scoring_rubric: rubric,
+          **judge_args
+        )
+      when "Raif::Evals::LlmJudges::Comparative"
+        result = judge_class.run(
+          content_to_judge: new_task.raw_response,
+          over_content: source_task.raw_response,
+          comparison_criteria: config["comparison_criteria"],
+          **judge_args
+        )
+        # Store which letter was assigned to the new response so we can display
+        # "Winner: New Response" / "Winner: Original Response" instead of "A"/"B"
+        update!(metadata: metadata.merge("new_response_letter" => result.expected_winner))
+        result
+      when "Raif::Evals::LlmJudges::Summarization"
+        judge_class.run(
+          original_content: source_task.prompt,
+          summary: new_task.raw_response,
+          **judge_args
+        )
+      end
+    end
+    def broadcast_item
+      Turbo::StreamsChannel.broadcast_replace_to(
+        batch_run,
+        target: dom_id(self),
+        partial: "raif/admin/prompt_studio/batch_runs/batch_run_item",
+        locals: { item: self }
+      )
+    end
+    def broadcast_progress
+      batch_run.reload
+      Turbo::StreamsChannel.broadcast_replace_to(
+        batch_run,
+        target: dom_id(batch_run, :progress),
+        partial: "raif/admin/prompt_studio/batch_runs/progress",
+        locals: { batch_run: batch_run }
+      )
+    end
+  end
+end