RubyGems - raif - Versions diffs - 1.2.1 → 1.3.0 - Mend

raif 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

checksums.yaml +4 -4
data/README.md +29 -935
data/app/assets/builds/raif_admin.css +5 -1
data/app/assets/images/raif-logo-white.svg +8 -0
data/app/assets/stylesheets/raif_admin.scss +4 -0
data/app/jobs/raif/conversation_entry_job.rb +1 -1
data/app/models/raif/agents/re_act_step.rb +1 -2
data/app/models/raif/concerns/has_llm.rb +1 -1
data/app/models/raif/concerns/task_run_args.rb +62 -0
data/app/models/raif/conversation.rb +8 -0
data/app/models/raif/conversation_entry.rb +6 -9
data/app/models/raif/llm.rb +1 -1
data/app/models/raif/llms/open_router.rb +47 -4
data/app/models/raif/task.rb +22 -9
data/app/views/layouts/raif/admin.html.erb +3 -1
data/app/views/raif/conversation_entries/_form.html.erb +1 -1
data/app/views/raif/conversations/_full_conversation.html.erb +3 -6
data/app/views/raif/conversations/_initial_chat_message.html.erb +5 -0
data/config/locales/en.yml +8 -0
data/db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb +13 -0
data/db/migrate/20250811171150_make_raif_task_creator_optional.rb +8 -0
data/exe/raif +7 -0
data/lib/generators/raif/agent/agent_generator.rb +22 -7
data/lib/generators/raif/agent/templates/agent.rb.tt +20 -24
data/lib/generators/raif/agent/templates/agent_eval_set.rb.tt +48 -0
data/lib/generators/raif/agent/templates/application_agent.rb.tt +0 -2
data/lib/generators/raif/base_generator.rb +19 -0
data/lib/generators/raif/conversation/conversation_generator.rb +21 -2
data/lib/generators/raif/conversation/templates/application_conversation.rb.tt +0 -2
data/lib/generators/raif/conversation/templates/conversation.rb.tt +29 -33
data/lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt +70 -0
data/lib/generators/raif/eval_set/eval_set_generator.rb +28 -0
data/lib/generators/raif/eval_set/templates/eval_set.rb.tt +21 -0
data/lib/generators/raif/evals/setup/setup_generator.rb +47 -0
data/lib/generators/raif/install/install_generator.rb +15 -0
data/lib/generators/raif/install/templates/initializer.rb +14 -3
data/lib/generators/raif/model_tool/model_tool_generator.rb +5 -2
data/lib/generators/raif/model_tool/templates/model_tool.rb.tt +78 -76
data/lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt +10 -0
data/lib/generators/raif/task/task_generator.rb +22 -3
data/lib/generators/raif/task/templates/application_task.rb.tt +0 -2
data/lib/generators/raif/task/templates/task.rb.tt +55 -59
data/lib/generators/raif/task/templates/task_eval_set.rb.tt +54 -0
data/lib/raif/cli/base.rb +39 -0
data/lib/raif/cli/evals.rb +47 -0
data/lib/raif/cli/evals_setup.rb +27 -0
data/lib/raif/cli.rb +67 -0
data/lib/raif/configuration.rb +23 -9
data/lib/raif/engine.rb +2 -1
data/lib/raif/evals/eval.rb +30 -0
data/lib/raif/evals/eval_set.rb +111 -0
data/lib/raif/evals/eval_sets/expectations.rb +53 -0
data/lib/raif/evals/eval_sets/llm_judge_expectations.rb +255 -0
data/lib/raif/evals/expectation_result.rb +39 -0
data/lib/raif/evals/llm_judge.rb +32 -0
data/lib/raif/evals/llm_judges/binary.rb +94 -0
data/lib/raif/evals/llm_judges/comparative.rb +89 -0
data/lib/raif/evals/llm_judges/scored.rb +63 -0
data/lib/raif/evals/llm_judges/summarization.rb +166 -0
data/lib/raif/evals/run.rb +201 -0
data/lib/raif/evals/scoring_rubric.rb +174 -0
data/lib/raif/evals.rb +26 -0
data/lib/raif/llm_registry.rb +33 -0
data/lib/raif/migration_checker.rb +3 -3
data/lib/raif/utils/colors.rb +23 -0
data/lib/raif/utils.rb +1 -0
data/lib/raif/version.rb +1 -1
data/lib/raif.rb +4 -0
data/spec/support/current_temperature_test_tool.rb +34 -0
data/spec/support/test_conversation.rb +1 -1
metadata +37 -3

data/lib/raif/evals/scoring_rubric.rb ADDED Viewed

@@ -0,0 +1,174 @@
+# frozen_string_literal: true
+module Raif
+  module Evals
+    # ScoringRubric describes evaluation criteria as a list of scoring levels.
+    # A level carries either a single score or a score range, together with text
+    # explaining what qualifies a response for that score.
+    #
+    # @example Creating a custom rubric
+    #   rubric = ScoringRubric.new(
+    #     name: :technical_accuracy,
+    #     description: "Evaluates technical correctness and precision",
+    #     levels: [
+    #       { score_range: (9..10), description: "Technically perfect with no errors" },
+    #       { score_range: (7..8), description: "Mostly correct with minor technical issues" },
+    #       { score_range: (5..6), description: "Generally correct but some technical problems" },
+    #       { score_range: (3..4), description: "Significant technical errors present" },
+    #       { score_range: (0..2), description: "Technically incorrect or misleading" }
+    #     ]
+    #   )
+    #
+    # @example Integer scoring levels
+    #   rubric = ScoringRubric.new(
+    #     name: :technical_accuracy,
+    #     description: "Evaluates technical correctness and precision",
+    #     levels: [
+    #       { score: 5, description: "Technically perfect with no errors" },
+    #       { score: 4, description: "Mostly correct with minor technical issues" },
+    #       { score: 3, description: "Generally correct but some technical problems" },
+    #       { score: 2, description: "Significant technical errors present" },
+    #       { score: 1, description: "Mostly incorrect or misleading" },
+    #       { score: 0, description: "Completely incorrect or misleading" }
+    #     ]
+    #   )
+    #
+    # @example Using built-in rubrics
+    #   accuracy_rubric = ScoringRubric.accuracy
+    #   helpfulness_rubric = ScoringRubric.helpfulness
+    #   clarity_rubric = ScoringRubric.clarity
+    #
+    class ScoringRubric
+      # @!attribute [r] name
+      #   @return [Symbol] The rubric's identifier name
+      # @!attribute [r] description
+      #   @return [String] Human-readable description of what this rubric evaluates
+      # @!attribute [r] levels
+      #   @return [Array<Hash>] Array of scoring level definitions
+      attr_reader :name, :description, :levels
+      # Builds a rubric from its identifier, description and scoring levels.
+      # The arguments are stored as given; levels are not checked here.
+      #
+      # @param name [Symbol] Identifier for this rubric (e.g., :accuracy, :helpfulness)
+      # @param description [String] Human-readable description of what this rubric evaluates
+      # @param levels [Array<Hash>] Scoring level definitions. Each level must contain
+      #   either :score (Integer) or :score_range (Range), plus :description (String)
+      def initialize(name:, description:, levels:)
+        @name = name
+        @description = description
+        @levels = levels
+      end
+      # Renders the rubric as text suitable for inclusion in an LLM prompt:
+      # the description, a "Scoring levels:" heading, then one "- <score>: <text>"
+      # line per level. Surrounding whitespace is stripped from the result.
+      #
+      # @return [String] Formatted rubric text
+      #
+      # @example Output format
+      #   "Evaluates factual correctness and precision
+      #
+      #   Scoring levels:
+      #   - 9-10: Completely accurate with no errors
+      #   - 7-8: Mostly accurate with minor imprecisions
+      #   - 5-6: Generally accurate but some notable errors"
+      #
+      # @raise [ArgumentError] If a level has no :score key and its :score_range is not a Range
+      def to_prompt
+        header = "#{description}\n\nScoring levels:\n"
+        level_lines = levels.map do |level|
+          "- #{score_label(level)}: #{level[:description]}\n"
+        end
+        "#{header}#{level_lines.join}".strip
+      end
+      # Built-in rubric for factual accuracy and correctness: whether the
+      # information is correct, precise, and free from errors.
+      #
+      # @return [ScoringRubric] Pre-configured accuracy rubric (1-5 scale)
+      #
+      # @example
+      #   rubric = ScoringRubric.accuracy
+      #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+      def self.accuracy
+        five_point_rubric(
+          :accuracy,
+          "Evaluates factual correctness and precision",
+          [
+            "Completely accurate with no errors",
+            "Mostly accurate with minor imprecisions",
+            "Generally accurate but some notable errors",
+            "Significant inaccuracies present",
+            "Mostly or entirely inaccurate"
+          ]
+        )
+      end
+      # Built-in rubric for how well a response addresses user needs: whether
+      # it is useful, relevant, and helps the user accomplish their goals.
+      #
+      # @return [ScoringRubric] Pre-configured helpfulness rubric (1-5 scale)
+      #
+      # @example
+      #   rubric = ScoringRubric.helpfulness
+      #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+      def self.helpfulness
+        five_point_rubric(
+          :helpfulness,
+          "Evaluates how well the response addresses user needs",
+          [
+            "Extremely helpful, fully addresses the need",
+            "Very helpful with good coverage",
+            "Moderately helpful but missing some aspects",
+            "Somewhat helpful but significant gaps",
+            "Not helpful or misleading"
+          ]
+        )
+      end
+      # Built-in rubric for clarity and comprehensibility: how easy the content
+      # is to understand and whether it is well organized.
+      #
+      # @return [ScoringRubric] Pre-configured clarity rubric (1-5 scale)
+      #
+      # @example
+      #   rubric = ScoringRubric.clarity
+      #   expect_llm_judge_score(response, scoring_rubric: rubric, min_passing_score: 4)
+      def self.clarity
+        five_point_rubric(
+          :clarity,
+          "Evaluates clarity and comprehensibility",
+          [
+            "Crystal clear and easy to understand",
+            "Clear with minor ambiguities",
+            "Generally clear but some confusion",
+            "Unclear in significant ways",
+            "Very unclear or incomprehensible"
+          ]
+        )
+      end
+      # Builds a rubric whose levels are scored 5 down to 1, pairing each score
+      # with the description at the same position in level_descriptions.
+      def self.five_point_rubric(name, description, level_descriptions)
+        levels = level_descriptions.each_with_index.map do |text, offset|
+          { score: 5 - offset, description: text }
+        end
+        new(name: name, description: description, levels: levels)
+      end
+      private_class_method :five_point_rubric
+      private
+      # Returns the score portion of a level line: the :score value when that key
+      # is present, otherwise "<min>-<max>" derived from the :score_range Range.
+      def score_label(level)
+        return level[:score] if level.key?(:score)
+        range = level[:score_range]
+        raise ArgumentError, "level must include :score or :score_range (Range)" unless range.is_a?(Range)
+        # An exclusive range (a...b) is shown with its last included integer.
+        upper = range.exclude_end? ? range.end - 1 : range.end
+        "#{range.begin}-#{upper}"
+      end
+    end
+  end
+end

data/lib/raif/evals.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+require "raif/evals/expectation_result"
+require "raif/evals/eval"
+require "raif/evals/eval_set"
+require "raif/evals/run"
+require "raif/evals/llm_judge"
+require "raif/evals/llm_judges/binary"
+require "raif/evals/llm_judges/comparative"
+require "raif/evals/llm_judges/scored"
+require "raif/evals/llm_judges/summarization"
+require "raif/evals/scoring_rubric"
+module Raif
+  module Evals
+    # Empty namespace modules used to organize eval sets
+    module Tasks; end
+    module Conversations; end
+    module Agents; end
+  end
+end

data/lib/raif/llm_registry.rb CHANGED Viewed

@@ -113,6 +113,27 @@ module Raif
         output_token_cost: 4.4 / 1_000_000,
         model_provider_settings: { supports_temperature: false },
       },
+      {
+        key: :open_ai_gpt_5,
+        api_name: "gpt-5",
+        input_token_cost: 1.25 / 1_000_000,
+        output_token_cost: 10.0 / 1_000_000,
+        model_provider_settings: { supports_temperature: false },
+      },
+      {
+        key: :open_ai_gpt_5_mini,
+        api_name: "gpt-5-mini",
+        input_token_cost: 0.25 / 1_000_000,
+        output_token_cost: 2.0 / 1_000_000,
+        model_provider_settings: { supports_temperature: false },
+      },
+      {
+        key: :open_ai_gpt_5_nano,
+        api_name: "gpt-5-nano",
+        input_token_cost: 0.05 / 1_000_000,
+        output_token_cost: 0.4 / 1_000_000,
+        model_provider_settings: { supports_temperature: false },
+      }
     ]
     open_ai_responses_models = open_ai_models.dup.map.with_index do |model, _index|
@@ -321,6 +342,18 @@ module Raif
           input_token_cost: 0.27 / 1_000_000,
           output_token_cost: 1.1 / 1_000_000,
         },
+        {
+          key: :open_router_open_ai_gpt_oss_120b,
+          api_name: "gpt-oss-120b",
+          input_token_cost: 0.15 / 1_000_000,
+          output_token_cost: 0.6 / 1_000_000,
+        },
+        {
+          key: :open_router_open_ai_gpt_oss_20b,
+          api_name: "gpt-oss-20b",
+          input_token_cost: 0.05 / 1_000_000,
+          output_token_cost: 0.2 / 1_000_000,
+        }
       ]
     }
   end

data/lib/raif/migration_checker.rb CHANGED Viewed

@@ -53,8 +53,7 @@ module Raif
       end
       def build_warning_message(uninstalled_migration_names)
-        <<~WARNING
-          \e[33m
+        msg = <<~WARNING
           ⚠️  RAIF MIGRATION WARNING ⚠️
           The following Raif migrations have not been run in your application:
@@ -66,8 +65,9 @@ module Raif
             rails raif:install:migrations
             rails db:migrate
-          \e[0m
         WARNING
+        Raif::Utils::Colors.yellow(msg)
       end
     end
   end

data/lib/raif/utils/colors.rb ADDED Viewed

@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+module Raif
+  module Utils
+    # Helpers that wrap text in an ANSI color escape sequence and append the
+    # reset sequence ("\e[0m") so following output is unaffected.
+    module Colors
+      class << self
+        # @param text [#to_s] text to wrap
+        # @return [String] text wrapped in color code 32
+        def green(text)
+          colorize(32, text)
+        end
+        # @param text [#to_s] text to wrap
+        # @return [String] text wrapped in color code 31
+        def red(text)
+          colorize(31, text)
+        end
+        # @param text [#to_s] text to wrap
+        # @return [String] text wrapped in color code 33
+        def yellow(text)
+          colorize(33, text)
+        end
+        # @param text [#to_s] text to wrap
+        # @return [String] text wrapped in color code 34
+        def blue(text)
+          colorize(34, text)
+        end
+        private
+        # Prefixes text with the escape sequence for the given color code and
+        # suffixes it with the reset sequence.
+        def colorize(code, text)
+          "\e[#{code}m#{text}\e[0m"
+        end
+      end
+    end
+  end
+end

data/lib/raif/utils.rb CHANGED Viewed

@@ -4,4 +4,5 @@ module Raif::Utils
   require "raif/utils/readable_content_extractor"
   require "raif/utils/html_to_markdown_converter"
   require "raif/utils/html_fragment_processor"
+  require "raif/utils/colors"
 end

data/lib/raif/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Raif
-  VERSION = "1.2.1"
+  VERSION = "1.3.0"
 end

data/lib/raif.rb CHANGED Viewed

@@ -37,4 +37,8 @@ module Raif
   def self.logger
     @logger ||= Rails.logger
   end
+  # Reports whether the RAIF_RUNNING_EVALS environment variable is set to
+  # exactly "true" (case-sensitive).
+  #
+  # @return [Boolean]
+  def self.running_evals?
+    ENV.fetch("RAIF_RUNNING_EVALS", nil) == "true"
+  end
 end

data/spec/support/current_temperature_test_tool.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+# Spec-support model tool that records a fixed temperature of 72 as the result
+# of every invocation, regardless of the zip code supplied.
+class Raif::ModelTools::CurrentTemperatureTestTool < Raif::ModelTool
+  tool_arguments_schema do
+    string :zip_code, description: "The zip code to get the current temperature for"
+  end
+  tool_description do
+    "A tool to get the current temperature for a given zip code"
+  end
+  # Stores the fixed result on the invocation and returns the invocation's result.
+  def self.process_invocation(tool_invocation)
+    fixed_result = { temperature: 72 }
+    tool_invocation.update!(result: fixed_result)
+    tool_invocation.result
+  end
+  # This tool always reports an observation back to the model.
+  def self.triggers_observation_to_model?
+    true
+  end
+  # Builds the observation sentence from the invocation's "zip_code" argument
+  # and the "temperature" entry of its result.
+  def self.observation_for_invocation(tool_invocation)
+    zip = tool_invocation.tool_arguments["zip_code"]
+    degrees = tool_invocation.result["temperature"]
+    "The current temperature for zip code #{zip} is #{degrees} degrees Fahrenheit."
+  end
+end

data/spec/support/test_conversation.rb CHANGED Viewed

@@ -12,7 +12,7 @@ class Raif::TestConversation < Raif::Conversation
   end
   def process_model_response_message(message:, entry:)
-    message.gsub("jerk", "[REDACTED]")
+    message&.gsub("jerk", "[REDACTED]")
   end
 end

metadata CHANGED Viewed

@@ -1,12 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: raif
 version: !ruby/object:Gem::Version
-  version: 1.2.1
+  version: 1.3.0
 platform: ruby
 authors:
 - Ben Roesch
 - Brian Leslie
-bindir: bin
+bindir: exe
 cert_chain: []
 date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
@@ -142,7 +142,8 @@ description: Raif (Ruby AI Framework) is a Rails engine that helps you add AI-po
 email:
 - ben@cultivatelabs.com
 - brian@cultivatelabs.com
-executables: []
+executables:
+- raif
 extensions: []
 extra_rdoc_files: []
 files:
@@ -152,6 +153,7 @@ files:
 - app/assets/builds/raif.css
 - app/assets/builds/raif_admin.css
 - app/assets/config/raif_manifest.js
+- app/assets/images/raif-logo-white.svg
 - app/assets/javascript/raif.js
 - app/assets/javascript/raif/controllers/conversations_controller.js
 - app/assets/javascript/raif/stream_actions/raif_scroll_to_bottom.js
@@ -198,6 +200,7 @@ files:
 - app/models/raif/concerns/llms/open_ai_completions/tool_formatting.rb
 - app/models/raif/concerns/llms/open_ai_responses/message_formatting.rb
 - app/models/raif/concerns/llms/open_ai_responses/tool_formatting.rb
+- app/models/raif/concerns/task_run_args.rb
 - app/models/raif/conversation.rb
 - app/models/raif/conversation_entry.rb
 - app/models/raif/embedding_model.rb
@@ -260,6 +263,7 @@ files:
 - app/views/raif/conversation_entries/new.turbo_stream.erb
 - app/views/raif/conversations/_available_user_tools.html.erb
 - app/views/raif/conversations/_full_conversation.html.erb
+- app/views/raif/conversations/_initial_chat_message.html.erb
 - app/views/raif/conversations/show.html.erb
 - config/i18n-tasks.yml
 - config/importmap.rb
@@ -276,21 +280,36 @@ files:
 - db/migrate/20250527213016_add_response_id_and_response_array_to_model_completions.rb
 - db/migrate/20250603140622_add_citations_to_raif_model_completions.rb
 - db/migrate/20250603202013_add_stream_response_to_raif_model_completions.rb
+- db/migrate/20250804013843_add_task_run_args_to_raif_tasks.rb
+- db/migrate/20250811171150_make_raif_task_creator_optional.rb
+- exe/raif
 - lib/generators/raif/agent/agent_generator.rb
 - lib/generators/raif/agent/templates/agent.rb.tt
+- lib/generators/raif/agent/templates/agent_eval_set.rb.tt
 - lib/generators/raif/agent/templates/application_agent.rb.tt
+- lib/generators/raif/base_generator.rb
 - lib/generators/raif/conversation/conversation_generator.rb
 - lib/generators/raif/conversation/templates/application_conversation.rb.tt
 - lib/generators/raif/conversation/templates/conversation.rb.tt
+- lib/generators/raif/conversation/templates/conversation_eval_set.rb.tt
+- lib/generators/raif/eval_set/eval_set_generator.rb
+- lib/generators/raif/eval_set/templates/eval_set.rb.tt
+- lib/generators/raif/evals/setup/setup_generator.rb
 - lib/generators/raif/install/install_generator.rb
 - lib/generators/raif/install/templates/initializer.rb
 - lib/generators/raif/model_tool/model_tool_generator.rb
 - lib/generators/raif/model_tool/templates/model_tool.rb.tt
+- lib/generators/raif/model_tool/templates/model_tool_invocation_partial.html.erb.tt
 - lib/generators/raif/task/task_generator.rb
 - lib/generators/raif/task/templates/application_task.rb.tt
 - lib/generators/raif/task/templates/task.rb.tt
+- lib/generators/raif/task/templates/task_eval_set.rb.tt
 - lib/generators/raif/views_generator.rb
 - lib/raif.rb
+- lib/raif/cli.rb
+- lib/raif/cli/base.rb
+- lib/raif/cli/evals.rb
+- lib/raif/cli/evals_setup.rb
 - lib/raif/configuration.rb
 - lib/raif/embedding_model_registry.rb
 - lib/raif/engine.rb
@@ -304,18 +323,33 @@ files:
 - lib/raif/errors/open_ai/json_schema_error.rb
 - lib/raif/errors/streaming_error.rb
 - lib/raif/errors/unsupported_feature_error.rb
+- lib/raif/evals.rb
+- lib/raif/evals/eval.rb
+- lib/raif/evals/eval_set.rb
+- lib/raif/evals/eval_sets/expectations.rb
+- lib/raif/evals/eval_sets/llm_judge_expectations.rb
+- lib/raif/evals/expectation_result.rb
+- lib/raif/evals/llm_judge.rb
+- lib/raif/evals/llm_judges/binary.rb
+- lib/raif/evals/llm_judges/comparative.rb
+- lib/raif/evals/llm_judges/scored.rb
+- lib/raif/evals/llm_judges/summarization.rb
+- lib/raif/evals/run.rb
+- lib/raif/evals/scoring_rubric.rb
 - lib/raif/json_schema_builder.rb
 - lib/raif/languages.rb
 - lib/raif/llm_registry.rb
 - lib/raif/migration_checker.rb
 - lib/raif/rspec.rb
 - lib/raif/utils.rb
+- lib/raif/utils/colors.rb
 - lib/raif/utils/html_fragment_processor.rb
 - lib/raif/utils/html_to_markdown_converter.rb
 - lib/raif/utils/readable_content_extractor.rb
 - lib/raif/version.rb
 - lib/tasks/raif_tasks.rake
 - spec/support/complex_test_tool.rb
+- spec/support/current_temperature_test_tool.rb
 - spec/support/rspec_helpers.rb
 - spec/support/test_conversation.rb
 - spec/support/test_embedding_model.rb