ruby_llm-semantic_cache 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
+ # frozen_string_literal: true
+
+ require "digest"
+
+ module RubyLLM
+   module SemanticCache
+     # Middleware wrapper for RubyLLM::Chat that automatically caches responses
+     #
+     # @example Basic usage
+     #   chat = RubyLLM.chat(model: "gpt-5.2")
+     #   cached_chat = RubyLLM::SemanticCache.wrap(chat)
+     #   cached_chat.ask("What is 2+2?") # First call - executes the LLM
+     #
+     # @example With a custom threshold
+     #   cached_chat = RubyLLM::SemanticCache.wrap(chat, threshold: 0.95)
+     #
+     class Middleware
+       # Methods to delegate directly to the wrapped chat (no caching)
+       DELEGATED_METHODS = %i[
+         model messages tools params headers schema
+         with_instructions with_tool with_tools with_model
+         with_temperature with_context with_params with_headers with_schema
+         on_new_message on_end_message on_tool_call on_tool_result
+         each reset_messages!
+       ].freeze
+
+       attr_reader :chat
+
+       # @param chat [RubyLLM::Chat] the chat instance to wrap
+       # @param threshold [Float, nil] similarity threshold override
+       # @param ttl [Integer, nil] TTL override in seconds
+       # @param on_cache_hit [Proc, nil] callback invoked on a cache hit; receives (chat, user_message, cached_response)
+       # @param max_messages [Integer, :unlimited, false, nil] max conversation messages before skipping the cache
+       #   - Integer: skip the cache after N messages (default: 1, so only the first message is cached)
+       #   - :unlimited or false: cache all messages regardless of conversation length
+       #   - nil: use the config default
+       def initialize(chat, threshold: nil, ttl: nil, on_cache_hit: nil, max_messages: nil)
+         @chat = chat
+         @threshold = threshold
+         @ttl = ttl
+         @on_cache_hit = on_cache_hit
+         @max_messages = max_messages
+       end
+
+       # Ask a question with automatic caching
+       # @param message [String] the message to send
+       # @param with [Object] attachments to include
+       # @return [RubyLLM::Message] the response message
+       def ask(message = nil, with: nil, &block)
+         # Skip caching if the message has attachments
+         return @chat.ask(message, with: with, &block) if with
+
+         # Skip caching for tool-enabled chats (responses may vary)
+         return @chat.ask(message, with: with, &block) if @chat.tools.any?
+
+         # Skip caching if the conversation exceeds max_messages (excluding system messages)
+         return @chat.ask(message, with: with, &block) if conversation_too_long?
+
+         # Skip caching for streaming (too complex to handle correctly)
+         return @chat.ask(message, with: with, &block) if block_given?
+
+         # Use the cache for non-streaming requests
+         cache_key = build_cache_key(message)
+
+         cached = cache_lookup(cache_key)
+         if cached
+           handle_cache_hit(message, cached)
+           return cached
+         end
+
+         # Execute the actual LLM call
+         response = @chat.ask(message)
+
+         # Cache the response
+         store_in_cache(cache_key, response)
+         RubyLLM::SemanticCache.record_miss!
+
+         response
+       end
+
+       alias say ask
+
+       # Delegate other methods to the wrapped chat
+       DELEGATED_METHODS.each do |method|
+         define_method(method) do |*args, **kwargs, &block|
+           result = @chat.send(method, *args, **kwargs, &block)
+           # If the method returns the chat (for chaining), return self instead
+           result.equal?(@chat) ? self : result
+         end
+       end
+
+       private
+
+       def conversation_too_long?
+         max = effective_max_messages
+         return false if max.nil?
+
+         # Count non-system messages in the conversation
+         conversation_length = @chat.messages.count { |m| m.role != :system }
+         conversation_length >= max
+       end
+
+       def effective_max_messages
+         # Use the instance setting if provided, otherwise the config default
+         max = @max_messages.nil? ? RubyLLM::SemanticCache.config.max_messages : @max_messages
+
+         # :unlimited or false means no limit
+         return nil if max == :unlimited || max == false
+
+         max
+       end
+
+       def build_cache_key(message)
+         # Include the model and system instructions in the cache key
+         parts = []
+
+         # Add the model ID so different models get separate cache entries
+         model_id = @chat.model&.id || @chat.model
+         parts << "[MODEL:#{model_id}]" if model_id
+
+         # Add system instructions
+         system_messages = @chat.messages.select { |m| m.role == :system }
+         system_context = system_messages.map { |m| extract_text(m.content) }.join("\n")
+         parts << "[SYSTEM]\n#{system_context}" unless system_context.empty?
+
+         # Add the current message
+         parts << "[USER]\n#{message}"
+
+         parts.join("\n---\n")
+       end
+
+       def extract_text(content)
+         case content
+         when String
+           content
+         when ->(c) { c.respond_to?(:text) }
+           content.text
+         else
+           content.to_s
+         end
+       end
+
+       def handle_cache_hit(user_message, cached_response)
+         if @on_cache_hit
+           # Let the callback handle persistence (for ActiveRecord-backed chats)
+           @on_cache_hit.call(@chat, user_message, cached_response)
+         else
+           # Default: append to the in-memory messages array for conversation continuity
+           add_message_to_chat(:user, user_message)
+           add_message_to_chat(:assistant, cached_response.content, cached_response)
+         end
+       end
+
+       def add_message_to_chat(role, content, original_message = nil)
+         return unless defined?(RubyLLM::Message)
+
+         message = if role == :user
+                     RubyLLM::Message.new(role: :user, content: content)
+                   elsif original_message.is_a?(RubyLLM::Message)
+                     original_message
+                   else
+                     RubyLLM::Message.new(role: role, content: content)
+                   end
+
+         @chat.messages << message if @chat.messages.respond_to?(:<<)
+       end
+
+       def cache_lookup(key)
+         embedding = RubyLLM::SemanticCache.embedding_generator.generate(key)
+         threshold = @threshold || RubyLLM::SemanticCache.config.similarity_threshold
+
+         matches = RubyLLM::SemanticCache.vector_store.search(embedding, limit: 1)
+
+         if matches.any? && matches.first[:similarity] >= threshold
+           entry_data = RubyLLM::SemanticCache.cache_store.get(matches.first[:id])
+           return nil unless entry_data
+
+           RubyLLM::SemanticCache.record_hit!
+           Serializer.deserialize(entry_data[:response])
+         end
+       end
+
+       def store_in_cache(key, response)
+         embedding = RubyLLM::SemanticCache.embedding_generator.generate(key)
+         ttl = @ttl || RubyLLM::SemanticCache.config.ttl_seconds
+
+         entry = Entry.new(
+           query: key,
+           response: Serializer.serialize(response),
+           embedding: embedding,
+           metadata: { model: @chat.model&.id }
+         )
+
+         RubyLLM::SemanticCache.vector_store.add(entry.id, embedding)
+         RubyLLM::SemanticCache.cache_store.set(entry.id, entry.to_h, ttl: ttl)
+       end
+     end
+   end
+ end
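
For orientation, a short usage sketch of the middleware above. It uses only calls shown in this file's own docs (RubyLLM.chat, RubyLLM::SemanticCache.wrap, ask); the embedding generator, vector store, and cache store are assumed to be configured elsewhere in the gem:

    chat   = RubyLLM.chat(model: "gpt-5.2")
    cached = RubyLLM::SemanticCache.wrap(chat, threshold: 0.95)

    cached.ask("What is 2+2?")         # miss: calls the LLM, caches the response
    cached.ask("What's two plus two?") # close enough in embedding space: served from cache
    cached.ask("Tell me a story") { |chunk| print chunk } # streaming bypasses the cache

Per build_cache_key, the first call's key is "[MODEL:gpt-5.2]\n---\n[USER]\nWhat is 2+2?", so switching models or system instructions never reuses another configuration's entries.
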
@@ -0,0 +1,263 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module SemanticCache
+     # Scoped cache wrapper for multi-tenant scenarios.
+     # Each scoped instance maintains its own stores for true isolation.
+     #
+     # @example
+     #   support = RubyLLM::SemanticCache::Scoped.new(namespace: "support")
+     #   sales = RubyLLM::SemanticCache::Scoped.new(namespace: "sales")
+     #
+     #   support.store(query: "How to reset password?", response: "...")
+     #   sales.store(query: "What is the price?", response: "...")
+     #
+     class Scoped
+       attr_reader :namespace
+
+       def initialize(namespace:)
+         @namespace = namespace
+         @vector_store = nil
+         @cache_store = nil
+         @hits = 0
+         @misses = 0
+       end
+
+       def fetch(query, threshold: nil, ttl: nil, &block)
+         raise ArgumentError, "Block required" unless block_given?
+
+         threshold ||= config.similarity_threshold
+         ttl ||= config.ttl_seconds
+
+         embedding = embedding_generator.generate(query)
+         matches = vector_store.search(embedding, limit: 1)
+
+         if matches.any? && matches.first[:similarity] >= threshold
+           entry_data = cache_store.get(matches.first[:id])
+           # Only count a hit when the entry is still present (it may have expired)
+           if entry_data
+             @hits += 1
+             return Serializer.deserialize(entry_data[:response])
+           end
+         end
+
+         @misses += 1
+         response = block.call
+
+         store(query: query, response: response, embedding: embedding, ttl: ttl)
+         response
+       end
+
+       def store(query:, response:, embedding: nil, metadata: {}, ttl: nil)
+         embedding ||= embedding_generator.generate(query)
+         ttl ||= config.ttl_seconds
+
+         entry = Entry.new(
+           query: query,
+           response: Serializer.serialize(response),
+           embedding: embedding,
+           metadata: metadata
+         )
+
+         vector_store.add(entry.id, embedding)
+         cache_store.set(entry.id, entry.to_h, ttl: ttl)
+
+         entry
+       end
+
+       def search(query, limit: 5)
+         embedding = embedding_generator.generate(query)
+         matches = vector_store.search(embedding, limit: limit)
+
+         matches.filter_map do |match|
+           entry_data = cache_store.get(match[:id])
+           next unless entry_data
+
+           {
+             query: entry_data[:query],
+             response: Serializer.deserialize(entry_data[:response]),
+             similarity: match[:similarity],
+             metadata: entry_data[:metadata]
+           }
+         end
+       end
+
+       def exists?(query, threshold: nil)
+         threshold ||= config.similarity_threshold
+         embedding = embedding_generator.generate(query)
+         matches = vector_store.search(embedding, limit: 1)
+         matches.any? && matches.first[:similarity] >= threshold
+       end
+
+       def delete(query, threshold: nil)
+         threshold ||= config.similarity_threshold
+         embedding = embedding_generator.generate(query)
+         matches = vector_store.search(embedding, limit: 1)
+
+         return false unless matches.any? && matches.first[:similarity] >= threshold
+
+         id = matches.first[:id]
+         vector_store.delete(id)
+         cache_store.delete(id)
+         true
+       end
+
+       def invalidate(query, threshold: nil, limit: 100)
+         threshold ||= config.similarity_threshold
+         embedding = embedding_generator.generate(query)
+         matches = vector_store.search(embedding, limit: limit)
+
+         count = 0
+         matches.each do |match|
+           next unless match[:similarity] >= threshold
+
+           vector_store.delete(match[:id])
+           cache_store.delete(match[:id])
+           count += 1
+         end
+
+         count
+       end
+
+       def clear!
+         vector_store.clear!
+         cache_store.clear!
+         @hits = 0
+         @misses = 0
+       end
+
+       def stats
+         {
+           hits: @hits,
+           misses: @misses,
+           hit_rate: hit_rate,
+           entries: cache_store.size
+         }
+       end
+
+       def wrap(chat, threshold: nil, ttl: nil, on_cache_hit: nil, max_messages: nil)
+         # For a scoped wrap, create a middleware that uses this scoped instance
+         ScopedMiddleware.new(
+           self,
+           chat,
+           threshold: threshold,
+           ttl: ttl,
+           on_cache_hit: on_cache_hit,
+           max_messages: max_messages
+         )
+       end
+
+       private
+
+       def config
+         RubyLLM::SemanticCache.config
+       end
+
+       def embedding_generator
+         RubyLLM::SemanticCache.embedding_generator
+       end
+
+       def vector_store
+         @vector_store ||= build_vector_store
+       end
+
+       def cache_store
+         @cache_store ||= build_cache_store
+       end
+
+       def build_vector_store
+         case config.vector_store
+         when :memory
+           # Memory stores are already isolated per Scoped instance, so no namespace is needed
+           VectorStores::Memory.new(config)
+         when :redis
+           require_relative "vector_stores/redis"
+           VectorStores::Redis.new(scoped_config)
+         else
+           raise Error, "Unknown vector store: #{config.vector_store}"
+         end
+       end
+
+       def build_cache_store
+         case config.cache_store
+         when :memory
+           CacheStores::Memory.new(config)
+         when :redis
+           require_relative "cache_stores/redis"
+           CacheStores::Redis.new(scoped_config)
+         else
+           raise Error, "Unknown cache store: #{config.cache_store}"
+         end
+       end
+
+       # Create a config-like object carrying the scoped namespace
+       def scoped_config
+         ScopedConfig.new(config, @namespace)
+       end
+
+       # Wrapper that delegates to the main config but overrides the namespace
+       class ScopedConfig
+         attr_reader :namespace
+
+         def initialize(config, namespace)
+           @config = config
+           @namespace = namespace
+         end
+
+         def method_missing(method, *args, &block)
+           @config.send(method, *args, &block)
+         end
+
+         def respond_to_missing?(method, include_private = false)
+           @config.respond_to?(method, include_private)
+         end
+       end
+
+       def hit_rate
+         total = @hits + @misses
+         return 0.0 if total.zero?
+
+         @hits.to_f / total
+       end
+     end
+
+     # Middleware that uses a scoped cache instance
+     class ScopedMiddleware < Middleware
+       def initialize(scoped, chat, threshold: nil, ttl: nil, on_cache_hit: nil, max_messages: nil)
+         super(chat, threshold: threshold, ttl: ttl, on_cache_hit: on_cache_hit, max_messages: max_messages)
+         @scoped = scoped
+       end
+
+       private
+
+       def cache_lookup(key)
+         embedding = RubyLLM::SemanticCache.embedding_generator.generate(key)
+         threshold = @threshold || RubyLLM::SemanticCache.config.similarity_threshold
+
+         matches = @scoped.send(:vector_store).search(embedding, limit: 1)
+
+         if matches.any? && matches.first[:similarity] >= threshold
+           entry_data = @scoped.send(:cache_store).get(matches.first[:id])
+           return nil unless entry_data
+
+           # Bump the scoped instance's hit counter
+           @scoped.instance_variable_set(:@hits, @scoped.instance_variable_get(:@hits) + 1)
+           Serializer.deserialize(entry_data[:response])
+         end
+       end
+
+       def store_in_cache(key, response)
+         embedding = RubyLLM::SemanticCache.embedding_generator.generate(key)
+         ttl = @ttl || RubyLLM::SemanticCache.config.ttl_seconds
+
+         entry = Entry.new(
+           query: key,
+           response: Serializer.serialize(response),
+           embedding: embedding,
+           metadata: { model: @chat.model&.id }
+         )
+
+         @scoped.send(:vector_store).add(entry.id, embedding)
+         @scoped.send(:cache_store).set(entry.id, entry.to_h, ttl: ttl)
+         # Bump the scoped instance's miss counter
+         @scoped.instance_variable_set(:@misses, @scoped.instance_variable_get(:@misses) + 1)
+       end
+     end
+   end
+ end
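
A sketch of the scoped API above, assuming embeddings are configured; the chat call inside the block is hypothetical:

    support = RubyLLM::SemanticCache::Scoped.new(namespace: "support")
    sales   = RubyLLM::SemanticCache::Scoped.new(namespace: "sales")

    # fetch runs the block only on a miss and stores its result
    answer = support.fetch("How do I reset my password?") do
      chat.ask("How do I reset my password?")
    end

    support.exists?("password reset steps") # true once similarity clears the threshold
    sales.exists?("password reset steps")   # false: each namespace builds its own stores
    support.stats # => { hits: 0, misses: 1, hit_rate: 0.0, entries: 1 }
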
@@ -0,0 +1,116 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module SemanticCache
+     # Handles serialization/deserialization of cached responses
+     module Serializer
+       class << self
+         def serialize(response)
+           if defined?(RubyLLM::Message) && response.is_a?(RubyLLM::Message)
+             serialize_message(response)
+           else
+             serialize_basic(response)
+           end
+         end
+
+         def deserialize(data)
+           return data unless data.is_a?(Hash)
+
+           type = data[:type] || data["type"]
+           value = data[:value] || data["value"]
+
+           case type
+           when "rubyllm_message"
+             deserialize_message(value)
+           when "nil"
+             nil
+           else
+             # "string", "hash", "object", and unknown types all carry their raw value
+             value
+           end
+         end
+
+         private
+
+         def serialize_basic(response)
+           case response
+           when String
+             { type: "string", value: response }
+           when Hash
+             { type: "hash", value: response }
+           when NilClass
+             { type: "nil", value: nil }
+           else
+             if response.respond_to?(:to_h)
+               { type: "object", class: response.class.name, value: response.to_h }
+             else
+               { type: "string", value: response.to_s }
+             end
+           end
+         end
+
+         def serialize_message(message)
+           {
+             type: "rubyllm_message",
+             value: {
+               role: message.role,
+               content: serialize_content(message.content),
+               model_id: message.model_id,
+               tool_calls: message.tool_calls,
+               tool_call_id: message.tool_call_id,
+               input_tokens: message.input_tokens,
+               output_tokens: message.output_tokens,
+               cached_tokens: message.cached_tokens,
+               cache_creation_tokens: message.cache_creation_tokens
+             }.compact
+           }
+         end
+
+         def serialize_content(content)
+           case content
+           when String
+             { type: "string", value: content }
+           when Hash
+             { type: "hash", value: content }
+           when ->(c) { defined?(RubyLLM::Content) && c.is_a?(RubyLLM::Content) }
+             { type: "rubyllm_content", value: content.to_h }
+           else
+             { type: "string", value: content.to_s }
+           end
+         end
+
+         def deserialize_message(value)
+           return value unless defined?(RubyLLM::Message)
+
+           content = deserialize_content(value[:content] || value["content"])
+           RubyLLM::Message.new(
+             role: (value[:role] || value["role"]).to_sym,
+             content: content,
+             model_id: value[:model_id] || value["model_id"],
+             tool_calls: value[:tool_calls] || value["tool_calls"],
+             tool_call_id: value[:tool_call_id] || value["tool_call_id"],
+             input_tokens: value[:input_tokens] || value["input_tokens"],
+             output_tokens: value[:output_tokens] || value["output_tokens"],
+             cached_tokens: value[:cached_tokens] || value["cached_tokens"],
+             cache_creation_tokens: value[:cache_creation_tokens] || value["cache_creation_tokens"]
+           )
+         end
+
+         def deserialize_content(data)
+           return data unless data.is_a?(Hash)
+
+           # All stored content types ("string", "hash", "rubyllm_content") carry their raw value
+           data[:value] || data["value"]
+         end
+       end
+     end
+   end
+ end
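
The serializer reduces everything to plain hashes so responses survive JSON-backed cache stores. A round-trip sketch, using the same RubyLLM::Message keywords the middleware itself uses:

    s = RubyLLM::SemanticCache::Serializer

    s.serialize("hello")                # => { type: "string", value: "hello" }
    s.deserialize(s.serialize("hello")) # => "hello"

    msg  = RubyLLM::Message.new(role: :assistant, content: "4")
    back = s.deserialize(s.serialize(msg))
    back.role    # => :assistant
    back.content # => "4"
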
@@ -0,0 +1,49 @@
+ # frozen_string_literal: true
+
+ module RubyLLM
+   module SemanticCache
+     module VectorStores
+       class Base
+         def initialize(config)
+           @config = config
+         end
+
+         # Add a vector with the given ID
+         # @param id [String] unique identifier
+         # @param embedding [Array<Float>] vector embedding
+         def add(id, embedding)
+           raise NotImplementedError
+         end
+
+         # Search for similar vectors
+         # @param embedding [Array<Float>] query vector
+         # @param limit [Integer] maximum number of results
+         # @return [Array<Hash>] array of { id:, similarity: } hashes
+         def search(embedding, limit: 5)
+           raise NotImplementedError
+         end
+
+         # Delete a vector by ID
+         # @param id [String] unique identifier
+         def delete(id)
+           raise NotImplementedError
+         end
+
+         # Clear all vectors
+         def clear!
+           raise NotImplementedError
+         end
+
+         # Check if the store is empty
+         def empty?
+           raise NotImplementedError
+         end
+
+         # Get the number of vectors stored
+         def size
+           raise NotImplementedError
+         end
+       end
+     end
+   end
+ end
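
Backends subclass Base and fill in this contract; search must return hashes shaped like { id:, similarity: }, sorted by descending similarity. A minimal illustrative subclass (the Null store below is hypothetical, not part of the gem) that disables matching entirely:

    module RubyLLM
      module SemanticCache
        module VectorStores
          # A no-op store: every lookup misses, so caching is effectively off
          class Null < Base
            def add(_id, _embedding); end

            def search(_embedding, limit: 5)
              []
            end

            def delete(_id); end

            def clear!; end

            def empty?
              true
            end

            def size
              0
            end
          end
        end
      end
    end
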
@@ -0,0 +1,85 @@
+ # frozen_string_literal: true
+
+ require_relative "base"
+
+ module RubyLLM
+   module SemanticCache
+     module VectorStores
+       class Memory < Base
+         def initialize(config)
+           super
+           @vectors = {}
+           @mutex = Mutex.new
+         end
+
+         def add(id, embedding)
+           @mutex.synchronize do
+             @vectors[id] = embedding
+           end
+         end
+
+         def search(embedding, limit: 5)
+           @mutex.synchronize do
+             return [] if @vectors.empty?
+
+             results = @vectors.map do |id, stored_embedding|
+               similarity = cosine_similarity(embedding, stored_embedding)
+               { id: id, similarity: similarity }
+             end
+
+             results
+               .sort_by { |r| -r[:similarity] }
+               .first(limit)
+           end
+         end
+
+         def delete(id)
+           @mutex.synchronize do
+             @vectors.delete(id)
+           end
+         end
+
+         def clear!
+           @mutex.synchronize do
+             @vectors.clear
+           end
+         end
+
+         def empty?
+           @mutex.synchronize do
+             @vectors.empty?
+           end
+         end
+
+         def size
+           @mutex.synchronize do
+             @vectors.size
+           end
+         end
+
+         private
+
+         def cosine_similarity(vec_a, vec_b)
+           return 0.0 if vec_a.nil? || vec_b.nil?
+           return 0.0 if vec_a.empty? || vec_b.empty?
+           return 0.0 if vec_a.length != vec_b.length
+
+           dot_product = 0.0
+           norm_a = 0.0
+           norm_b = 0.0
+
+           vec_a.each_with_index do |a, i|
+             b = vec_b[i]
+             dot_product += a * b
+             norm_a += a * a
+             norm_b += b * b
+           end
+
+           return 0.0 if norm_a.zero? || norm_b.zero?
+
+           dot_product / (Math.sqrt(norm_a) * Math.sqrt(norm_b))
+         end
+       end
+     end
+   end
+ end
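
cosine_similarity above computes dot(a, b) / (|a| * |b|), so parallel vectors score 1.0 and orthogonal ones 0.0. A quick check with the in-memory store (Base stores config but Memory never reads it, so nil suffices here):

    store = RubyLLM::SemanticCache::VectorStores::Memory.new(nil)
    store.add("exact", [1.0, 0.0])
    store.add("diagonal", [1.0, 1.0])

    store.search([1.0, 0.0], limit: 2)
    # => [{ id: "exact", similarity: 1.0 },
    #     { id: "diagonal", similarity: 0.7071... }] # cos 45 degrees = 1/sqrt(2)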