RubyGems - ragdoll - Versions diffs - 0.1.8 → 0.1.10 - Mend

ragdoll 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +243 -0
data/README.md +209 -31
data/Rakefile +4 -5
data/app/models/ragdoll/document.rb +115 -12
data/app/models/ragdoll/embedding.rb +108 -2
data/app/models/ragdoll/search.rb +165 -0
data/app/models/ragdoll/search_result.rb +121 -0
data/app/services/ragdoll/configuration_service.rb +3 -3
data/app/services/ragdoll/document_processor.rb +124 -1
data/app/services/ragdoll/embedding_service.rb +10 -0
data/app/services/ragdoll/search_engine.rb +75 -6
data/db/migrate/{001_enable_postgresql_extensions.rb → 20250815234901_enable_postgresql_extensions.rb} +7 -8
data/db/migrate/20250815234902_create_ragdoll_documents.rb +117 -0
data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb} +13 -10
data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb} +14 -11
data/db/migrate/20250815234905_create_ragdoll_searches.rb +77 -0
data/db/migrate/20250815234906_create_ragdoll_search_results.rb +49 -0
data/lib/ragdoll/core/client.rb +75 -8
data/lib/ragdoll/core/database.rb +8 -3
data/lib/ragdoll/core/model.rb +13 -0
data/lib/ragdoll/core/version.rb +1 -1
data/lib/ragdoll/core.rb +2 -0
data/lib/ragdoll.rb +17 -0
data/lib/tasks/db.rake +75 -27
metadata +375 -6
data/db/migrate/004_create_ragdoll_documents.rb +0 -70

data/app/models/ragdoll/document.rb CHANGED Viewed

@@ -142,10 +142,12 @@ module Ragdoll
     def keywords_array
       return [] unless keywords.present?
+      # After migration, keywords is now a PostgreSQL array
       case keywords
       when Array
-        keywords
+        keywords.map(&:to_s).map(&:strip).reject(&:empty?)
       when String
+        # Fallback for any remaining string data (shouldn't happen after migration)
         keywords.split(",").map(&:strip).reject(&:empty?)
       else
         []
@@ -153,17 +155,23 @@ module Ragdoll
     end
     def add_keyword(keyword)
+      # Adds +keyword+ to this document's keywords unless it is blank or an
+      # equal keyword (ignoring case) is already present. The new keyword is
+      # stored stripped and downcased. Only the attribute is assigned here;
+      # this method does not save the record.
+      return if keyword.blank?
       current_keywords = keywords_array
-      return if current_keywords.include?(keyword.strip)
+      normalized_keyword = keyword.to_s.strip.downcase
+      # Existing entries are compared downcased but are not rewritten
+      return if current_keywords.map(&:downcase).include?(normalized_keyword)
-      current_keywords << keyword.strip
-      self.keywords = current_keywords.join(", ")
+      current_keywords << normalized_keyword
+      # keywords is assigned as an Array (no longer a comma-joined String)
+      self.keywords = current_keywords
     end
     def remove_keyword(keyword)
+      # Removes every stored keyword that equals +keyword+ once the argument is
+      # stripped and both sides are downcased. Blank input is a no-op. Only the
+      # attribute is assigned here; this method does not save the record.
+      return if keyword.blank?
       current_keywords = keywords_array
-      current_keywords.delete(keyword.strip)
-      self.keywords = current_keywords.join(", ")
+      normalized_keyword = keyword.to_s.strip.downcase
+      current_keywords.reject! { |k| k.downcase == normalized_keyword }
+      # keywords is assigned as an Array (no longer a comma-joined String)
+      self.keywords = current_keywords
     end
     # Metadata accessors for common fields
@@ -249,15 +257,110 @@ module Ragdoll
       puts "Metadata generation failed: #{e.message}"
     end
-    # PostgreSQL full-text search on metadata fields
+    # PostgreSQL full-text search on metadata fields with per-word match-ratio [0.0..1.0]
     def self.search_content(query, **options)
       return none if query.blank?
-      # Use PostgreSQL's built-in full-text search across metadata fields
-      where(
-        "to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(metadata->>'summary', '') || ' ' || COALESCE(metadata->>'keywords', '') || ' ' || COALESCE(metadata->>'description', '')) @@ plainto_tsquery('english', ?)",
-        query
-      ).limit(options[:limit] || 20)
+      # Split into unique alphanumeric words
+      words = query.downcase.scan(/[[:alnum:]]+/).uniq
+      return none if words.empty?
+      limit = options[:limit] || 20
+      threshold = options[:threshold] || 0.0
+      # Use precomputed tsvector column if it exists, otherwise build on the fly
+      if column_names.include?("search_vector")
+        tsvector = "#{table_name}.search_vector"
+      else
+        # Build tsvector from title and metadata fields
+        text_expr =
+          "COALESCE(title, '') || ' ' || " \
+          "COALESCE(metadata->>'summary', '') || ' ' || " \
+          "COALESCE(metadata->>'keywords', '') || ' ' || " \
+          "COALESCE(metadata->>'description', '')"
+        tsvector = "to_tsvector('english', #{text_expr})"
+      end
+      # Prepare sanitized tsquery terms
+      tsqueries = words.map do |word|
+        sanitize_sql_array(["plainto_tsquery('english', ?)", word])
+      end
+      # Combine per-word tsqueries with OR so PostgreSQL can use the GIN index
+      combined_tsquery = tsqueries.join(' || ')
+      # Score each match (1 if present, 0 if not), sum them
+      score_terms = tsqueries.map { |tsq| "(#{tsvector} @@ #{tsq})::int" }
+      score_sum   = score_terms.join(' + ')
+      # Similarity ratio: fraction of query words present
+      similarity_sql = "(#{score_sum})::float / #{words.size}"
+      # Start with basic search query
+      query = select("#{table_name}.*, #{similarity_sql} AS fulltext_similarity")
+      # Build where conditions
+      conditions = ["#{tsvector} @@ (#{combined_tsquery})"]
+      # Add status filter (default to processed unless overridden)
+      status = options[:status] || 'processed'
+      conditions << "#{table_name}.status = '#{status}'"
+      # Add document type filter if specified
+      if options[:document_type].present?
+        conditions << sanitize_sql_array(["#{table_name}.document_type = ?", options[:document_type]])
+      end
+      # Add threshold filtering if specified
+      if threshold > 0.0
+        conditions << "#{similarity_sql} >= #{threshold}"
+      end
+      # Combine all conditions
+      where_clause = conditions.join(' AND ')
+      # Materialize to array to avoid COUNT/SELECT alias conflicts in some AR versions
+      query.where(where_clause)
+        .order(Arel.sql("fulltext_similarity DESC, updated_at DESC"))
+        .limit(limit)
+        .to_a
+    end
+    # Search documents by keywords using PostgreSQL array operations
+    # Returns documents that match keywords with scoring based on match count
+    # Inspired by find_matching_entries.rb algorithm but optimized for PostgreSQL arrays
+    def self.search_by_keywords(keywords_array, **options)
+      return where("1 = 0") if keywords_array.blank?
+      # Normalize keywords to lowercase strings array
+      normalized_keywords = Array(keywords_array).map(&:to_s).map(&:downcase).reject(&:empty?)
+      return where("1 = 0") if normalized_keywords.empty?
+      limit = options[:limit] || 20
+      # Use PostgreSQL array overlap operator with proper array literal
+      quoted_keywords = normalized_keywords.map { |k| "\"#{k}\"" }.join(',')
+      array_literal = "'{#{quoted_keywords}}'::text[]"
+      where("keywords && #{array_literal}")
+        .order("created_at DESC")
+        .limit(limit)
+    end
+    # Find documents that contain ALL specified keywords (exact array matching)
+    def self.search_by_keywords_all(keywords_array, **options)
+      return where("1 = 0") if keywords_array.blank?
+      normalized_keywords = Array(keywords_array).map(&:to_s).map(&:downcase).reject(&:empty?)
+      return where("1 = 0") if normalized_keywords.empty?
+      limit = options[:limit] || 20
+      # Use PostgreSQL array contains operator with proper array literal
+      quoted_keywords = normalized_keywords.map { |k| "\"#{k}\"" }.join(',')
+      array_literal = "'{#{quoted_keywords}}'::text[]"
+      where("keywords @> #{array_literal}")
+        .order("created_at DESC")
+        .limit(limit)
     end
     # Faceted search by metadata fields

data/app/models/ragdoll/embedding.rb CHANGED Viewed

@@ -11,6 +11,8 @@ module Ragdoll
     has_neighbors :embedding_vector
     belongs_to :embeddable, polymorphic: true
+    # Search tracking: SearchResult rows that reference this embedding are
+    # destroyed together with it (dependent: :destroy); the searches they
+    # belong to are reachable through those rows.
+    has_many :search_results, class_name: "Ragdoll::SearchResult", dependent: :destroy
+    has_many :searches, through: :search_results
     validates :embeddable_id,    presence: true
     validates :embeddable_type,  presence: true
@@ -62,16 +64,66 @@ module Ragdoll
       scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]
       # Document-level filters require joining through embeddable (STI Content) to documents
-      if filters[:document_type]
+      needs_document_join = filters[:document_type] || filters[:keywords]
+      if needs_document_join
         scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
                      .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
-                     .where("ragdoll_documents.document_type = ?", filters[:document_type])
+      end
+      if filters[:document_type]
+        scope = scope.where("ragdoll_documents.document_type = ?", filters[:document_type])
+      end
+      # Keywords filtering using PostgreSQL array operations
+      if filters[:keywords] && filters[:keywords].any?
+        normalized_keywords = Array(filters[:keywords]).map(&:to_s).map(&:downcase).reject(&:empty?)
+        if normalized_keywords.any?
+          # Use PostgreSQL array overlap operator with proper array literal
+          quoted_keywords = normalized_keywords.map { |k| "\"#{k}\"" }.join(',')
+          array_literal = "'{#{quoted_keywords}}'::text[]"
+          scope = scope.where("ragdoll_documents.keywords && #{array_literal}")
+        end
       end
       # Use pgvector for similarity search
       search_with_pgvector(query_embedding, scope, limit, threshold)
     end
+    # Enhanced search that returns both results and similarity statistics
+    def self.search_similar_with_stats(query_embedding, limit: 20, threshold: 0.8, filters: {})
+      # Apply filters
+      scope = all
+      scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
+      scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
+      scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]
+      # Document-level filters require joining through embeddable (STI Content) to documents
+      needs_document_join = filters[:document_type] || filters[:keywords]
+      if needs_document_join
+        scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
+                     .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
+      end
+      if filters[:document_type]
+        scope = scope.where("ragdoll_documents.document_type = ?", filters[:document_type])
+      end
+      # Keywords filtering using PostgreSQL array operations
+      if filters[:keywords] && filters[:keywords].any?
+        normalized_keywords = Array(filters[:keywords]).map(&:to_s).map(&:downcase).reject(&:empty?)
+        if normalized_keywords.any?
+          # Use PostgreSQL array overlap operator with proper array literal
+          quoted_keywords = normalized_keywords.map { |k| "\"#{k}\"" }.join(',')
+          array_literal = "'{#{quoted_keywords}}'::text[]"
+          scope = scope.where("ragdoll_documents.keywords && #{array_literal}")
+        end
+      end
+      search_with_pgvector_stats(query_embedding, scope, limit, threshold)
+    end
     # Fast search using pgvector with neighbor gem
     def self.search_with_pgvector(query_embedding, scope, limit, threshold)
       # Use pgvector for similarity search
@@ -103,6 +155,60 @@ module Ragdoll
       results
     end
+    # Enhanced search with statistics
+    # Runs a cosine nearest-neighbor query on +scope+, converts each distance to
+    # a similarity (1.0 - distance), keeps candidates at or above +threshold+,
+    # ranks them by similarity plus usage score, and returns the top +limit+
+    # together with statistics over every candidate that was fetched.
+    #
+    # @param query_embedding query vector passed to nearest_neighbors
+    # @param scope relation to search within
+    # @param limit [Integer] maximum number of results returned
+    # @param threshold [Numeric] minimum similarity for a candidate to be kept
+    # @return [Hash] { results: Array, statistics: Hash }
+    def self.search_with_pgvector_stats(query_embedding, scope, limit, threshold)
+      # Use pgvector for similarity search - get more results to analyze
+      # Note: We convert to array immediately to avoid SQL conflicts with count operations
+      neighbor_results = scope
+                         .includes(:embeddable)
+                         .nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
+                         .limit([limit * 3, 50].max) # Get enough for statistics
+                         .to_a # Convert to array to avoid SQL conflicts
+      results = []
+      all_similarities = []
+      # NOTE(review): these seeds are reported unchanged when no neighbors are
+      # found, i.e. highest 0.0 and lowest 1.0 - confirm that callers expect this.
+      highest_similarity = 0.0
+      lowest_similarity = 1.0
+      total_checked = neighbor_results.length
+      neighbor_results.each do |embedding|
+        # Calculate cosine similarity (neighbor returns distance, we want similarity)
+        similarity = 1.0 - embedding.neighbor_distance
+        # Statistics include every candidate, also those rejected by the threshold below
+        all_similarities << similarity
+        highest_similarity = similarity if similarity > highest_similarity
+        lowest_similarity = similarity if similarity < lowest_similarity
+        next if similarity < threshold
+        usage_score = calculate_usage_score(embedding)
+        combined_score = similarity + usage_score
+        # highest_similarity here is the running maximum seen so far in this loop
+        results << build_result_hash(embedding, query_embedding, similarity, highest_similarity,
+                                     usage_score, combined_score)
+      end
+      # Sort by combined score and limit
+      results = results.sort_by { |r| -r[:combined_score] }.take(limit)
+      # presumably records usage for the returned embeddings - helper not visible here, confirm
+      mark_embeddings_as_used(results)
+      # Calculate statistics
+      stats = {
+        total_embeddings_checked: total_checked,
+        threshold_used: threshold,
+        highest_similarity: highest_similarity,
+        lowest_similarity: lowest_similarity,
+        average_similarity: all_similarities.empty? ? 0.0 : (all_similarities.sum / all_similarities.length),
+        similarities_above_threshold: all_similarities.count { |s| s >= threshold },
+        total_similarities_calculated: all_similarities.length
+      }
+      {
+        results: results,
+        statistics: stats
+      }
+    end
     private
     # Calculate usage score for ranking

data/app/models/ragdoll/search.rb ADDED Viewed

@@ -0,0 +1,165 @@
+# frozen_string_literal: true
+require "active_record"
+require "neighbor"
+module Ragdoll
+  class Search < ActiveRecord::Base
+    self.table_name = "ragdoll_searches"
+    # Use pgvector for vector similarity search on query embeddings
+    has_neighbors :query_embedding
+    has_many :search_results, class_name: "Ragdoll::SearchResult", foreign_key: "search_id", dependent: :destroy
+    has_many :embeddings, through: :search_results
+    validates :query, presence: true
+    validates :query_embedding, presence: true
+    validates :search_type, presence: true, inclusion: { in: %w[semantic hybrid fulltext] }
+    validates :results_count, presence: true, numericality: { greater_than_or_equal_to: 0 }
+    scope :by_type, ->(type) { where(search_type: type) }
+    scope :by_session, ->(session_id) { where(session_id: session_id) }
+    scope :by_user, ->(user_id) { where(user_id: user_id) }
+    scope :recent, -> { order(created_at: :desc) }
+    scope :with_results, -> { where("results_count > 0") }
+    scope :popular, -> { where("results_count > 0").order(results_count: :desc) }
+    scope :slow_searches, ->(threshold_ms = 1000) { where("execution_time_ms > ?", threshold_ms) }
+    # Find searches with similar query embeddings
+    def self.find_similar(query_embedding, limit: 10, threshold: 0.8)
+      nearest_neighbors(:query_embedding, query_embedding, distance: "cosine")
+        .limit(limit * 2)
+        .map do |search|
+          similarity = 1.0 - search.neighbor_distance
+          next if similarity < threshold
+          search.define_singleton_method(:similarity_score) { similarity }
+          search
+        end
+        .compact
+        .sort_by(&:similarity_score)
+        .reverse
+        .take(limit)
+    end
+    # Calculate statistics for this search
+    def calculate_similarity_stats!
+      return unless search_results.any?
+      scores = search_results.pluck(:similarity_score)
+      update!(
+        max_similarity_score: scores.max,
+        min_similarity_score: scores.min,
+        avg_similarity_score: scores.sum.to_f / scores.length
+      )
+    end
+    # Get search results ordered by rank
+    def ranked_results
+      search_results.includes(:embedding).order(:result_rank)
+    end
+    # Get clicked results
+    def clicked_results
+      search_results.where(clicked: true).order(:clicked_at)
+    end
+    # Calculate click-through rate
+    def click_through_rate
+      return 0.0 if results_count == 0
+      clicked_count = search_results.where(clicked: true).count
+      clicked_count.to_f / results_count
+    end
+    # Record a search with its results
+    def self.record_search(query:, query_embedding:, results:, search_type: "semantic",
+                          filters: {}, options: {}, execution_time_ms: nil,
+                          session_id: nil, user_id: nil)
+      search = create!(
+        query: query,
+        query_embedding: query_embedding,
+        search_type: search_type,
+        results_count: results.length,
+        search_filters: filters,
+        search_options: options,
+        execution_time_ms: execution_time_ms,
+        session_id: session_id,
+        user_id: user_id
+      )
+      # Create search result records
+      results.each_with_index do |result, index|
+        search.search_results.create!(
+          embedding_id: result[:embedding_id],
+          similarity_score: result[:similarity],
+          result_rank: index + 1
+        )
+      end
+      # Calculate and store similarity statistics
+      search.calculate_similarity_stats!
+      search
+    end
+    # Search analytics methods
+    def self.search_analytics(days: 30)
+      start_date = days.days.ago
+      searches = where(created_at: start_date..)
+      {
+        total_searches: searches.count,
+        unique_queries: searches.distinct.count(:query),
+        avg_results_per_search: searches.average(:results_count)&.round(2),
+        avg_execution_time: searches.average(:execution_time_ms)&.round(2),
+        search_types: searches.group(:search_type).count,
+        searches_with_results: searches.where("results_count > 0").count,
+        avg_click_through_rate: calculate_avg_ctr(searches)
+      }
+    end
+    # Cleanup orphaned searches that have no remaining search results
+    def self.cleanup_orphaned_searches
+      orphaned_search_ids = where.not(id: SearchResult.distinct.pluck(:search_id))
+      orphaned_count = orphaned_search_ids.count
+      if orphaned_count > 0
+        orphaned_search_ids.destroy_all
+        Rails.logger.info "Cleaned up #{orphaned_count} orphaned search records" if defined?(Rails)
+      end
+      orphaned_count
+    end
+    # Cleanup searches older than specified days with no clicks
+    def self.cleanup_old_unused_searches(days: 30)
+      cutoff_date = days.days.ago
+      unused_searches = where(created_at: ...cutoff_date)
+                       .left_joins(:search_results)
+                       .where(search_results: { clicked: [nil, false] })
+      unused_count = unused_searches.count
+      if unused_count > 0
+        unused_searches.destroy_all
+        Rails.logger.info "Cleaned up #{unused_count} old unused search records" if defined?(Rails)
+      end
+      unused_count
+    end
+    private
+    def self.calculate_avg_ctr(searches)
+      search_ids = searches.pluck(:id)
+      return 0.0 if search_ids.empty?
+      total_results = SearchResult.where(search_id: search_ids).count
+      return 0.0 if total_results == 0
+      clicked_results = SearchResult.where(search_id: search_ids, clicked: true).count
+      (clicked_results.to_f / total_results * 100).round(2)
+    end
+  end
+end

data/app/models/ragdoll/search_result.rb ADDED Viewed

@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+require "active_record"
+module Ragdoll
+  class SearchResult < ActiveRecord::Base
+    self.table_name = "ragdoll_search_results"
+    belongs_to :search, class_name: "Ragdoll::Search"
+    belongs_to :embedding, class_name: "Ragdoll::Embedding"
+    validates :similarity_score, presence: true, numericality: { in: 0.0..1.0 }
+    validates :result_rank, presence: true, numericality: { greater_than: 0 }
+    validates :result_rank, uniqueness: { scope: :search_id }
+    scope :by_rank, -> { order(:result_rank) }
+    scope :clicked, -> { where(clicked: true) }
+    scope :unclicked, -> { where(clicked: false) }
+    scope :high_similarity, ->(threshold = 0.8) { where("similarity_score >= ?", threshold) }
+    scope :recent_clicks, -> { where(clicked: true).order(clicked_at: :desc) }
+    # Cleanup callback to remove searches when they have no results left
+    after_destroy :cleanup_empty_search
+    # Mark this result as clicked
+    def mark_as_clicked!
+      update!(clicked: true, clicked_at: Time.current)
+    end
+    # Get the content through the embedding relationship
+    def content
+      embedding&.content
+    end
+    # Get the document through the embedding relationship
+    def document
+      embedding&.embeddable&.document
+    end
+    # Get the document title
+    def document_title
+      document&.title
+    end
+    # Get the document location
+    def document_location
+      document&.location
+    end
+    # Analytics for search results
+    def self.analytics(days: 30)
+      start_date = days.days.ago
+      results = where(created_at: start_date..)
+      {
+        total_results: results.count,
+        clicked_results: results.where(clicked: true).count,
+        click_through_rate: calculate_ctr(results),
+        avg_similarity_score: results.average(:similarity_score)&.round(4),
+        high_similarity_results: results.where("similarity_score >= 0.8").count,
+        low_similarity_results: results.where("similarity_score < 0.5").count,
+        rank_performance: rank_click_analysis(results)
+      }
+    end
+    # Analyze click performance by result rank
+    def self.rank_click_analysis(results = nil)
+      results ||= all
+      results.group(:result_rank)
+             .group("clicked")
+             .count
+             .each_with_object({}) do |((rank, clicked), count), hash|
+        hash[rank] ||= { total: 0, clicked: 0 }
+        hash[rank][:total] += count
+        hash[rank][:clicked] += count if clicked
+      end
+             .transform_values do |stats|
+        stats.merge(
+          ctr: stats[:total] > 0 ? (stats[:clicked].to_f / stats[:total] * 100).round(2) : 0.0
+        )
+      end
+    end
+    # Find embeddings that perform well across multiple searches
+    def self.top_performing_embeddings(limit: 20)
+      joins(:embedding)
+        .group(:embedding_id)
+        .select(
+          "embedding_id",
+          "COUNT(*) as appearance_count",
+          "AVG(similarity_score) as avg_similarity",
+          "COUNT(CASE WHEN clicked THEN 1 END) as click_count",
+          "ROUND(COUNT(CASE WHEN clicked THEN 1 END) * 100.0 / COUNT(*), 2) as ctr"
+        )
+        .having("COUNT(*) > 1")
+        .order("avg_similarity DESC, ctr DESC")
+        .limit(limit)
+    end
+    private
+    def self.calculate_ctr(results)
+      total = results.count
+      return 0.0 if total == 0
+      clicked = results.where(clicked: true).count
+      (clicked.to_f / total * 100).round(2)
+    end
+    # Cleanup callback to remove parent search if it has no results left
+    def cleanup_empty_search
+      return unless search
+      # Check if this was the last result for the search
+      if search.search_results.count == 0
+        search.destroy
+      end
+    end
+  end
+end

data/app/services/ragdoll/configuration_service.rb CHANGED Viewed

@@ -20,10 +20,10 @@ module Ragdoll
         @config.embedding_model(content_type)
       when :summary, :keywords
         # Check for task-specific model, fall back to default
-        task_model = @config.models.text_generation[task_type]
-        task_model || @config.models.text_generation[:default]
+        task_model = @config.models[:text_generation][task_type]
+        task_model || @config.models[:text_generation][:default]
       else
-        @config.models.text_generation[:default]
+        @config.models[:text_generation][:default]
       end
     end