RubyGems - ragdoll - Versions diffs - 0.1.8 → 0.1.9 - Mend

ragdoll 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +201 -0
data/README.md +160 -31
data/Rakefile +0 -3
data/app/models/ragdoll/embedding.rb +74 -0
data/app/models/ragdoll/search.rb +165 -0
data/app/models/ragdoll/search_result.rb +121 -0
data/app/services/ragdoll/configuration_service.rb +3 -3
data/app/services/ragdoll/document_processor.rb +124 -1
data/app/services/ragdoll/embedding_service.rb +10 -0
data/app/services/ragdoll/search_engine.rb +64 -6
data/db/migrate/007_create_ragdoll_searches.rb +73 -0
data/db/migrate/008_create_ragdoll_search_results.rb +49 -0
data/lib/ragdoll/core/client.rb +75 -8
data/lib/ragdoll/core/model.rb +13 -0
data/lib/ragdoll/core/version.rb +1 -1
data/lib/ragdoll/core.rb +2 -0
data/lib/ragdoll.rb +17 -0
data/lib/tasks/db.rake +13 -13
metadata +371 -2

data/app/models/ragdoll/search.rb ADDED Viewed

@@ -0,0 +1,165 @@
+# frozen_string_literal: true
+require "active_record"
+require "neighbor"
+module Ragdoll
+  class Search < ActiveRecord::Base
+    self.table_name = "ragdoll_searches"
+    # Use pgvector for vector similarity search on query embeddings
+    has_neighbors :query_embedding
+    has_many :search_results, class_name: "Ragdoll::SearchResult", foreign_key: "search_id", dependent: :destroy
+    has_many :embeddings, through: :search_results
+    validates :query, presence: true
+    validates :query_embedding, presence: true
+    validates :search_type, presence: true, inclusion: { in: %w[semantic hybrid fulltext] }
+    validates :results_count, presence: true, numericality: { greater_than_or_equal_to: 0 }
+    scope :by_type, ->(type) { where(search_type: type) }
+    scope :by_session, ->(session_id) { where(session_id: session_id) }
+    scope :by_user, ->(user_id) { where(user_id: user_id) }
+    scope :recent, -> { order(created_at: :desc) }
+    scope :with_results, -> { where("results_count > 0") }
+    scope :popular, -> { where("results_count > 0").order(results_count: :desc) }
+    scope :slow_searches, ->(threshold_ms = 1000) { where("execution_time_ms > ?", threshold_ms) }
+    # Find searches with similar query embeddings
+    def self.find_similar(query_embedding, limit: 10, threshold: 0.8)
+      nearest_neighbors(:query_embedding, query_embedding, distance: "cosine")
+        .limit(limit * 2)
+        .map do |search|
+          similarity = 1.0 - search.neighbor_distance
+          next if similarity < threshold
+          search.define_singleton_method(:similarity_score) { similarity }
+          search
+        end
+        .compact
+        .sort_by(&:similarity_score)
+        .reverse
+        .take(limit)
+    end
+    # Calculate statistics for this search
+    def calculate_similarity_stats!
+      return unless search_results.any?
+      scores = search_results.pluck(:similarity_score)
+      update!(
+        max_similarity_score: scores.max,
+        min_similarity_score: scores.min,
+        avg_similarity_score: scores.sum.to_f / scores.length
+      )
+    end
+    # Get search results ordered by rank
+    def ranked_results
+      search_results.includes(:embedding).order(:result_rank)
+    end
+    # Get clicked results
+    def clicked_results
+      search_results.where(clicked: true).order(:clicked_at)
+    end
+    # Calculate click-through rate
+    def click_through_rate
+      return 0.0 if results_count == 0
+      clicked_count = search_results.where(clicked: true).count
+      clicked_count.to_f / results_count
+    end
+    # Record a search with its results
+    def self.record_search(query:, query_embedding:, results:, search_type: "semantic",
+                          filters: {}, options: {}, execution_time_ms: nil,
+                          session_id: nil, user_id: nil)
+      search = create!(
+        query: query,
+        query_embedding: query_embedding,
+        search_type: search_type,
+        results_count: results.length,
+        search_filters: filters,
+        search_options: options,
+        execution_time_ms: execution_time_ms,
+        session_id: session_id,
+        user_id: user_id
+      )
+      # Create search result records
+      results.each_with_index do |result, index|
+        search.search_results.create!(
+          embedding_id: result[:embedding_id],
+          similarity_score: result[:similarity],
+          result_rank: index + 1
+        )
+      end
+      # Calculate and store similarity statistics
+      search.calculate_similarity_stats!
+      search
+    end
+    # Search analytics methods
+    def self.search_analytics(days: 30)
+      start_date = days.days.ago
+      searches = where(created_at: start_date..)
+      {
+        total_searches: searches.count,
+        unique_queries: searches.distinct.count(:query),
+        avg_results_per_search: searches.average(:results_count)&.round(2),
+        avg_execution_time: searches.average(:execution_time_ms)&.round(2),
+        search_types: searches.group(:search_type).count,
+        searches_with_results: searches.where("results_count > 0").count,
+        avg_click_through_rate: calculate_avg_ctr(searches)
+      }
+    end
+    # Cleanup orphaned searches that have no remaining search results
+    def self.cleanup_orphaned_searches
+      orphaned_search_ids = where.not(id: SearchResult.distinct.pluck(:search_id))
+      orphaned_count = orphaned_search_ids.count
+      if orphaned_count > 0
+        orphaned_search_ids.destroy_all
+        Rails.logger.info "Cleaned up #{orphaned_count} orphaned search records" if defined?(Rails)
+      end
+      orphaned_count
+    end
+    # Cleanup searches older than specified days with no clicks
+    def self.cleanup_old_unused_searches(days: 30)
+      cutoff_date = days.days.ago
+      unused_searches = where(created_at: ...cutoff_date)
+                       .left_joins(:search_results)
+                       .where(search_results: { clicked: [nil, false] })
+      unused_count = unused_searches.count
+      if unused_count > 0
+        unused_searches.destroy_all
+        Rails.logger.info "Cleaned up #{unused_count} old unused search records" if defined?(Rails)
+      end
+      unused_count
+    end
+    private
+    def self.calculate_avg_ctr(searches)
+      search_ids = searches.pluck(:id)
+      return 0.0 if search_ids.empty?
+      total_results = SearchResult.where(search_id: search_ids).count
+      return 0.0 if total_results == 0
+      clicked_results = SearchResult.where(search_id: search_ids, clicked: true).count
+      (clicked_results.to_f / total_results * 100).round(2)
+    end
+  end
+end

data/app/models/ragdoll/search_result.rb ADDED Viewed

@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+require "active_record"
+module Ragdoll
+  class SearchResult < ActiveRecord::Base
+    self.table_name = "ragdoll_search_results"
+    belongs_to :search, class_name: "Ragdoll::Search"
+    belongs_to :embedding, class_name: "Ragdoll::Embedding"
+    validates :similarity_score, presence: true, numericality: { in: 0.0..1.0 }
+    validates :result_rank, presence: true, numericality: { greater_than: 0 }
+    validates :result_rank, uniqueness: { scope: :search_id }
+    scope :by_rank, -> { order(:result_rank) }
+    scope :clicked, -> { where(clicked: true) }
+    scope :unclicked, -> { where(clicked: false) }
+    scope :high_similarity, ->(threshold = 0.8) { where("similarity_score >= ?", threshold) }
+    scope :recent_clicks, -> { where(clicked: true).order(clicked_at: :desc) }
+    # Cleanup callback to remove searches when they have no results left
+    after_destroy :cleanup_empty_search
+    # Mark this result as clicked
+    def mark_as_clicked!
+      update!(clicked: true, clicked_at: Time.current)
+    end
+    # Get the content through the embedding relationship
+    def content
+      embedding&.content
+    end
+    # Get the document through the embedding relationship
+    def document
+      embedding&.embeddable&.document
+    end
+    # Get the document title
+    def document_title
+      document&.title
+    end
+    # Get the document location
+    def document_location
+      document&.location
+    end
+    # Analytics for search results
+    def self.analytics(days: 30)
+      start_date = days.days.ago
+      results = where(created_at: start_date..)
+      {
+        total_results: results.count,
+        clicked_results: results.where(clicked: true).count,
+        click_through_rate: calculate_ctr(results),
+        avg_similarity_score: results.average(:similarity_score)&.round(4),
+        high_similarity_results: results.where("similarity_score >= 0.8").count,
+        low_similarity_results: results.where("similarity_score < 0.5").count,
+        rank_performance: rank_click_analysis(results)
+      }
+    end
+    # Analyze click performance by result rank
+    def self.rank_click_analysis(results = nil)
+      results ||= all
+      results.group(:result_rank)
+             .group("clicked")
+             .count
+             .each_with_object({}) do |((rank, clicked), count), hash|
+        hash[rank] ||= { total: 0, clicked: 0 }
+        hash[rank][:total] += count
+        hash[rank][:clicked] += count if clicked
+      end
+             .transform_values do |stats|
+        stats.merge(
+          ctr: stats[:total] > 0 ? (stats[:clicked].to_f / stats[:total] * 100).round(2) : 0.0
+        )
+      end
+    end
+    # Find embeddings that perform well across multiple searches
+    def self.top_performing_embeddings(limit: 20)
+      joins(:embedding)
+        .group(:embedding_id)
+        .select(
+          "embedding_id",
+          "COUNT(*) as appearance_count",
+          "AVG(similarity_score) as avg_similarity",
+          "COUNT(CASE WHEN clicked THEN 1 END) as click_count",
+          "ROUND(COUNT(CASE WHEN clicked THEN 1 END) * 100.0 / COUNT(*), 2) as ctr"
+        )
+        .having("COUNT(*) > 1")
+        .order("avg_similarity DESC, ctr DESC")
+        .limit(limit)
+    end
+    private
+    def self.calculate_ctr(results)
+      total = results.count
+      return 0.0 if total == 0
+      clicked = results.where(clicked: true).count
+      (clicked.to_f / total * 100).round(2)
+    end
+    # Cleanup callback to remove parent search if it has no results left
+    def cleanup_empty_search
+      return unless search
+      # Check if this was the last result for the search
+      if search.search_results.count == 0
+        search.destroy
+      end
+    end
+  end
+end

data/app/services/ragdoll/configuration_service.rb CHANGED Viewed

@@ -20,10 +20,10 @@ module Ragdoll
         @config.embedding_model(content_type)
       when :summary, :keywords
         # Check for task-specific model, fall back to default
-        task_model = @config.models.text_generation[task_type]
-        task_model || @config.models.text_generation[:default]
+        task_model = @config.models[:text_generation][task_type]
+        task_model || @config.models[:text_generation][:default]
       else
-        @config.models.text_generation[:default]
+        @config.models[:text_generation][:default]
       end
     end

data/app/services/ragdoll/document_processor.rb CHANGED Viewed

@@ -3,6 +3,8 @@
 require "pdf-reader"
 require "docx"
 require "rmagick"
+require "yaml"
+require "date"
 # Image description service is auto-loaded from app/services
 module Ragdoll
@@ -137,6 +139,11 @@ module Ragdoll
         raise ParseError, "Unsupported PDF feature: #{e.message}"
       end
+      # Add filepath-based title as fallback if no title was found
+      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
+        metadata[:title] = extract_title_from_filepath
+      end
       {
         content: content.strip,
         metadata: metadata,
@@ -192,6 +199,11 @@ module Ragdoll
         raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
       end
+      # Add filepath-based title as fallback if no title was found
+      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
+        metadata[:title] = extract_title_from_filepath
+      end
       {
         content: content.strip,
         metadata: metadata,
@@ -212,6 +224,20 @@ module Ragdoll
                       else "text"
                       end
+      # Parse YAML front matter for markdown files
+      if document_type == "markdown" && content.start_with?("---\n")
+        front_matter, body_content = parse_yaml_front_matter(content)
+        if front_matter
+          metadata.merge!(front_matter)
+          content = body_content
+        end
+      end
+      # Add filepath-based title as fallback if no title was found
+      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
+        metadata[:title] = extract_title_from_filepath
+      end
       {
         content: content,
         metadata: metadata,
@@ -225,16 +251,41 @@ module Ragdoll
         encoding: "ISO-8859-1"
       }
+      # Try to parse front matter with different encoding too
+      if document_type == "markdown" && content.start_with?("---\n")
+        front_matter, body_content = parse_yaml_front_matter(content)
+        if front_matter
+          metadata.merge!(front_matter)
+          content = body_content
+        end
+      end
+      # Add filepath-based title as fallback if no title was found
+      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
+        metadata[:title] = extract_title_from_filepath
+      end
       {
         content: content,
         metadata: metadata,
-        document_type: "text"
+        document_type: document_type.nil? ? "text" : document_type
       }
     end
     def parse_html
       content = File.read(@file_path, encoding: "UTF-8")
+      # Extract title from H1 tag if present
+      h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
+      title = nil
+      if h1_match
+        # Clean up the H1 content by removing any HTML tags and normalizing whitespace
+        title = h1_match[1]
+                  .gsub(/<[^>]+>/, " ")  # Remove any nested HTML tags
+                  .gsub(/\s+/, " ")      # Normalize whitespace
+                  .strip
+      end
       # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
       clean_content = content
                       .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
@@ -248,6 +299,13 @@ module Ragdoll
         original_format: "html"
       }
+      # Add title to metadata if found, otherwise use filepath fallback
+      if title && !title.empty?
+        metadata[:title] = title
+      else
+        metadata[:title] = extract_title_from_filepath
+      end
       {
         content: clean_content,
         metadata: metadata,
@@ -286,6 +344,9 @@ module Ragdoll
       # Use AI-generated description or fallback placeholder
       content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
+      # Add filepath-based title as fallback
+      metadata[:title] = extract_title_from_filepath
       puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
       {
@@ -338,5 +399,67 @@ module Ragdoll
       else "application/octet-stream"
       end
     end
+    private
+    # Extract a meaningful title from the file path as a fallback
+    # @param file_path [String] the full file path
+    # @return [String] a cleaned title derived from the filename
+    def extract_title_from_filepath(file_path = @file_path)
+      filename = File.basename(file_path, File.extname(file_path))
+      # Clean up common patterns in filenames to make them more readable
+      title = filename
+               .gsub(/[-_]+/, ' ')           # Replace hyphens and underscores with spaces
+               .gsub(/([a-z])([A-Z])/, '\1 \2') # Add space before capital letters (camelCase)
+               .gsub(/\s+/, ' ')             # Normalize multiple spaces
+               .strip
+      # Capitalize words for better readability
+      title.split(' ').map(&:capitalize).join(' ')
+    end
+    # Parse YAML front matter from markdown content
+    # @param content [String] the full content of the markdown file
+    # @return [Array] returns [front_matter_hash, body_content] or [nil, original_content]
+    def parse_yaml_front_matter(content)
+      # Check if content starts with YAML front matter delimiter
+      return [nil, content] unless content.start_with?("---\n")
+      # Find the closing delimiter
+      lines = content.lines
+      closing_index = nil
+      lines.each_with_index do |line, index|
+        next if index == 0 # Skip the opening ---
+        if line.strip == "---"
+          closing_index = index
+          break
+        end
+      end
+      # No closing delimiter found
+      return [nil, content] unless closing_index
+      # Extract YAML content and body
+      yaml_lines = lines[1...closing_index]
+      body_lines = lines[(closing_index + 1)..-1]
+      yaml_content = yaml_lines.join
+      body_content = body_lines&.join || ""
+      # Parse YAML
+      begin
+        # Allow Time objects for date fields in YAML front matter
+        front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
+        # Convert string keys to symbols for consistency
+        front_matter = front_matter.transform_keys(&:to_sym) if front_matter.is_a?(Hash)
+        [front_matter, body_content.strip]
+      rescue YAML::SyntaxError, Psych::DisallowedClass => e
+        # If YAML parsing fails, return original content
+        Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
+        [nil, content]
+      end
+    end
   end
 end

data/app/services/ragdoll/embedding_service.rb CHANGED Viewed

@@ -38,6 +38,11 @@ module Ragdoll
           embedding_config = @model_resolver.resolve_embedding(:text)
           # Use just the model name for RubyLLM
           model = embedding_config.model.model
+          # If model is nil or empty, use fallback
+          if model.nil? || model.empty?
+            return generate_fallback_embedding
+          end
           begin
             response = RubyLLM.embed(cleaned_text, model: model)
@@ -93,6 +98,11 @@ module Ragdoll
           embedding_config = @model_resolver.resolve_embedding(:text)
           # Use just the model name for RubyLLM
           model = embedding_config.model.model
+          # If model is nil or empty, use fallback
+          if model.nil? || model.empty?
+            return cleaned_texts.map { generate_fallback_embedding }
+          end
           cleaned_texts.map do |text|
             response = RubyLLM.embed(text, model: model)

data/app/services/ragdoll/search_engine.rb CHANGED Viewed

@@ -27,25 +27,83 @@ module Ragdoll
     end
     def search_similar_content(query_or_embedding, options = {})
+      start_time = Time.current
       search_config = @config_service.search_config
       limit = options[:limit] || search_config[:max_results]
       threshold = options[:threshold] || search_config[:similarity_threshold]
       filters = options[:filters] || {}
+      # Extract tracking options
+      session_id = options[:session_id]
+      user_id = options[:user_id]
+      track_search = options.fetch(:track_search, true)
       if query_or_embedding.is_a?(Array)
         # It's already an embedding
         query_embedding = query_or_embedding
+        query_string = options[:query] # Should be provided when passing embedding directly
       else
         # It's a query string, generate embedding
-        query_embedding = @embedding_service.generate_embedding(query_or_embedding)
+        query_string = query_or_embedding
+        query_embedding = @embedding_service.generate_embedding(query_string)
         return [] if query_embedding.nil?
       end
-      # Search using ActiveRecord models
-      Ragdoll::Embedding.search_similar(query_embedding,
-                                       limit: limit,
-                                       threshold: threshold,
-                                       filters: filters)
+      # Search using ActiveRecord models with statistics
+      # Try enhanced search first, fall back to original if it fails
+      begin
+        search_response = Ragdoll::Embedding.search_similar_with_stats(query_embedding,
+                                                                      limit: limit,
+                                                                      threshold: threshold,
+                                                                      filters: filters)
+        results = search_response[:results]
+        statistics = search_response[:statistics]
+      rescue NoMethodError, PG::SyntaxError => e
+        # Fall back to original search method if enhanced version fails
+        puts "Warning: Enhanced search failed (#{e.message}), using fallback" if ENV["RAGDOLL_DEBUG"]
+        results = Ragdoll::Embedding.search_similar(query_embedding,
+                                                   limit: limit,
+                                                   threshold: threshold,
+                                                   filters: filters)
+        statistics = nil
+      end
+      execution_time = ((Time.current - start_time) * 1000).round
+      # Record search if tracking enabled and we have a query string
+      if track_search && query_string && !query_string.empty?
+        begin
+          # Format results for search recording
+          search_results = results.map do |result|
+            {
+              embedding_id: result[:embedding_id] || result[:id],
+              similarity: result[:similarity] || result[:similarity_score] || 0.0
+            }
+          end
+          Ragdoll::Search.record_search(
+            query: query_string,
+            query_embedding: query_embedding,
+            results: search_results,
+            search_type: "semantic",
+            filters: filters,
+            options: { limit: limit, threshold: threshold },
+            execution_time_ms: execution_time,
+            session_id: session_id,
+            user_id: user_id
+          )
+        rescue => e
+          # Log error but don't fail the search
+          puts "Warning: Search tracking failed: #{e.message}" if ENV["RAGDOLL_DEBUG"]
+        end
+      end
+      # Return results with statistics for better user feedback
+      {
+        results: results,
+        statistics: statistics,
+        execution_time_ms: execution_time
+      }
     end
   end
 end

data/db/migrate/007_create_ragdoll_searches.rb ADDED Viewed

@@ -0,0 +1,73 @@
+class CreateRagdollSearches < ActiveRecord::Migration[7.0]
+  def change
+    create_table :ragdoll_searches,
+      comment: "Search queries and results tracking with vector similarity support" do |t|
+      t.text :query, null: false,
+        comment: "Original search query text"
+      t.vector :query_embedding, limit: 1536, null: false,
+        comment: "Vector embedding of the search query for similarity matching"
+      t.string :search_type, null: false, default: "semantic",
+        comment: "Type of search performed (semantic, hybrid, fulltext)"
+      t.integer :results_count, null: false, default: 0,
+        comment: "Number of results returned for this search"
+      t.float :max_similarity_score,
+        comment: "Highest similarity score from results"
+      t.float :min_similarity_score,
+        comment: "Lowest similarity score from results"
+      t.float :avg_similarity_score,
+        comment: "Average similarity score of results"
+      t.json :search_filters, default: {},
+        comment: "Filters applied during search (document_type, date_range, etc.)"
+      t.json :search_options, default: {},
+        comment: "Search configuration options (threshold, limit, etc.)"
+      t.integer :execution_time_ms,
+        comment: "Search execution time in milliseconds"
+      t.string :session_id,
+        comment: "User session identifier for grouping related searches"
+      t.string :user_id,
+        comment: "User identifier if authentication is available"
+      t.timestamps null: false,
+        comment: "Standard creation and update timestamps"
+      ###########
+      # Indexes #
+      ###########
+      t.index :query_embedding, using: :ivfflat, opclass: :vector_cosine_ops,
+        name: "index_ragdoll_searches_on_query_embedding_cosine",
+        comment: "IVFFlat index for finding similar search queries"
+      t.index :search_type,
+        comment: "Index for filtering by search type"
+      t.index :session_id,
+        comment: "Index for grouping searches by session"
+      t.index :user_id,
+        comment: "Index for filtering searches by user"
+      t.index :created_at,
+        comment: "Index for chronological search history"
+      t.index :results_count,
+        comment: "Index for analyzing search effectiveness"
+      t.index "to_tsvector('english', query)", using: :gin,
+        name: "index_ragdoll_searches_on_fulltext_query",
+        comment: "Full-text search index for finding searches by query text"
+    end
+  end
+end