RubyGems - ragdoll - Versions diffs - 0.1.9 → 0.1.11 - Mend

ragdoll 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

checksums.yaml +4 -4
data/CHANGELOG.md +68 -4
data/README.md +86 -1
data/Rakefile +4 -2
data/app/models/ragdoll/document.rb +115 -12
data/app/models/ragdoll/embedding.rb +36 -4
data/app/models/ragdoll/search.rb +1 -1
data/app/services/ragdoll/document_management.rb +117 -9
data/app/services/ragdoll/document_processor.rb +67 -31
data/app/services/ragdoll/search_engine.rb +13 -2
data/db/migrate/{001_enable_postgresql_extensions.rb → 20250815234901_enable_postgresql_extensions.rb} +7 -8
data/db/migrate/20250815234902_create_ragdoll_documents.rb +117 -0
data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb} +13 -10
data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb} +14 -11
data/db/migrate/{007_create_ragdoll_searches.rb → 20250815234905_create_ragdoll_searches.rb} +24 -20
data/db/migrate/{008_create_ragdoll_search_results.rb → 20250815234906_create_ragdoll_search_results.rb} +16 -16
data/lib/ragdoll/core/client.rb +2 -2
data/lib/ragdoll/core/database.rb +8 -3
data/lib/ragdoll/core/version.rb +1 -1
data/lib/tasks/db.rake +63 -15
metadata +7 -7
data/db/migrate/004_create_ragdoll_documents.rb +0 -70

data/app/services/ragdoll/document_processor.rb CHANGED Viewed

@@ -99,8 +99,6 @@ module Ragdoll
       else
         parse_text # Default to text parsing for unknown formats
       end
-    rescue StandardError => e # StandardError => e
-      raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
     end
     private
@@ -109,6 +107,12 @@ module Ragdoll
       content = ""
       metadata = {}
+      # Add file-based metadata for duplicate detection
+      if File.exist?(@file_path)
+        metadata[:file_size] = File.size(@file_path)
+        metadata[:file_hash] = calculate_file_hash(@file_path)
+      end
       begin
         PDF::Reader.open(@file_path) do |reader|
           # Extract metadata
@@ -144,6 +148,10 @@ module Ragdoll
         metadata[:title] = extract_title_from_filepath
       end
+      # Add content hash for duplicate detection
+      # Ensure content is UTF-8 encoded before checking presence
+      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
       {
         content: content.strip,
         metadata: metadata,
@@ -155,6 +163,12 @@ module Ragdoll
       content = ""
       metadata = {}
+      # Add file-based metadata for duplicate detection
+      if File.exist?(@file_path)
+        metadata[:file_size] = File.size(@file_path)
+        metadata[:file_hash] = calculate_file_hash(@file_path)
+      end
       begin
         doc = Docx::Document.open(@file_path)
@@ -204,6 +218,10 @@ module Ragdoll
         metadata[:title] = extract_title_from_filepath
       end
+      # Add content hash for duplicate detection
+      # Ensure content is UTF-8 encoded before checking presence
+      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
       {
         content: content.strip,
         metadata: metadata,
@@ -212,46 +230,31 @@ module Ragdoll
     end
     def parse_text
-      content = File.read(@file_path, encoding: "UTF-8")
-      metadata = {
-        file_size: File.size(@file_path),
-        encoding: "UTF-8"
-      }
+      # Determine document type first (before any IO operations)
       document_type = case @file_extension
                       when ".md", ".markdown" then "markdown"
                       when ".txt" then "text"
                       else "text"
                       end
-      # Parse YAML front matter for markdown files
-      if document_type == "markdown" && content.start_with?("---\n")
-        front_matter, body_content = parse_yaml_front_matter(content)
-        if front_matter
-          metadata.merge!(front_matter)
-          content = body_content
-        end
-      end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
+      begin
+        content = File.read(@file_path, encoding: "UTF-8")
+        encoding = "UTF-8"
+      rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
+        # Try with different encoding - read as ISO-8859-1 and force encoding to UTF-8
+        content = File.read(@file_path, encoding: "ISO-8859-1").encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
+        encoding = "ISO-8859-1"
+      rescue Errno::ENOENT, Errno::EACCES => e
+        raise ParseError, "Failed to read file #{@file_path}: #{e.message}"
       end
-      {
-        content: content,
-        metadata: metadata,
-        document_type: document_type
-      }
-    rescue Encoding::InvalidByteSequenceError
-      # Try with different encoding
-      content = File.read(@file_path, encoding: "ISO-8859-1")
       metadata = {
         file_size: File.size(@file_path),
-        encoding: "ISO-8859-1"
+        file_hash: calculate_file_hash(@file_path),
+        encoding: encoding
       }
-      # Try to parse front matter with different encoding too
+      # Parse YAML front matter for markdown files
       if document_type == "markdown" && content.start_with?("---\n")
         front_matter, body_content = parse_yaml_front_matter(content)
         if front_matter
@@ -265,10 +268,14 @@ module Ragdoll
         metadata[:title] = extract_title_from_filepath
       end
+      # Add content hash for duplicate detection
+      # Ensure content is UTF-8 encoded before checking presence
+      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
       {
         content: content,
         metadata: metadata,
-        document_type: document_type.nil? ? "text" : document_type
+        document_type: document_type
       }
     end
@@ -296,6 +303,7 @@ module Ragdoll
       metadata = {
         file_size: File.size(@file_path),
+        file_hash: calculate_file_hash(@file_path),
         original_format: "html"
       }
@@ -306,6 +314,9 @@ module Ragdoll
         metadata[:title] = extract_title_from_filepath
       end
+      # Add content hash for duplicate detection
+      metadata[:content_hash] = calculate_content_hash(clean_content) if clean_content.present?
       {
         content: clean_content,
         metadata: metadata,
@@ -318,6 +329,7 @@ module Ragdoll
       metadata = {
         file_size: File.size(@file_path),
+        file_hash: calculate_file_hash(@file_path),
         file_type: @file_extension.sub(".", ""),
         original_filename: File.basename(@file_path)
       }
@@ -347,6 +359,10 @@ module Ragdoll
       # Add filepath-based title as fallback
       metadata[:title] = extract_title_from_filepath
+      # Add content hash for duplicate detection
+      # Ensure content is UTF-8 encoded before checking presence
+      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
       puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
       {
@@ -461,5 +477,25 @@ module Ragdoll
         [nil, content]
       end
     end
+    # Calculate SHA256 hash of file content for duplicate detection
+    def calculate_file_hash(file_path)
+      require 'digest'
+      Digest::SHA256.file(file_path).hexdigest
+    rescue StandardError => e
+      Rails.logger.warn "Failed to calculate file hash for #{file_path}: #{e.message}" if defined?(Rails)
+      puts "Warning: Failed to calculate file hash for #{file_path}: #{e.message}"
+      nil
+    end
+    # Calculate SHA256 hash of text content for duplicate detection
+    def calculate_content_hash(content)
+      require 'digest'
+      Digest::SHA256.hexdigest(content)
+    rescue StandardError => e
+      Rails.logger.warn "Failed to calculate content hash: #{e.message}" if defined?(Rails)
+      puts "Warning: Failed to calculate content hash: #{e.message}"
+      nil
+    end
   end
 end

data/app/services/ragdoll/search_engine.rb CHANGED Viewed

@@ -33,6 +33,10 @@ module Ragdoll
       threshold = options[:threshold] || search_config[:similarity_threshold]
       filters = options[:filters] || {}
+      # Extract keywords option and normalize
+      keywords = options[:keywords] || []
+      keywords = Array(keywords).map(&:to_s).reject(&:empty?)
       # Extract tracking options
       session_id = options[:session_id]
       user_id = options[:user_id]
@@ -49,6 +53,11 @@ module Ragdoll
         return [] if query_embedding.nil?
       end
+      # Add keywords to filters if provided
+      if keywords.any?
+        filters[:keywords] = keywords
+      end
       # Search using ActiveRecord models with statistics
       # Try enhanced search first, fall back to original if it fails
       begin
@@ -81,13 +90,15 @@ module Ragdoll
             }
           end
+          search_type = keywords.any? ? "semantic_with_keywords" : "semantic"
           Ragdoll::Search.record_search(
             query: query_string,
             query_embedding: query_embedding,
             results: search_results,
-            search_type: "semantic",
+            search_type: search_type,
             filters: filters,
-            options: { limit: limit, threshold: threshold },
+            options: { limit: limit, threshold: threshold, keywords: keywords },
             execution_time_ms: execution_time,
             session_id: session_id,
             user_id: user_id

data/db/migrate/{001_enable_postgresql_extensions.rb → 20250815234901_enable_postgresql_extensions.rb} RENAMED Viewed

@@ -1,8 +1,5 @@
 class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
   def up
-    # This migration is now handled by the db:create rake task
-    # Just ensure required extensions are available
     # Vector similarity search (required for embeddings)
     execute "CREATE EXTENSION IF NOT EXISTS vector"
@@ -15,9 +12,11 @@ class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
   end
   def down
-    execute <<-SQL
-      DROP DATABASE IF EXISTS ragdoll_development;
-      DROP ROLE IF EXISTS ragdoll;
-    SQL
+    # Extensions are typically not dropped as they might be used by other databases
+    # If you really need to drop them, uncomment the following:
+    # execute "DROP EXTENSION IF EXISTS vector"
+    # execute "DROP EXTENSION IF EXISTS unaccent"
+    # execute "DROP EXTENSION IF EXISTS pg_trgm"
+    # execute "DROP EXTENSION IF EXISTS \"uuid-ossp\""
   end
-end
+end

data/db/migrate/20250815234902_create_ragdoll_documents.rb ADDED Viewed

@@ -0,0 +1,117 @@
+class CreateRagdollDocuments < ActiveRecord::Migration[7.0]
+  # For concurrent index creation (PostgreSQL)
+  disable_ddl_transaction!
+  def up
+    create_table :ragdoll_documents,
+      comment: "Core documents table with LLM-generated structured metadata" do |t|
+      t.string :location, null: false,
+        comment: "Source location of document (file path, URL, or identifier)"
+      t.string :title, null: false,
+        comment: "Human-readable document title for display and search"
+      t.text :summary, null: false, default: "",
+        comment: "LLM-generated summary of document content"
+      t.string :document_type, null: false, default: "text",
+        comment: "Document format type"
+      t.string :status, null: false, default: "pending",
+        comment: "Document processing status"
+      t.json :metadata, default: {},
+        comment: "LLM-generated structured metadata about the file"
+      t.timestamp :file_modified_at, null: false, default: -> { "CURRENT_TIMESTAMP" },
+        comment: "Timestamp when the source file was last modified"
+      t.timestamps null: false,
+        comment: "Standard creation and update timestamps"
+      # Add tsvector column for full-text search
+      t.tsvector :search_vector
+      # Add keywords as array column
+      t.text :keywords, array: true, default: []
+    end
+    ###########
+    # Indexes #
+    ###########
+    add_index :ragdoll_documents, :location, unique: true,
+      comment: "Unique index for document source lookup"
+    add_index :ragdoll_documents, :title,
+      comment: "Index for title-based search"
+    add_index :ragdoll_documents, :document_type,
+      comment: "Index for filtering by document type"
+    add_index :ragdoll_documents, :status,
+      comment: "Index for filtering by processing status"
+    add_index :ragdoll_documents, :created_at,
+      comment: "Index for chronological sorting"
+    add_index :ragdoll_documents, [:document_type, :status],
+      comment: "Composite index for type+status filtering"
+    # Full-text search index
+    execute <<-SQL
+      CREATE INDEX CONCURRENTLY index_ragdoll_documents_on_fulltext_search
+      ON ragdoll_documents
+      USING gin(to_tsvector('english',
+        COALESCE(title, '') || ' ' ||
+        COALESCE(metadata->>'summary', '') || ' ' ||
+        COALESCE(metadata->>'keywords', '') || ' ' ||
+        COALESCE(metadata->>'description', '')
+      ))
+    SQL
+    add_index :ragdoll_documents, "(metadata->>'document_type')",
+      name: "index_ragdoll_documents_on_metadata_type",
+      comment: "Index for filtering by document type"
+    add_index :ragdoll_documents, "(metadata->>'classification')",
+      name: "index_ragdoll_documents_on_metadata_classification",
+      comment: "Index for filtering by document classification"
+    # GIN index on search_vector
+    add_index :ragdoll_documents, :search_vector, using: :gin, algorithm: :concurrently
+    # GIN index on keywords array
+    add_index :ragdoll_documents, :keywords, using: :gin,
+      name: 'index_ragdoll_documents_on_keywords_gin'
+    # Trigger to keep search_vector up to date on INSERT/UPDATE
+    execute <<-SQL
+      CREATE FUNCTION ragdoll_documents_vector_update() RETURNS trigger AS $$
+      BEGIN
+        NEW.search_vector := to_tsvector('english',
+          COALESCE(NEW.title, '') || ' ' ||
+          COALESCE(NEW.metadata->>'summary', '') || ' ' ||
+          COALESCE(NEW.metadata->>'keywords', '') || ' ' ||
+          COALESCE(NEW.metadata->>'description', '')
+        );
+        RETURN NEW;
+      END
+      $$ LANGUAGE plpgsql;
+      CREATE TRIGGER ragdoll_search_vector_update
+      BEFORE INSERT OR UPDATE ON ragdoll_documents
+      FOR EACH ROW EXECUTE FUNCTION ragdoll_documents_vector_update();
+    SQL
+  end
+  def down
+    execute <<-SQL
+      DROP TRIGGER IF EXISTS ragdoll_search_vector_update ON ragdoll_documents;
+      DROP FUNCTION IF EXISTS ragdoll_documents_vector_update();
+    SQL
+    drop_table :ragdoll_documents
+  end
+end

data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb} RENAMED Viewed

@@ -3,7 +3,7 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
     create_table :ragdoll_embeddings,
       comment: "Polymorphic vector embeddings storage for semantic similarity search" do |t|
-        t.references :embeddable, polymorphic: true, null: false,
+      t.references :embeddable, polymorphic: true, null: false,
         comment: "Polymorphic reference to embeddable content"
       t.text :content, null: false, default: "",
@@ -26,16 +26,19 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
       t.timestamps null: false,
         comment: "Standard creation and update timestamps"
+    end
-      ###########
-      # Indexes #
-      ###########
+    ###########
+    # Indexes #
+    ###########
-      t.index %i[embeddable_type embeddable_id],
-        comment: "Index for finding embeddings by embeddable content"
+    add_index :ragdoll_embeddings, [:embeddable_type, :embeddable_id],
+      comment: "Index for finding embeddings by embeddable content"
-      t.index :embedding_vector, using: :ivfflat, opclass: :vector_cosine_ops, name: "index_ragdoll_embeddings_on_embedding_vector_cosine",
-        comment: "IVFFlat index for fast cosine similarity search"
-    end
+    add_index :ragdoll_embeddings, :embedding_vector,
+      using: :ivfflat,
+      opclass: :vector_cosine_ops,
+      name: "index_ragdoll_embeddings_on_embedding_vector_cosine",
+      comment: "IVFFlat index for fast cosine similarity search"
   end
-end
+end

data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb} RENAMED Viewed

@@ -29,19 +29,22 @@ class CreateRagdollContents < ActiveRecord::Migration[7.0]
       t.timestamps null: false,
         comment: "Standard creation and update timestamps"
+    end
-      ###########
-      # Indexes #
-      ###########
+    ###########
+    # Indexes #
+    ###########
-      t.index :embedding_model,
-        comment: "Index for filtering by embedding model"
+    add_index :ragdoll_contents, :embedding_model,
+      comment: "Index for filtering by embedding model"
-      t.index :type,
-        comment: "Index for filtering by content type"
+    add_index :ragdoll_contents, :type,
+      comment: "Index for filtering by content type"
-      t.index "to_tsvector('english', COALESCE(content, ''))", using: :gin, name: "index_ragdoll_contents_on_fulltext_search",
-        comment: "Full-text search index for text content"
-    end
+    execute <<-SQL
+      CREATE INDEX index_ragdoll_contents_on_fulltext_search
+      ON ragdoll_contents
+      USING gin(to_tsvector('english', COALESCE(content, '')))
+    SQL
   end
-end
+end

data/db/migrate/{007_create_ragdoll_searches.rb → 20250815234905_create_ragdoll_searches.rb} RENAMED Viewed

@@ -41,33 +41,37 @@ class CreateRagdollSearches < ActiveRecord::Migration[7.0]
       t.timestamps null: false,
         comment: "Standard creation and update timestamps"
+    end
-      ###########
-      # Indexes #
-      ###########
+    ###########
+    # Indexes #
+    ###########
-      t.index :query_embedding, using: :ivfflat, opclass: :vector_cosine_ops,
-        name: "index_ragdoll_searches_on_query_embedding_cosine",
-        comment: "IVFFlat index for finding similar search queries"
+    add_index :ragdoll_searches, :query_embedding,
+      using: :ivfflat,
+      opclass: :vector_cosine_ops,
+      name: "index_ragdoll_searches_on_query_embedding_cosine",
+      comment: "IVFFlat index for finding similar search queries"
-      t.index :search_type,
-        comment: "Index for filtering by search type"
+    add_index :ragdoll_searches, :search_type,
+      comment: "Index for filtering by search type"
-      t.index :session_id,
-        comment: "Index for grouping searches by session"
+    add_index :ragdoll_searches, :session_id,
+      comment: "Index for grouping searches by session"
-      t.index :user_id,
-        comment: "Index for filtering searches by user"
+    add_index :ragdoll_searches, :user_id,
+      comment: "Index for filtering searches by user"
-      t.index :created_at,
-        comment: "Index for chronological search history"
+    add_index :ragdoll_searches, :created_at,
+      comment: "Index for chronological search history"
-      t.index :results_count,
-        comment: "Index for analyzing search effectiveness"
+    add_index :ragdoll_searches, :results_count,
+      comment: "Index for analyzing search effectiveness"
-      t.index "to_tsvector('english', query)", using: :gin,
-        name: "index_ragdoll_searches_on_fulltext_query",
-        comment: "Full-text search index for finding searches by query text"
-    end
+    execute <<-SQL
+      CREATE INDEX index_ragdoll_searches_on_fulltext_query
+      ON ragdoll_searches
+      USING gin(to_tsvector('english', query))
+    SQL
   end
 end

data/db/migrate/{008_create_ragdoll_search_results.rb → 20250815234906_create_ragdoll_search_results.rb} RENAMED Viewed

@@ -24,26 +24,26 @@ class CreateRagdollSearchResults < ActiveRecord::Migration[7.0]
       t.timestamps null: false,
         comment: "Standard creation and update timestamps"
+    end
-      ###########
-      # Indexes #
-      ###########
+    ###########
+    # Indexes #
+    ###########
-      t.index [:search_id, :result_rank],
-        name: "idx_search_results_search_rank",
-        comment: "Index for retrieving results in ranked order"
+    add_index :ragdoll_search_results, [:search_id, :result_rank],
+      name: "idx_search_results_search_rank",
+      comment: "Index for retrieving results in ranked order"
-      t.index [:embedding_id, :similarity_score],
-        name: "idx_search_results_embedding_score",
-        comment: "Index for analyzing embedding performance"
+    add_index :ragdoll_search_results, [:embedding_id, :similarity_score],
+      name: "idx_search_results_embedding_score",
+      comment: "Index for analyzing embedding performance"
-      t.index :similarity_score,
-        name: "idx_search_results_similarity",
-        comment: "Index for similarity score analysis"
+    add_index :ragdoll_search_results, :similarity_score,
+      name: "idx_search_results_similarity",
+      comment: "Index for similarity score analysis"
-      t.index [:clicked, :clicked_at],
-        name: "idx_search_results_clicks",
-        comment: "Index for click-through analysis"
-    end
+    add_index :ragdoll_search_results, [:clicked, :clicked_at],
+      name: "idx_search_results_clicks",
+      comment: "Index for click-through analysis"
   end
 end

data/lib/ragdoll/core/client.rb CHANGED Viewed

@@ -184,7 +184,7 @@ module Ragdoll
       end
       # Document management
-      def add_document(path:)
+      def add_document(path:, force: false)
         # Parse the document
         parsed = Ragdoll::DocumentProcessor.parse(path)
@@ -197,7 +197,7 @@ module Ragdoll
                                                    title: title,
                                                    document_type: parsed[:document_type],
                                                    **parsed[:metadata]
-                                                 })
+                                                 }, force: force)
         # Queue background jobs for processing if content is available
         embeddings_queued = false

data/lib/ragdoll/core/database.rb CHANGED Viewed

@@ -90,10 +90,10 @@ module Ragdoll
         # Drop all tables in correct order (respecting foreign key constraints)
         # Order: dependent tables first, then parent tables
         tables_to_drop = %w[
+          ragdoll_search_results
+          ragdoll_searches
           ragdoll_embeddings
-          ragdoll_text_contents
-          ragdoll_image_contents
-          ragdoll_audio_contents
+          ragdoll_contents
           ragdoll_documents
           schema_migrations
         ]
@@ -109,6 +109,11 @@ module Ragdoll
           end
         end
+        # Also drop any functions/triggers that might exist
+        if ActiveRecord::Base.connection.adapter_name.downcase.include?("postgresql")
+          ActiveRecord::Base.connection.execute("DROP FUNCTION IF EXISTS ragdoll_documents_vector_update() CASCADE")
+        end
         migrate!
       end

data/lib/ragdoll/core/version.rb CHANGED Viewed

@@ -3,6 +3,6 @@
 module Ragdoll
   module Core
-    VERSION = "0.1.9"
+    VERSION = "0.1.11"
   end
 end