RubyGems - ragdoll - Versions diffs - 0.1.0 → 0.1.1 - Mend

ragdoll 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

checksums.yaml +4 -4
data/README.md +318 -40
data/Rakefile +15 -4
data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
data/db/migrate/004_create_ragdoll_documents.rb +70 -0
data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
data/db/migrate/006_create_ragdoll_contents.rb +47 -0
data/lib/ragdoll/core/client.rb +315 -0
data/lib/ragdoll/core/configuration.rb +273 -0
data/lib/ragdoll/core/database.rb +141 -0
data/lib/ragdoll/core/document_management.rb +110 -0
data/lib/ragdoll/core/document_processor.rb +344 -0
data/lib/ragdoll/core/embedding_service.rb +183 -0
data/lib/ragdoll/core/errors.rb +11 -0
data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
data/lib/ragdoll/core/metadata_schemas.rb +334 -0
data/lib/ragdoll/core/models/audio_content.rb +175 -0
data/lib/ragdoll/core/models/content.rb +126 -0
data/lib/ragdoll/core/models/document.rb +678 -0
data/lib/ragdoll/core/models/embedding.rb +204 -0
data/lib/ragdoll/core/models/image_content.rb +227 -0
data/lib/ragdoll/core/models/text_content.rb +169 -0
data/lib/ragdoll/core/search_engine.rb +50 -0
data/lib/ragdoll/core/services/image_description_service.rb +230 -0
data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
data/lib/ragdoll/core/shrine_config.rb +71 -0
data/lib/ragdoll/core/text_chunker.rb +210 -0
data/lib/ragdoll/core/text_generation_service.rb +360 -0
data/lib/ragdoll/core/version.rb +8 -0
data/lib/ragdoll/core.rb +73 -0
data/lib/ragdoll-core.rb +3 -0
data/lib/ragdoll.rb +243 -6
data/lib/tasks/annotate.rake +126 -0
data/lib/tasks/db.rake +338 -0
metadata +40 -37
data/app/models/ragdoll/document.rb +0 -9
data/app/models/ragdoll/embedding.rb +0 -9
data/config/initializers/ragdoll.rb +0 -6
data/config/routes.rb +0 -5
data/db/migrate/20250218123456_create_documents.rb +0 -20
data/lib/config/database.yml +0 -28
data/lib/config/ragdoll.yml +0 -31
data/lib/ragdoll/engine.rb +0 -16
data/lib/ragdoll/import_job.rb +0 -15
data/lib/ragdoll/ingestion.rb +0 -30
data/lib/ragdoll/search.rb +0 -18
data/lib/ragdoll/version.rb +0 -7
data/lib/tasks/import_task.thor +0 -32
data/lib/tasks/jobs_task.thor +0 -40
data/lib/tasks/ragdoll_tasks.thor +0 -7
data/lib/tasks/search_task.thor +0 -55

data/lib/ragdoll/core/models/embedding.rb ADDED Viewed

@@ -0,0 +1,204 @@
+# frozen_string_literal: true
+require "active_record"
+require "neighbor"
+# == Schema Information
+#
+# Table name: ragdoll_embeddings
+#
+#  id                                                                                              :bigint           not null, primary key
+#  chunk_index(Chunk index for ordering embeddings within the embeddable content)                  :integer          not null
+#  content(Original text content that was embedded, typically a document chunk)                    :text             not null
+#  embeddable_type                                                                                 :string           not null
+#  embedding_vector(Vector embedding using pgvector for optimal similarity search performance)     :vector(1536)     not null
+#  returned_at(Timestamp of most recent usage, for recency-based ranking and cache management)     :datetime
+#  usage_count(Number of times used in similarity searches, for caching optimization)              :integer          default(0)
+#  created_at(Standard creation and update timestamps for lifecycle tracking)                      :datetime         not null
+#  updated_at(Standard creation and update timestamps for lifecycle tracking)                      :datetime         not null
+#  embeddable_id(Polymorphic reference to embeddable content (text, image, audio))                 :bigint           not null
+#
+# Indexes
+#
+#  index_ragdoll_embeddings_on_embeddable                         (embeddable_type,embeddable_id)
+#  index_ragdoll_embeddings_on_embeddable_chunk                   (embeddable_type,embeddable_id,chunk_index) UNIQUE
+#  index_ragdoll_embeddings_on_embeddable_type_and_embeddable_id  (embeddable_type,embeddable_id)
+#  index_ragdoll_embeddings_on_embedding_vector_cosine            (embedding_vector) USING ivfflat
+#  index_ragdoll_embeddings_on_returned_at                        (returned_at)
+#  index_ragdoll_embeddings_on_usage_count                        (usage_count)
+#
+module Ragdoll
+  module Core
+    module Models
+      class Embedding < ActiveRecord::Base
+        self.table_name = "ragdoll_embeddings"
+        # Use pgvector for vector similarity search
+        has_neighbors :embedding_vector
+        belongs_to :embeddable, polymorphic: true
+        validates :embeddable_id,    presence: true
+        validates :embeddable_type,  presence: true
+        validates :chunk_index,      presence: true, uniqueness: { scope: %i[embeddable_id embeddable_type] }
+        validates :embedding_vector, presence: true
+        validates :content,          presence: true
+        scope :by_model, lambda { |model|
+          # Use STI table for all content types
+          where(
+            "embeddable_id IN (SELECT id FROM ragdoll_contents WHERE embedding_model = ?)",
+            model
+          )
+        }
+        scope :recent,             -> { order(created_at: :desc) }
+        scope :frequently_used,    -> { where("usage_count > 0").order(usage_count: :desc) }
+        scope :by_chunk_order,     -> { order(:chunk_index) }
+        scope :by_embeddable_type, ->(type) { where(embeddable_type: type) }
+        scope :text_embeddings,    -> { where(embeddable_type: "Ragdoll::Core::Models::TextContent") }
+        scope :image_embeddings,   -> { where(embeddable_type: "Ragdoll::Core::Models::ImageContent") }
+        scope :audio_embeddings,   -> { where(embeddable_type: "Ragdoll::Core::Models::AudioContent") }
+        # JSON columns are handled natively by PostgreSQL - no serialization needed
+        # Callback for vector column updates (no-op for pgvector)
+        before_save :update_vector_columns
+        def embedding_dimensions
+          embedding_vector&.length || 0
+        end
+        # Access embedding_model via polymorphic relationship
+        def embedding_model
+          embeddable&.embedding_model
+        end
+        def mark_as_used!
+          increment!(:usage_count)
+          update!(returned_at: Time.current)
+        end
+        # PostgreSQL pgvector similarity search using neighbor gem
+        def self.search_similar(query_embedding, limit: Ragdoll.config.search[:max_results], threshold: Ragdoll.config.search[:similarity_threshold], filters: {})
+          # Apply filters
+          scope = all
+          scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
+          scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
+          scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]
+          # Document-level filters require joining through embeddable (STI Content) to documents
+          if filters[:document_type]
+            scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
+                         .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
+                         .where("ragdoll_documents.document_type = ?", filters[:document_type])
+          end
+          # Use pgvector for similarity search
+          search_with_pgvector(query_embedding, scope, limit, threshold)
+        end
+        # Fast search using pgvector with neighbor gem
+        def self.search_with_pgvector(query_embedding, scope, limit, threshold)
+          # Use pgvector for similarity search
+          neighbor_results = scope
+                             .includes(:embeddable)
+                             .nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
+                             .limit(limit * 2) # Get more to filter by threshold
+          results = []
+          highest_similarity = 0.0
+          neighbor_results.each do |embedding|
+            # Calculate cosine similarity (neighbor returns distance, we want similarity)
+            similarity = 1.0 - embedding.neighbor_distance
+            highest_similarity = similarity if similarity > highest_similarity
+            next if similarity < threshold
+            usage_score = calculate_usage_score(embedding)
+            combined_score = similarity + usage_score
+            results << build_result_hash(embedding, query_embedding, similarity, highest_similarity,
+                                         usage_score, combined_score)
+          end
+          # Sort by combined score and limit
+          results = results.sort_by { |r| -r[:combined_score] }.take(limit)
+          mark_embeddings_as_used(results)
+          results
+        end
+        private
+        # Calculate usage score for ranking
+        def self.calculate_usage_score(embedding)
+          usage_score = 0.0
+          if embedding.returned_at && embedding.usage_count.positive?
+            frequency_weight = 0.7
+            recency_weight = 0.3
+            frequency_score = [Math.log(embedding.usage_count + 1) / Math.log(100), 1.0].min
+            days_since_use = (Time.current - embedding.returned_at) / 1.day
+            recency_score = Math.exp(-days_since_use / 30)
+            usage_score = frequency_weight * frequency_score + recency_weight * recency_score
+          end
+          usage_score
+        end
+        # Build standardized result hash
+        def self.build_result_hash(embedding, query_embedding, similarity, highest_similarity, usage_score,
+                                   combined_score)
+          {
+            embedding_id: embedding.id.to_s,
+            embeddable_id: embedding.embeddable_id.to_s,
+            embeddable_type: embedding.embeddable_type,
+            document_id: embedding.embeddable&.document_id&.to_s || "Unknown",
+            document_title: embedding.embeddable&.document&.title || "Unknown",
+            document_location: embedding.embeddable&.document&.location || "Unknown",
+            content: embedding.content,
+            similarity: similarity,
+            highest_similarity: highest_similarity,
+            distance: 1.0 - similarity,
+            chunk_index: embedding.chunk_index,
+            embedding_dimensions: query_embedding.length,
+            embedding_model: embedding.embeddable&.embedding_model,
+            usage_count: embedding.usage_count || 0,
+            returned_at: embedding.returned_at,
+            usage_score: usage_score,
+            combined_score: combined_score
+          }
+        end
+        # Mark embeddings as used for analytics
+        def self.mark_embeddings_as_used(results)
+          return if results.empty?
+          embedding_ids = results.map { |r| r[:embedding_id] }
+          where(id: embedding_ids).update_all(
+            usage_count: arel_table[:usage_count] + 1,
+            returned_at: Time.current
+          )
+        end
+        # Callback to update vector columns when embedding_vector changes
+        def update_vector_columns
+          # No additional processing needed for pgvector
+        end
+        def self.cosine_similarity(vec1, vec2)
+          return 0.0 if vec1.nil? || vec2.nil? || vec1.length != vec2.length
+          dot_product = vec1.zip(vec2).sum { |a, b| a * b }
+          magnitude1 = Math.sqrt(vec1.sum { |a| a * a })
+          magnitude2 = Math.sqrt(vec2.sum { |a| a * a })
+          return 0.0 if magnitude1 == 0.0 || magnitude2 == 0.0
+          dot_product / (magnitude1 * magnitude2)
+        end
+      end
+    end
+  end
+end

data/lib/ragdoll/core/models/image_content.rb ADDED Viewed

@@ -0,0 +1,227 @@
+# frozen_string_literal: true
+require "active_record"
+require_relative "content"
+# == Schema Information
+#
+# Table name: ragdoll_contents (STI)
+#
+#  id                                                            :bigint           not null, primary key
+#  type(Type of content - TextContent, ImageContent, AudioContent) :string         not null
+#  document_id(Reference to parent document)                     :bigint           not null
+#  embedding_model(Embedding model to use for this content)      :string           not null
+#  content(Text content or description of the file)              :text
+#  data(Raw data from file)                                      :text
+#  metadata(Additional metadata about the file's raw data)       :json             default({})
+#  duration(Duration of audio in seconds - for audio content)    :float
+#  sample_rate(Audio sample rate in Hz - for audio content)      :integer
+#  created_at(Standard creation and update timestamps)           :datetime         not null
+#  updated_at(Standard creation and update timestamps)           :datetime         not null
+#
+# Indexes
+#
+#  index_ragdoll_contents_on_document_id        (document_id)
+#  index_ragdoll_contents_on_embedding_model    (embedding_model)
+#  index_ragdoll_contents_on_type               (type)
+#  index_ragdoll_contents_on_fulltext_search    (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
+#
+# Foreign Keys
+#
+#  fk_rails_...  (document_id => ragdoll_documents.id)
+#
+module Ragdoll
+  module Core
+    module Models
+      class ImageContent < Content
+        validate :image_data_or_description_present
+        scope :recent, -> { order(created_at: :desc) }
+        scope :with_images, -> { where.not(data: [nil, ""]) }
+        scope :with_descriptions, -> { where.not(content: [nil, ""]) }
+        # Image content accessors - content field stores description for embedding
+        def description
+          content
+        end
+        def description=(value)
+          self.content = value
+        end
+        # Image file data accessor
+        def image_data
+          data
+        end
+        def image_data=(value)
+          self.data = value
+        end
+        # Image-specific technical metadata (raw file properties)
+        # This metadata is about the actual image file data, not AI-generated insights
+        def alt_text
+          metadata.dig('alt_text')
+        end
+        def alt_text=(value)
+          self.metadata = metadata.merge('alt_text' => value)
+        end
+        def embedding_count
+          embeddings.count
+        end
+        # Image file technical properties (stored in content metadata - raw file data)
+        def image_attached?
+          data.present?
+        end
+        def image_size
+          metadata.dig('file_size') || 0
+        end
+        def image_size=(value)
+          self.metadata = metadata.merge('file_size' => value)
+        end
+        def image_content_type
+          metadata.dig('content_type')
+        end
+        def image_content_type=(value)
+          self.metadata = metadata.merge('content_type' => value)
+        end
+        def image_filename
+          metadata.dig('filename')
+        end
+        def image_filename=(value)
+          self.metadata = metadata.merge('filename' => value)
+        end
+        def image_dimensions
+          width = metadata.dig('width')
+          height = metadata.dig('height')
+          return nil unless width && height
+          { width: width, height: height }
+        end
+        def set_image_dimensions(width, height)
+          self.metadata = metadata.merge('width' => width, 'height' => height)
+        end
+        # Image format and technical details
+        def color_space
+          metadata.dig('color_space')
+        end
+        def color_space=(value)
+          self.metadata = metadata.merge('color_space' => value)
+        end
+        def bit_depth
+          metadata.dig('bit_depth')
+        end
+        def bit_depth=(value)
+          self.metadata = metadata.merge('bit_depth' => value)
+        end
+        # Generate description from image file using LLM vision capabilities
+        def generate_description_from_image!(options = {})
+          return false unless image_attached? || file_path_available?
+          begin
+            image_path = get_image_path
+            return false unless image_path
+            # Use the image description service
+            require_relative "../services/image_description_service"
+            description_service = Services::ImageDescriptionService.new
+            generated_description = description_service.generate_description(image_path, options)
+            if generated_description.present?
+              self.description = generated_description
+              save!
+              return true
+            end
+            false
+          rescue StandardError => e
+            puts "Failed to generate image description: #{e.message}"
+            false
+          end
+        end
+        # Override content for embedding to combine description and alt_text
+        def content_for_embedding
+          content_parts = []
+          content_parts << alt_text if alt_text.present?
+          content_parts << description if description.present?
+          content_parts.join(" ")
+        end
+        def generate_embeddings!
+          return unless should_generate_embeddings?
+          embedding_content = content_for_embedding
+          return if embedding_content.blank?
+          # Generate embeddings using the base class method
+          super
+        end
+        # Override should_generate_embeddings to check for content
+        def should_generate_embeddings?
+          content_for_embedding.present? && embeddings.empty?
+        end
+        def self.stats
+          {
+            total_image_contents: count,
+            by_model: group(:embedding_model).count,
+            total_embeddings: joins(:embeddings).count,
+            with_images: with_images.count,
+            with_descriptions: with_descriptions.count,
+            average_image_size: joins(:image_attachment).average("active_storage_blobs.byte_size")
+          }
+        end
+        private
+        def file_path_available?
+          document&.location&.present? && File.exist?(document.location)
+        end
+        def get_image_path
+          if file_path_available?
+            # Use document location if it's an image file
+            document.location if image_file?(document.location)
+          elsif image_attached?
+            # Try to get path from stored data (if it's a file path)
+            data if data&.start_with?('/')
+          end
+        end
+        def image_file?(file_path)
+          return false unless file_path
+          image_extensions = %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif]
+          ext = File.extname(file_path).downcase
+          image_extensions.include?(ext)
+        end
+        def image_data_or_description_present
+          return if image_attached? || description.present? || alt_text.present?
+          errors.add(:base, "Must have either image data or description/alt_text")
+        end
+      end
+    end
+  end
+end

data/lib/ragdoll/core/models/text_content.rb ADDED Viewed

@@ -0,0 +1,169 @@
+# frozen_string_literal: true
+require "active_record"
+require_relative "content"
+# == Schema Information
+#
+# Table name: ragdoll_contents (STI)
+#
+#  id                                                            :bigint           not null, primary key
+#  type(Type of content - TextContent, ImageContent, AudioContent) :string         not null
+#  document_id(Reference to parent document)                     :bigint           not null
+#  embedding_model(Embedding model to use for this content)      :string           not null
+#  content(Text content or description of the file)              :text
+#  data(Raw data from file)                                      :text
+#  metadata(Additional metadata about the file's raw data)       :json             default({})
+#  duration(Duration of audio in seconds - for audio content)    :float
+#  sample_rate(Audio sample rate in Hz - for audio content)      :integer
+#  created_at(Standard creation and update timestamps)           :datetime         not null
+#  updated_at(Standard creation and update timestamps)           :datetime         not null
+#
+# Indexes
+#
+#  index_ragdoll_contents_on_document_id        (document_id)
+#  index_ragdoll_contents_on_embedding_model    (embedding_model)
+#  index_ragdoll_contents_on_type               (type)
+#  index_ragdoll_contents_on_fulltext_search    (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
+#
+# Foreign Keys
+#
+#  fk_rails_...  (document_id => ragdoll_documents.id)
+#
+module Ragdoll
+  module Core
+    module Models
+      class TextContent < Content
+        validates :content, presence: true
+        scope :recent, -> { order(created_at: :desc) }
+        # Text-specific processing configuration stored in content metadata
+        # This metadata is about the raw content processing, not AI-generated insights
+        def chunk_size
+          metadata.dig('chunk_size') || 1000
+        end
+        def chunk_size=(value)
+          self.metadata = metadata.merge('chunk_size' => value)
+        end
+        def overlap
+          metadata.dig('overlap') || 200
+        end
+        def overlap=(value)
+          self.metadata = metadata.merge('overlap' => value)
+        end
+        # Content-specific technical metadata (file processing info)
+        def encoding
+          metadata.dig('encoding')
+        end
+        def encoding=(value)
+          self.metadata = metadata.merge('encoding' => value)
+        end
+        def line_count
+          metadata.dig('line_count')
+        end
+        def line_count=(value)
+          self.metadata = metadata.merge('line_count' => value)
+        end
+        def word_count
+          content&.split&.length || 0
+        end
+        def character_count
+          content&.length || 0
+        end
+        def embedding_count
+          embeddings.count
+        end
+        # Text-specific processing methods
+        def chunks
+          return [] if content.blank?
+          chunks = []
+          start_pos = 0
+          while start_pos < content.length
+            end_pos = [start_pos + chunk_size, content.length].min
+            # Try to break at word boundary if not at end
+            if end_pos < content.length
+              last_space = content.rindex(" ", end_pos)
+              end_pos = last_space if last_space && last_space > start_pos
+            end
+            chunk_content = content[start_pos...end_pos].strip
+            if chunk_content.present?
+              chunks << {
+                content: chunk_content,
+                start_position: start_pos,
+                end_position: end_pos,
+                chunk_index: chunks.length
+              }
+            end
+            break if end_pos >= content.length
+            start_pos = [end_pos - overlap, start_pos + 1].max
+          end
+          chunks
+        end
+        def generate_embeddings!
+          return if content.blank?
+          # Clear existing embeddings
+          embeddings.destroy_all
+          # Use TextChunker to split content into manageable chunks
+          chunks = Ragdoll::Core::TextChunker.chunk(content)
+          # Generate embeddings for each chunk
+          embedding_service = Ragdoll::Core::EmbeddingService.new
+          chunks.each_with_index do |chunk_text, index|
+            begin
+              vector = embedding_service.generate_embedding(chunk_text)
+              embeddings.create!(
+                content: chunk_text,
+                embedding_vector: vector,
+                chunk_index: index
+              )
+            rescue StandardError => e
+              puts "Failed to generate embedding for chunk #{index}: #{e.message}"
+            end
+          end
+          update!(metadata: (metadata || {}).merge("embeddings_generated_at" => Time.current))
+        end
+        # Override content for embedding to use the text content
+        def content_for_embedding
+          content
+        end
+        def self.stats
+          {
+            total_text_contents:  count,
+            by_model:             group(:embedding_model).count,
+            total_embeddings:     joins(:embeddings).count,
+            average_word_count:   average("LENGTH(content) - LENGTH(REPLACE(content, ' ', '')) + 1"),
+            average_chunk_size:   average(:chunk_size)
+          }
+        end
+      end
+    end
+  end
+end

data/lib/ragdoll/core/search_engine.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+# FIXME: This class does not focus on search itself: it only builds a query embedding and delegates to Models::Embedding.search_similar.
+module Ragdoll
+  module Core
+    class SearchEngine
+      def initialize(embedding_service)
+        @embedding_service = embedding_service
+      end
+      def search_documents(query, options = {})
+        limit = options[:limit] || Ragdoll.config.search[:max_results]
+        threshold = options[:threshold] || Ragdoll.config.search[:similarity_threshold]
+        filters = options[:filters] || {}
+        # Generate embedding for the query
+        query_embedding = @embedding_service.generate_embedding(query)
+        return [] if query_embedding.nil?
+        # Search using ActiveRecord models
+        Models::Embedding.search_similar(query_embedding,
+                                         limit: limit,
+                                         threshold: threshold,
+                                         filters: filters)
+      end
+      def search_similar_content(query_or_embedding, options = {})
+        limit = options[:limit] || Ragdoll.config.search[:max_results]
+        threshold = options[:threshold] || Ragdoll.config.search[:similarity_threshold]
+        filters = options[:filters] || {}
+        if query_or_embedding.is_a?(Array)
+          # It's already an embedding
+          query_embedding = query_or_embedding
+        else
+          # It's a query string, generate embedding
+          query_embedding = @embedding_service.generate_embedding(query_or_embedding)
+          return [] if query_embedding.nil?
+        end
+        # Search using ActiveRecord models
+        Models::Embedding.search_similar(query_embedding,
+                                         limit: limit,
+                                         threshold: threshold,
+                                         filters: filters)
+      end
+    end
+  end
+end