RubyGems - ragdoll - Versions diffs - 0.1.11 → 0.1.12 - Mend

ragdoll 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/README.md +323 -384
data/app/models/ragdoll/document.rb +1 -1
data/app/models/ragdoll/unified_content.rb +216 -0
data/app/models/ragdoll/unified_document.rb +338 -0
data/app/services/ragdoll/audio_to_text_service.rb +200 -0
data/app/services/ragdoll/document_converter.rb +216 -0
data/app/services/ragdoll/document_processor.rb +197 -331
data/app/services/ragdoll/image_to_text_service.rb +322 -0
data/app/services/ragdoll/migration_service.rb +340 -0
data/app/services/ragdoll/text_extraction_service.rb +422 -0
data/app/services/ragdoll/unified_document_management.rb +300 -0
data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
data/lib/ragdoll/core/version.rb +1 -1
data/lib/ragdoll/core.rb +7 -0
metadata +11 -2

data/app/services/ragdoll/unified_document_management.rb ADDED Viewed

@@ -0,0 +1,300 @@
+# frozen_string_literal: true
+module Ragdoll
+  # Unified document management service for text-based RAG system
+  # Handles the entire pipeline from document ingestion to searchable text embeddings
+  class UnifiedDocumentManagement
+    class ProcessingError < StandardError; end
+    def self.add_document(file_path, **options)
+      new.add_document(file_path, **options)
+    end
+    def self.add_document_from_upload(uploaded_file, **options)
+      new.add_document_from_upload(uploaded_file, **options)
+    end
+    def self.process_document(document_id)
+      new.process_document(document_id)
+    end
+    def initialize
+      @converter = Ragdoll::DocumentConverter.new
+    end
+    # Add a document from file path
+    def add_document(file_path, **options)
+      return nil unless File.exist?(file_path)
+      # Determine document type
+      document_type = @converter.determine_document_type(file_path)
+      # Convert to text
+      text_content = @converter.convert_to_text(file_path, document_type)
+      # Create document
+      document = create_unified_document(
+        location: File.expand_path(file_path),
+        document_type: document_type,
+        text_content: text_content,
+        **options
+      )
+      # Process asynchronously if requested
+      if options[:async]
+        process_document_async(document.id)
+      else
+        process_document_sync(document)
+      end
+      document
+    end
+    # Add a document from uploaded file
+    def add_document_from_upload(uploaded_file, **options)
+      # Create temporary file to process
+      temp_file = nil
+      begin
+        temp_file = create_temp_file_from_upload(uploaded_file)
+        document_type = @converter.determine_document_type(temp_file.path)
+        text_content = @converter.convert_to_text(temp_file.path, document_type)
+        # Create document
+        document = create_unified_document(
+          location: uploaded_file.original_filename || "uploaded_file",
+          document_type: document_type,
+          text_content: text_content,
+          **options
+        )
+        # Process asynchronously if requested
+        if options[:async]
+          process_document_async(document.id)
+        else
+          process_document_sync(document)
+        end
+        document
+      ensure
+        temp_file&.close
+        temp_file&.unlink if temp_file&.path
+      end
+    end
+    # Process a document by ID
+    def process_document(document_id)
+      if defined?(Ragdoll::UnifiedDocument)
+        document = Ragdoll::UnifiedDocument.find(document_id)
+      else
+        # Fallback to regular Document
+        document = Ragdoll::Document.find(document_id)
+      end
+      process_document_sync(document)
+    end
+    # Reprocess document with new text conversion
+    def reprocess_document(document_id, **options)
+      if defined?(Ragdoll::UnifiedDocument)
+        document = Ragdoll::UnifiedDocument.find(document_id)
+      else
+        document = Ragdoll::Document.find(document_id)
+      end
+      return nil unless File.exist?(document.location)
+      # Re-convert to text
+      document_type = @converter.determine_document_type(document.location)
+      text_content = @converter.convert_to_text(document.location, document_type, **options)
+      # Update document content
+      if document.respond_to?(:unified_contents)
+        # Unified document approach
+        if document.unified_contents.any?
+          document.unified_contents.first.update!(content: text_content)
+        else
+          document.unified_contents.create!(
+            content: text_content,
+            original_media_type: document_type,
+            embedding_model: "text-embedding-3-large",
+            metadata: { "reprocessed_at" => Time.current }
+          )
+        end
+      else
+        # Fallback to content field
+        document.content = text_content
+      end
+      # Reprocess
+      process_document_sync(document)
+    end
+    # Batch processing for multiple documents
+    def batch_process_documents(file_paths, **options)
+      results = []
+      errors = []
+      file_paths.each do |file_path|
+        begin
+          document = add_document(file_path, **options)
+          results << document
+        rescue StandardError => e
+          errors << { file_path: file_path, error: e.message }
+        end
+      end
+      {
+        processed: results,
+        errors: errors,
+        total: file_paths.length,
+        success_count: results.length,
+        error_count: errors.length
+      }
+    end
+    # Search across all documents
+    def search_documents(query, **options)
+      if defined?(Ragdoll::UnifiedDocument)
+        Ragdoll::UnifiedDocument.search_content(query, **options)
+      else
+        Ragdoll::Document.search_content(query, **options)
+      end
+    end
+    # Get processing statistics
+    def processing_stats
+      if defined?(Ragdoll::UnifiedDocument)
+        base_stats = Ragdoll::UnifiedDocument.stats
+        content_stats = Ragdoll::UnifiedContent.stats
+      else
+        base_stats = Ragdoll::Document.stats
+        content_stats = Ragdoll::Content.stats
+      end
+      {
+        documents: base_stats,
+        content: content_stats,
+        processing_summary: {
+          total_documents: base_stats[:total_documents],
+          processed_documents: base_stats.dig(:by_status, "processed") || 0,
+          total_embeddings: base_stats[:total_embeddings],
+          average_processing_time: estimate_average_processing_time
+        }
+      }
+    end
+    private
+    def create_unified_document(location:, document_type:, text_content:, **options)
+      title = options[:title] || extract_title_from_location(location)
+      if defined?(Ragdoll::UnifiedDocument)
+        document = Ragdoll::UnifiedDocument.create!(
+          location: location,
+          title: title,
+          document_type: document_type,
+          status: "pending",
+          file_modified_at: options[:file_modified_at] || Time.current,
+          metadata: options[:metadata] || {}
+        )
+        # Create unified content
+        document.unified_contents.create!(
+          content: text_content,
+          original_media_type: document_type,
+          embedding_model: "text-embedding-3-large",
+          metadata: {
+            "created_at" => Time.current,
+            "conversion_method" => "unified_converter",
+            "original_filename" => File.basename(location)
+          }
+        )
+      else
+        # Fallback to regular Document
+        document = Ragdoll::Document.create!(
+          location: location,
+          title: title,
+          content: text_content,
+          document_type: document_type,
+          status: "pending",
+          file_modified_at: options[:file_modified_at] || Time.current,
+          metadata: options[:metadata] || {}
+        )
+      end
+      document
+    end
+    def process_document_sync(document)
+      begin
+        if document.respond_to?(:process_document!)
+          document.process_document!
+        else
+          # Fallback processing
+          document.update!(status: "processing")
+          generate_embeddings_for_document(document)
+          document.update!(status: "processed")
+        end
+      rescue StandardError => e
+        document.update!(status: "error", metadata: (document.metadata || {}).merge("error" => e.message))
+        raise ProcessingError, "Failed to process document #{document.id}: #{e.message}"
+      end
+      document
+    end
+    def process_document_async(document_id)
+      # In a real application, this would enqueue a background job
+      # For now, we'll just process synchronously
+      puts "Note: Async processing not implemented, processing synchronously"
+      process_document(document_id)
+    end
+    def generate_embeddings_for_document(document)
+      if document.respond_to?(:unified_contents)
+        document.unified_contents.each(&:generate_embeddings!)
+      elsif document.respond_to?(:contents)
+        document.contents.each(&:generate_embeddings!)
+      end
+    end
+    def create_temp_file_from_upload(uploaded_file)
+      temp_file = Tempfile.new([
+        File.basename(uploaded_file.original_filename || "upload", ".*"),
+        File.extname(uploaded_file.original_filename || "")
+      ])
+      if uploaded_file.respond_to?(:read)
+        temp_file.write(uploaded_file.read)
+      elsif uploaded_file.respond_to?(:path)
+        FileUtils.cp(uploaded_file.path, temp_file.path)
+      else
+        raise ProcessingError, "Unknown upload file format"
+      end
+      temp_file.flush
+      temp_file.rewind
+      temp_file
+    end
+    def extract_title_from_location(location)
+      filename = File.basename(location, File.extname(location))
+      # Clean up common patterns in filenames
+      title = filename
+               .gsub(/[-_]+/, ' ')
+               .gsub(/([a-z])([A-Z])/, '\1 \2')
+               .gsub(/\s+/, ' ')
+               .strip
+      # Capitalize words for better readability
+      title.split(' ').map(&:capitalize).join(' ')
+    end
+    def estimate_average_processing_time
+      # This would be calculated from actual processing logs in production
+      # For now, return a placeholder
+      "~2.5 seconds"
+    end
+  end
+end

data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+class CreateRagdollUnifiedContents < ActiveRecord::Migration[7.0]
+  def change
+    unless table_exists?(:ragdoll_unified_contents)
+      create_table :ragdoll_unified_contents,
+        comment: "Unified content storage for text-based RAG architecture" do |t|
+        t.references :document, null: false, foreign_key: { to_table: :ragdoll_documents },
+          comment: "Reference to parent document"
+        t.text :content, null: false,
+          comment: "Text content (original text, extracted text, image description, audio transcript, etc.)"
+        t.string :original_media_type, null: false,
+          comment: "Original media type (text, image, audio, video, pdf, docx, html, markdown, unknown)"
+        t.string :embedding_model, null: false,
+          comment: "Embedding model used for this content"
+        t.string :conversion_method,
+          comment: "Method used to convert to text (text_extraction, image_to_text, audio_transcription, etc.)"
+        t.integer :word_count, default: 0,
+          comment: "Number of words in the content"
+        t.integer :character_count, default: 0,
+          comment: "Number of characters in the content"
+        t.float :content_quality_score, default: 0.0,
+          comment: "Quality score of the converted content (0.0-1.0)"
+        t.json :metadata, default: {},
+          comment: "Additional metadata about the conversion and content"
+        t.timestamps null: false,
+          comment: "Standard creation and update timestamps"
+      end
+    else
+      # Add missing columns to existing table
+      add_column :ragdoll_unified_contents, :original_media_type, :string unless column_exists?(:ragdoll_unified_contents, :original_media_type)
+      add_column :ragdoll_unified_contents, :conversion_method, :string unless column_exists?(:ragdoll_unified_contents, :conversion_method)
+      add_column :ragdoll_unified_contents, :word_count, :integer, default: 0 unless column_exists?(:ragdoll_unified_contents, :word_count)
+      add_column :ragdoll_unified_contents, :character_count, :integer, default: 0 unless column_exists?(:ragdoll_unified_contents, :character_count)
+      add_column :ragdoll_unified_contents, :content_quality_score, :float, default: 0.0 unless column_exists?(:ragdoll_unified_contents, :content_quality_score)
+    end
+    ###########
+    # Indexes #
+    ###########
+    unless index_exists?(:ragdoll_unified_contents, :embedding_model)
+      add_index :ragdoll_unified_contents, :embedding_model,
+        comment: "Index for filtering by embedding model"
+    end
+    unless index_exists?(:ragdoll_unified_contents, :original_media_type)
+      add_index :ragdoll_unified_contents, :original_media_type,
+        comment: "Index for filtering by original media type"
+    end
+    unless index_exists?(:ragdoll_unified_contents, :conversion_method)
+      add_index :ragdoll_unified_contents, :conversion_method,
+        comment: "Index for filtering by conversion method"
+    end
+    unless index_exists?(:ragdoll_unified_contents, :content_quality_score)
+      add_index :ragdoll_unified_contents, :content_quality_score,
+        comment: "Index for filtering by content quality"
+    end
+    unless index_exists?(:ragdoll_unified_contents, [:document_id, :original_media_type], name: "index_unified_contents_on_doc_and_media_type")
+      add_index :ragdoll_unified_contents, [:document_id, :original_media_type],
+        name: "index_unified_contents_on_doc_and_media_type",
+        comment: "Index for finding content by document and media type"
+    end
+    # Full-text search index
+    unless connection.execute("SELECT 1 FROM pg_indexes WHERE indexname = 'index_ragdoll_unified_contents_on_fulltext_search'").any?
+      execute <<-SQL
+        CREATE INDEX index_ragdoll_unified_contents_on_fulltext_search
+        ON ragdoll_unified_contents
+        USING gin(to_tsvector('english', COALESCE(content, '')))
+      SQL
+    end
+  end
+end

data/lib/ragdoll/core/version.rb CHANGED Viewed

@@ -3,6 +3,6 @@
 module Ragdoll
   module Core
-    VERSION = "0.1.11"
+    VERSION = "0.1.12"
   end
 end

data/lib/ragdoll/core.rb CHANGED Viewed

@@ -25,6 +25,8 @@ require_relative "core/shrine_config"
 # Require models from app/models/ragdoll
 require "ragdoll/document"
+require "ragdoll/unified_document"
+require "ragdoll/unified_content"
 require "ragdoll/embedding"
 require "ragdoll/content"
 require "ragdoll/text_content"
@@ -34,11 +36,16 @@ require "ragdoll/search"
 require "ragdoll/search_result"
 require "ragdoll/document_processor"
 require "ragdoll/document_management"
+require "ragdoll/unified_document_management"
+require "ragdoll/document_converter"
+require "ragdoll/migration_service"
 require "ragdoll/text_chunker"
 require "ragdoll/embedding_service"
 require "ragdoll/text_generation_service"
 require "ragdoll/search_engine"
 require "ragdoll/image_description_service"
+require "ragdoll/image_to_text_service"
+require "ragdoll/text_extraction_service"
 require "ragdoll/metadata_generator"
 # Require from app/lib/ragdoll
 require "ragdoll/metadata_schemas"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ragdoll
 version: !ruby/object:Gem::Version
-  version: 0.1.11
+  version: 0.1.12
 platform: ruby
 authors:
 - Dewayne VanHoozer
@@ -396,22 +396,31 @@ files:
 - app/models/ragdoll/search.rb
 - app/models/ragdoll/search_result.rb
 - app/models/ragdoll/text_content.rb
+- app/models/ragdoll/unified_content.rb
+- app/models/ragdoll/unified_document.rb
+- app/services/ragdoll/audio_to_text_service.rb
 - app/services/ragdoll/configuration_service.rb
+- app/services/ragdoll/document_converter.rb
 - app/services/ragdoll/document_management.rb
 - app/services/ragdoll/document_processor.rb
 - app/services/ragdoll/embedding_service.rb
 - app/services/ragdoll/image_description_service.rb
+- app/services/ragdoll/image_to_text_service.rb
 - app/services/ragdoll/metadata_generator.rb
+- app/services/ragdoll/migration_service.rb
 - app/services/ragdoll/model_resolver.rb
 - app/services/ragdoll/search_engine.rb
 - app/services/ragdoll/text_chunker.rb
+- app/services/ragdoll/text_extraction_service.rb
 - app/services/ragdoll/text_generation_service.rb
+- app/services/ragdoll/unified_document_management.rb
 - db/migrate/20250815234901_enable_postgresql_extensions.rb
 - db/migrate/20250815234902_create_ragdoll_documents.rb
 - db/migrate/20250815234903_create_ragdoll_embeddings.rb
 - db/migrate/20250815234904_create_ragdoll_contents.rb
 - db/migrate/20250815234905_create_ragdoll_searches.rb
 - db/migrate/20250815234906_create_ragdoll_search_results.rb
+- db/migrate/20250923000001_create_ragdoll_unified_contents.rb
 - lib/ragdoll-core.rb
 - lib/ragdoll.rb
 - lib/ragdoll/core.rb
@@ -447,7 +456,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.7.1
+rubygems_version: 3.7.2
 specification_version: 4
 summary: Multi-Modal Retrieval Augmented Generation
 test_files: []