RubyGems - ragdoll - Versions diffs - 0.1.11 → 0.1.12 - Mend

ragdoll 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/README.md +323 -384
data/app/models/ragdoll/document.rb +1 -1
data/app/models/ragdoll/unified_content.rb +216 -0
data/app/models/ragdoll/unified_document.rb +338 -0
data/app/services/ragdoll/audio_to_text_service.rb +200 -0
data/app/services/ragdoll/document_converter.rb +216 -0
data/app/services/ragdoll/document_processor.rb +197 -331
data/app/services/ragdoll/image_to_text_service.rb +322 -0
data/app/services/ragdoll/migration_service.rb +340 -0
data/app/services/ragdoll/text_extraction_service.rb +422 -0
data/app/services/ragdoll/unified_document_management.rb +300 -0
data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
data/lib/ragdoll/core/version.rb +1 -1
data/lib/ragdoll/core.rb +7 -0
metadata +11 -2

data/app/services/ragdoll/document_processor.rb CHANGED

@@ -5,7 +5,6 @@ require "docx"
 require "rmagick"
 require "yaml"
 require "date"
-# Image description service is auto-loaded from app/services
 module Ragdoll
   class DocumentProcessor
@@ -34,7 +33,7 @@ module Ragdoll
         location: File.expand_path(file_path),
         title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
         content: parsed[:content],
-        document_type: determine_document_type(file_path),
+        document_type: parsed[:document_type] || determine_document_type(file_path),
         metadata: parsed[:metadata] || {},
         status: "processed",
         file_modified_at: file_modified_at,
@@ -85,399 +84,237 @@ module Ragdoll
     end
     def parse
-      case @file_extension
-      when ".pdf"
-        parse_pdf
-      when ".docx"
-        parse_docx
-      when ".txt", ".md", ".markdown"
-        parse_text
-      when ".html", ".htm"
-        parse_html
-      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
-        parse_image
-      else
-        parse_text # Default to text parsing for unknown formats
+      # Check if file exists first
+      unless File.exist?(@file_path)
+        raise ParseError, "File does not exist: #{@file_path}"
       end
-    end
-    private
+      # Use the new unified document converter
+      document_type = determine_document_type(@file_path)
-    def parse_pdf
-      content = ""
-      metadata = {}
+      begin
+        # Convert to text using the unified pipeline
+        text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)
-      # Add file-based metadata for duplicate detection
-      if File.exist?(@file_path)
-        metadata[:file_size] = File.size(@file_path)
-        metadata[:file_hash] = calculate_file_hash(@file_path)
-      end
+        # Extract metadata based on document type
+        metadata = extract_metadata_for_type(document_type)
-      begin
-        PDF::Reader.open(@file_path) do |reader|
-          # Extract metadata
-          if reader.info
-            metadata[:title] = reader.info[:Title] if reader.info[:Title]
-            metadata[:author] = reader.info[:Author] if reader.info[:Author]
-            metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
-            metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
-            metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
-            metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
-            metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
-          end
+        # Add encoding information for text files
+        if %w[text markdown html].include?(document_type)
+          encoding = detect_file_encoding(@file_path) || "UTF-8"
+          metadata[:encoding] = encoding
+        end
-          metadata[:page_count] = reader.page_count
+        # Get title from metadata or filename
+        title = metadata[:title] || extract_title_from_filepath
+        {
+          content: text_content,
+          metadata: metadata,
+          title: title,
+          document_type: document_type
+        }
+      rescue StandardError => e
+        raise ParseError, "Failed to parse document: #{e.message}"
+      end
+    end
-          # Extract text from all pages
-          reader.pages.each_with_index do |page, index|
-            page_text = page.text.strip
-            next if page_text.empty?
+    # Helper methods for document type determination
+    def self.determine_document_type(file_path)
+      Ragdoll::DocumentConverter.new.determine_document_type(file_path)
+    end
-            content += "\n\n--- Page #{index + 1} ---\n\n" if content.length.positive?
-            content += page_text
-          end
-        end
-      rescue PDF::Reader::MalformedPDFError => e
-        raise ParseError, "Malformed PDF: #{e.message}"
-      rescue PDF::Reader::UnsupportedFeatureError => e
-        raise ParseError, "Unsupported PDF feature: #{e.message}"
+    def self.determine_document_type_from_content_type(content_type)
+      case content_type
+      when "application/pdf" then "pdf"
+      when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
+      when "text/plain" then "text"
+      when "text/markdown" then "markdown"
+      when "text/html" then "html"
+      when %r{^image/} then "image"
+      when %r{^audio/} then "audio"
+      when %r{^video/} then "video"
+      else "text"
       end
+    end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
+    def self.determine_content_type(file_path)
+      case File.extname(file_path).downcase
+      when ".pdf" then "application/pdf"
+      when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+      when ".txt" then "text/plain"
+      when ".md", ".markdown" then "text/markdown"
+      when ".html", ".htm" then "text/html"
+      when ".jpg", ".jpeg" then "image/jpeg"
+      when ".png" then "image/png"
+      when ".gif" then "image/gif"
+      when ".webp" then "image/webp"
+      when ".bmp" then "image/bmp"
+      when ".svg" then "image/svg+xml"
+      when ".ico" then "image/x-icon"
+      when ".tiff", ".tif" then "image/tiff"
+      when ".mp3" then "audio/mpeg"
+      when ".wav" then "audio/wav"
+      when ".m4a" then "audio/mp4"
+      when ".flac" then "audio/flac"
+      when ".ogg" then "audio/ogg"
+      when ".mp4" then "video/mp4"
+      when ".mov" then "video/quicktime"
+      when ".avi" then "video/x-msvideo"
+      when ".webm" then "video/webm"
+      else "application/octet-stream"
       end
+    end
-      # Add content hash for duplicate detection
-      # Ensure content is UTF-8 encoded before checking presence
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
+    private
-      {
-        content: content.strip,
-        metadata: metadata,
-        document_type: "pdf"
-      }
+    def determine_document_type(file_path)
+      Ragdoll::DocumentConverter.new.determine_document_type(file_path)
+    end
+    def extract_metadata_for_type(document_type)
+      metadata = basic_file_metadata
+      case document_type
+      when "pdf"
+        metadata.merge!(extract_pdf_metadata)
+      when "docx"
+        metadata.merge!(extract_docx_metadata)
+      when "image"
+        metadata.merge!(extract_image_metadata)
+      when "audio"
+        metadata.merge!(extract_audio_metadata)
+      when "video"
+        metadata.merge!(extract_video_metadata)
+      end
+      metadata
     end
-    def parse_docx
-      content = ""
+    def basic_file_metadata
       metadata = {}
-      # Add file-based metadata for duplicate detection
       if File.exist?(@file_path)
         metadata[:file_size] = File.size(@file_path)
         metadata[:file_hash] = calculate_file_hash(@file_path)
+        metadata[:file_modified_at] = File.mtime(@file_path)
       end
-      begin
-        doc = Docx::Document.open(@file_path)
-        # Extract core properties
-        if doc.core_properties
-          metadata[:title] = doc.core_properties.title if doc.core_properties.title
-          metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
-          metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
-          metadata[:description] = doc.core_properties.description if doc.core_properties.description
-          metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
-          metadata[:created] = doc.core_properties.created if doc.core_properties.created
-          metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
-          if doc.core_properties.last_modified_by
-            metadata[:last_modified_by] =
-              doc.core_properties.last_modified_by
-          end
-        end
-        # Extract text from paragraphs
-        doc.paragraphs.each do |paragraph|
-          paragraph_text = paragraph.text.strip
-          next if paragraph_text.empty?
-          content += "#{paragraph_text}\n\n"
-        end
+      metadata[:original_filename] = File.basename(@file_path)
+      metadata[:file_extension] = File.extname(@file_path).downcase
+      metadata
+    end
-        # Extract text from tables
-        doc.tables.each_with_index do |table, table_index|
-          content += "\n--- Table #{table_index + 1} ---\n\n"
+    def extract_pdf_metadata
+      return {} unless File.exist?(@file_path)
-          table.rows.each do |row|
-            row_text = row.cells.map(&:text).join(" | ")
-            content += "#{row_text}\n" unless row_text.strip.empty?
+      begin
+        metadata = {}
+        PDF::Reader.open(@file_path) do |reader|
+          if reader.info
+            metadata[:pdf_title] = reader.info[:Title] if reader.info[:Title]
+            metadata[:pdf_author] = reader.info[:Author] if reader.info[:Author]
+            metadata[:pdf_subject] = reader.info[:Subject] if reader.info[:Subject]
+            metadata[:pdf_creator] = reader.info[:Creator] if reader.info[:Creator]
+            metadata[:pdf_producer] = reader.info[:Producer] if reader.info[:Producer]
+            metadata[:pdf_creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
+            metadata[:pdf_modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
           end
-          content += "\n"
+          metadata[:page_count] = reader.page_count
         end
-        metadata[:paragraph_count] = doc.paragraphs.count
-        metadata[:table_count] = doc.tables.count
-      rescue StandardError => e # StandardError => e
-        raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
-      end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
+        # Use PDF title as main title if available
+        metadata[:title] = metadata[:pdf_title] if metadata[:pdf_title]
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract PDF metadata: #{e.message}"
+        {}
       end
-      # Add content hash for duplicate detection
-      # Ensure content is UTF-8 encoded before checking presence
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
-      {
-        content: content.strip,
-        metadata: metadata,
-        document_type: "docx"
-      }
     end
-    def parse_text
-      # Determine document type first (before any IO operations)
-      document_type = case @file_extension
-                      when ".md", ".markdown" then "markdown"
-                      when ".txt" then "text"
-                      else "text"
-                      end
+    def extract_docx_metadata
+      return {} unless File.exist?(@file_path)
       begin
-        content = File.read(@file_path, encoding: "UTF-8")
-        encoding = "UTF-8"
-      rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
-        # Try with different encoding - read as ISO-8859-1 and force encoding to UTF-8
-        content = File.read(@file_path, encoding: "ISO-8859-1").encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
-        encoding = "ISO-8859-1"
-      rescue Errno::ENOENT, Errno::EACCES => e
-        raise ParseError, "Failed to read file #{@file_path}: #{e.message}"
-      end
-      metadata = {
-        file_size: File.size(@file_path),
-        file_hash: calculate_file_hash(@file_path),
-        encoding: encoding
-      }
+        metadata = {}
+        doc = Docx::Document.open(@file_path)
-      # Parse YAML front matter for markdown files
-      if document_type == "markdown" && content.start_with?("---\n")
-        front_matter, body_content = parse_yaml_front_matter(content)
-        if front_matter
-          metadata.merge!(front_matter)
-          content = body_content
+        if doc.core_properties
+          metadata[:docx_title] = doc.core_properties.title if doc.core_properties.title
+          metadata[:docx_author] = doc.core_properties.creator if doc.core_properties.creator
+          metadata[:docx_subject] = doc.core_properties.subject if doc.core_properties.subject
+          metadata[:docx_description] = doc.core_properties.description if doc.core_properties.description
+          metadata[:docx_keywords] = doc.core_properties.keywords if doc.core_properties.keywords
+          metadata[:docx_created] = doc.core_properties.created if doc.core_properties.created
+          metadata[:docx_modified] = doc.core_properties.modified if doc.core_properties.modified
+          metadata[:docx_last_modified_by] = doc.core_properties.last_modified_by if doc.core_properties.last_modified_by
         end
-      end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
-      end
-      # Add content hash for duplicate detection
-      # Ensure content is UTF-8 encoded before checking presence
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
-      {
-        content: content,
-        metadata: metadata,
-        document_type: document_type
-      }
-    end
-    def parse_html
-      content = File.read(@file_path, encoding: "UTF-8")
-      # Extract title from H1 tag if present
-      h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
-      title = nil
-      if h1_match
-        # Clean up the H1 content by removing any HTML tags and normalizing whitespace
-        title = h1_match[1]
-                  .gsub(/<[^>]+>/, " ")  # Remove any nested HTML tags
-                  .gsub(/\s+/, " ")      # Normalize whitespace
-                  .strip
-      end
-      # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
-      clean_content = content
-                      .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
-                      .gsub(%r{<style[^>]*>.*?</style>}mi, "")   # Remove style tags
-                      .gsub(/<[^>]+>/, " ")                     # Remove all HTML tags
-                      .gsub(/\s+/, " ")                         # Normalize whitespace
-                      .strip
-      metadata = {
-        file_size: File.size(@file_path),
-        file_hash: calculate_file_hash(@file_path),
-        original_format: "html"
-      }
+        metadata[:paragraph_count] = doc.paragraphs.count
+        metadata[:table_count] = doc.tables.count
-      # Add title to metadata if found, otherwise use filepath fallback
-      if title && !title.empty?
-        metadata[:title] = title
-      else
-        metadata[:title] = extract_title_from_filepath
+        # Use DOCX title as main title if available
+        metadata[:title] = metadata[:docx_title] if metadata[:docx_title]
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract DOCX metadata: #{e.message}"
+        {}
       end
-      # Add content hash for duplicate detection
-      metadata[:content_hash] = calculate_content_hash(clean_content) if clean_content.present?
-      {
-        content: clean_content,
-        metadata: metadata,
-        document_type: "html"
-      }
     end
-    def parse_image
-      puts "🖼️  DocumentProcessor: Starting image parsing for #{@file_path}"
+    def extract_image_metadata
+      return {} unless File.exist?(@file_path)
-      metadata = {
-        file_size: File.size(@file_path),
-        file_hash: calculate_file_hash(@file_path),
-        file_type: @file_extension.sub(".", ""),
-        original_filename: File.basename(@file_path)
-      }
-      # Extract image dimensions
       begin
+        metadata = {}
         img = Magick::Image.read(@file_path).first
-        metadata[:width]  = img.columns
+        metadata[:width] = img.columns
         metadata[:height] = img.rows
-        puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
-      rescue StandardError => e # StandardError
-        puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
-        metadata[:width]  = nil
-        metadata[:height] = nil
+        metadata[:image_format] = img.format
+        metadata[:mime_type] = img.mime_type
+        metadata[:number_colors] = img.number_colors
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract image metadata: #{e.message}"
+        {}
       end
+    end
-      puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
-      desc = Ragdoll::ImageDescriptionService.new.generate_description(@file_path)
-      puts "📝 DocumentProcessor: Received description: '#{desc}'"
-      metadata[:description] = desc if desc && !desc.empty?
-      # Use AI-generated description or fallback placeholder
-      content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
-      # Add filepath-based title as fallback
-      metadata[:title] = extract_title_from_filepath
-      # Add content hash for duplicate detection
-      # Ensure content is UTF-8 encoded before checking presence
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
-      puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
+    def extract_audio_metadata
+      # Basic audio file metadata
+      # In production, you might use audio analysis libraries
       {
-        content: content,
-        metadata: metadata,
-        document_type: "image"
+        media_type: "audio",
+        file_type: File.extname(@file_path).sub(".", "")
       }
     end
-    # Helper methods for document type determination
-    def self.determine_document_type(file_path)
-      case File.extname(file_path).downcase
-      when ".pdf" then "pdf"
-      when ".docx" then "docx"
-      when ".txt" then "text"
-      when ".md", ".markdown" then "markdown"
-      when ".html", ".htm" then "html"
-      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
-      else "text"
-      end
-    end
-    def self.determine_document_type_from_content_type(content_type)
-      case content_type
-      when "application/pdf" then "pdf"
-      when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
-      when "text/plain" then "text"
-      when "text/markdown" then "markdown"
-      when "text/html" then "html"
-      when %r{^image/} then "image"
-      else "text"
-      end
-    end
-    def self.determine_content_type(file_path)
-      case File.extname(file_path).downcase
-      when ".pdf" then "application/pdf"
-      when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-      when ".txt" then "text/plain"
-      when ".md", ".markdown" then "text/markdown"
-      when ".html", ".htm" then "text/html"
-      when ".jpg", ".jpeg" then "image/jpeg"
-      when ".png" then "image/png"
-      when ".gif" then "image/gif"
-      when ".webp" then "image/webp"
-      when ".bmp" then "image/bmp"
-      when ".svg" then "image/svg+xml"
-      when ".ico" then "image/x-icon"
-      when ".tiff", ".tif" then "image/tiff"
-      else "application/octet-stream"
-      end
+    def extract_video_metadata
+      # Basic video file metadata
+      # In production, you might use video analysis libraries
+      {
+        media_type: "video",
+        file_type: File.extname(@file_path).sub(".", "")
+      }
     end
-    private
     # Extract a meaningful title from the file path as a fallback
-    # @param file_path [String] the full file path
-    # @return [String] a cleaned title derived from the filename
     def extract_title_from_filepath(file_path = @file_path)
       filename = File.basename(file_path, File.extname(file_path))
       # Clean up common patterns in filenames to make them more readable
       title = filename
                .gsub(/[-_]+/, ' ')           # Replace hyphens and underscores with spaces
                .gsub(/([a-z])([A-Z])/, '\1 \2') # Add space before capital letters (camelCase)
                .gsub(/\s+/, ' ')             # Normalize multiple spaces
                .strip
       # Capitalize words for better readability
       title.split(' ').map(&:capitalize).join(' ')
     end
-    # Parse YAML front matter from markdown content
-    # @param content [String] the full content of the markdown file
-    # @return [Array] returns [front_matter_hash, body_content] or [nil, original_content]
-    def parse_yaml_front_matter(content)
-      # Check if content starts with YAML front matter delimiter
-      return [nil, content] unless content.start_with?("---\n")
-      # Find the closing delimiter
-      lines = content.lines
-      closing_index = nil
-      lines.each_with_index do |line, index|
-        next if index == 0 # Skip the opening ---
-        if line.strip == "---"
-          closing_index = index
-          break
-        end
-      end
-      # No closing delimiter found
-      return [nil, content] unless closing_index
-      # Extract YAML content and body
-      yaml_lines = lines[1...closing_index]
-      body_lines = lines[(closing_index + 1)..-1]
-      yaml_content = yaml_lines.join
-      body_content = body_lines&.join || ""
-      # Parse YAML
-      begin
-        # Allow Time objects for date fields in YAML front matter
-        front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
-        # Convert string keys to symbols for consistency
-        front_matter = front_matter.transform_keys(&:to_sym) if front_matter.is_a?(Hash)
-        [front_matter, body_content.strip]
-      rescue YAML::SyntaxError, Psych::DisallowedClass => e
-        # If YAML parsing fails, return original content
-        Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
-        [nil, content]
-      end
-    end
     # Calculate SHA256 hash of file content for duplicate detection
     def calculate_file_hash(file_path)
       require 'digest'
@@ -497,5 +334,34 @@ module Ragdoll
       puts "Warning: Failed to calculate content hash: #{e.message}"
       nil
     end
+    # Detect file encoding for text files
+    def detect_file_encoding(file_path)
+      return nil unless File.exist?(file_path)
+      # Read a sample to detect encoding
+      sample = File.read(file_path, 1000, encoding: 'ASCII-8BIT')
+      # Check for common encodings
+      if sample.valid_encoding?
+        # Try to convert to UTF-8
+        utf8_content = sample.encode('UTF-8', invalid: :replace, undef: :replace)
+        return 'UTF-8' if utf8_content.valid_encoding?
+      end
+      # Try common encodings
+      ['UTF-8', 'ISO-8859-1', 'Windows-1252'].each do |encoding|
+        begin
+          test_content = sample.force_encoding(encoding)
+          return encoding if test_content.valid_encoding?
+        rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
+          next
+        end
+      end
+      'UTF-8' # Default fallback
+    rescue StandardError
+      'UTF-8'
+    end
   end
 end