RubyGems - ragdoll - Versions diffs - 0.1.10 → 0.1.12 - Mend

ragdoll 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +22 -0
data/README.md +326 -351
data/app/models/ragdoll/document.rb +1 -1
data/app/models/ragdoll/search.rb +1 -1
data/app/models/ragdoll/unified_content.rb +216 -0
data/app/models/ragdoll/unified_document.rb +338 -0
data/app/services/ragdoll/audio_to_text_service.rb +200 -0
data/app/services/ragdoll/document_converter.rb +216 -0
data/app/services/ragdoll/document_management.rb +117 -9
data/app/services/ragdoll/document_processor.rb +213 -311
data/app/services/ragdoll/image_to_text_service.rb +322 -0
data/app/services/ragdoll/migration_service.rb +340 -0
data/app/services/ragdoll/text_extraction_service.rb +422 -0
data/app/services/ragdoll/unified_document_management.rb +300 -0
data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
data/lib/ragdoll/core/client.rb +2 -2
data/lib/ragdoll/core/version.rb +1 -1
data/lib/ragdoll/core.rb +7 -0
metadata +11 -2

data/app/services/ragdoll/document_processor.rb CHANGED Viewed

@@ -5,7 +5,6 @@ require "docx"
 require "rmagick"
 require "yaml"
 require "date"
-# Image description service is auto-loaded from app/services
 module Ragdoll
   class DocumentProcessor
@@ -34,7 +33,7 @@ module Ragdoll
         location: File.expand_path(file_path),
         title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
         content: parsed[:content],
-        document_type: determine_document_type(file_path),
+        document_type: parsed[:document_type] || determine_document_type(file_path),
         metadata: parsed[:metadata] || {},
         status: "processed",
         file_modified_at: file_modified_at,
@@ -85,288 +84,44 @@ module Ragdoll
     end
     def parse
-      case @file_extension
-      when ".pdf"
-        parse_pdf
-      when ".docx"
-        parse_docx
-      when ".txt", ".md", ".markdown"
-        parse_text
-      when ".html", ".htm"
-        parse_html
-      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
-        parse_image
-      else
-        parse_text # Default to text parsing for unknown formats
+      # Check if file exists first
+      unless File.exist?(@file_path)
+        raise ParseError, "File does not exist: #{@file_path}"
       end
-    rescue StandardError => e # StandardError => e
-      raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
-    end
-    private
-    def parse_pdf
-      content = ""
-      metadata = {}
+      # Use the new unified document converter
+      document_type = determine_document_type(@file_path)
       begin
-        PDF::Reader.open(@file_path) do |reader|
-          # Extract metadata
-          if reader.info
-            metadata[:title] = reader.info[:Title] if reader.info[:Title]
-            metadata[:author] = reader.info[:Author] if reader.info[:Author]
-            metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
-            metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
-            metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
-            metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
-            metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
-          end
-          metadata[:page_count] = reader.page_count
-          # Extract text from all pages
-          reader.pages.each_with_index do |page, index|
-            page_text = page.text.strip
-            next if page_text.empty?
-            content += "\n\n--- Page #{index + 1} ---\n\n" if content.length.positive?
-            content += page_text
-          end
-        end
-      rescue PDF::Reader::MalformedPDFError => e
-        raise ParseError, "Malformed PDF: #{e.message}"
-      rescue PDF::Reader::UnsupportedFeatureError => e
-        raise ParseError, "Unsupported PDF feature: #{e.message}"
-      end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
-      end
-      {
-        content: content.strip,
-        metadata: metadata,
-        document_type: "pdf"
-      }
-    end
-    def parse_docx
-      content = ""
-      metadata = {}
-      begin
-        doc = Docx::Document.open(@file_path)
-        # Extract core properties
-        if doc.core_properties
-          metadata[:title] = doc.core_properties.title if doc.core_properties.title
-          metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
-          metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
-          metadata[:description] = doc.core_properties.description if doc.core_properties.description
-          metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
-          metadata[:created] = doc.core_properties.created if doc.core_properties.created
-          metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
-          if doc.core_properties.last_modified_by
-            metadata[:last_modified_by] =
-              doc.core_properties.last_modified_by
-          end
-        end
-        # Extract text from paragraphs
-        doc.paragraphs.each do |paragraph|
-          paragraph_text = paragraph.text.strip
-          next if paragraph_text.empty?
-          content += "#{paragraph_text}\n\n"
-        end
-        # Extract text from tables
-        doc.tables.each_with_index do |table, table_index|
-          content += "\n--- Table #{table_index + 1} ---\n\n"
-          table.rows.each do |row|
-            row_text = row.cells.map(&:text).join(" | ")
-            content += "#{row_text}\n" unless row_text.strip.empty?
-          end
-          content += "\n"
-        end
-        metadata[:paragraph_count] = doc.paragraphs.count
-        metadata[:table_count] = doc.tables.count
-      rescue StandardError => e # StandardError => e
-        raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
-      end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
-      end
-      {
-        content: content.strip,
-        metadata: metadata,
-        document_type: "docx"
-      }
-    end
-    def parse_text
-      content = File.read(@file_path, encoding: "UTF-8")
-      metadata = {
-        file_size: File.size(@file_path),
-        encoding: "UTF-8"
-      }
+        # Convert to text using the unified pipeline
+        text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)
-      document_type = case @file_extension
-                      when ".md", ".markdown" then "markdown"
-                      when ".txt" then "text"
-                      else "text"
-                      end
-      # Parse YAML front matter for markdown files
-      if document_type == "markdown" && content.start_with?("---\n")
-        front_matter, body_content = parse_yaml_front_matter(content)
-        if front_matter
-          metadata.merge!(front_matter)
-          content = body_content
-        end
-      end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
-      end
-      {
-        content: content,
-        metadata: metadata,
-        document_type: document_type
-      }
-    rescue Encoding::InvalidByteSequenceError
-      # Try with different encoding
-      content = File.read(@file_path, encoding: "ISO-8859-1")
-      metadata = {
-        file_size: File.size(@file_path),
-        encoding: "ISO-8859-1"
-      }
+        # Extract metadata based on document type
+        metadata = extract_metadata_for_type(document_type)
-      # Try to parse front matter with different encoding too
-      if document_type == "markdown" && content.start_with?("---\n")
-        front_matter, body_content = parse_yaml_front_matter(content)
-        if front_matter
-          metadata.merge!(front_matter)
-          content = body_content
+        # Add encoding information for text files
+        if %w[text markdown html].include?(document_type)
+          encoding = detect_file_encoding(@file_path) || "UTF-8"
+          metadata[:encoding] = encoding
         end
-      end
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
+        # Get title from metadata or filename
+        title = metadata[:title] || extract_title_from_filepath
+        {
+          content: text_content,
+          metadata: metadata,
+          title: title,
+          document_type: document_type
+        }
+      rescue StandardError => e
+        raise ParseError, "Failed to parse document: #{e.message}"
       end
-      {
-        content: content,
-        metadata: metadata,
-        document_type: document_type.nil? ? "text" : document_type
-      }
-    end
-    def parse_html
-      content = File.read(@file_path, encoding: "UTF-8")
-      # Extract title from H1 tag if present
-      h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
-      title = nil
-      if h1_match
-        # Clean up the H1 content by removing any HTML tags and normalizing whitespace
-        title = h1_match[1]
-                  .gsub(/<[^>]+>/, " ")  # Remove any nested HTML tags
-                  .gsub(/\s+/, " ")      # Normalize whitespace
-                  .strip
-      end
-      # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
-      clean_content = content
-                      .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
-                      .gsub(%r{<style[^>]*>.*?</style>}mi, "")   # Remove style tags
-                      .gsub(/<[^>]+>/, " ")                     # Remove all HTML tags
-                      .gsub(/\s+/, " ")                         # Normalize whitespace
-                      .strip
-      metadata = {
-        file_size: File.size(@file_path),
-        original_format: "html"
-      }
-      # Add title to metadata if found, otherwise use filepath fallback
-      if title && !title.empty?
-        metadata[:title] = title
-      else
-        metadata[:title] = extract_title_from_filepath
-      end
-      {
-        content: clean_content,
-        metadata: metadata,
-        document_type: "html"
-      }
-    end
-    def parse_image
-      puts "🖼️  DocumentProcessor: Starting image parsing for #{@file_path}"
-      metadata = {
-        file_size: File.size(@file_path),
-        file_type: @file_extension.sub(".", ""),
-        original_filename: File.basename(@file_path)
-      }
-      # Extract image dimensions
-      begin
-        img = Magick::Image.read(@file_path).first
-        metadata[:width]  = img.columns
-        metadata[:height] = img.rows
-        puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
-      rescue StandardError => e # StandardError
-        puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
-        metadata[:width]  = nil
-        metadata[:height] = nil
-      end
-      puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
-      desc = Ragdoll::ImageDescriptionService.new.generate_description(@file_path)
-      puts "📝 DocumentProcessor: Received description: '#{desc}'"
-      metadata[:description] = desc if desc && !desc.empty?
-      # Use AI-generated description or fallback placeholder
-      content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
-      # Add filepath-based title as fallback
-      metadata[:title] = extract_title_from_filepath
-      puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
-      {
-        content: content,
-        metadata: metadata,
-        document_type: "image"
-      }
     end
     # Helper methods for document type determination
     def self.determine_document_type(file_path)
-      case File.extname(file_path).downcase
-      when ".pdf" then "pdf"
-      when ".docx" then "docx"
-      when ".txt" then "text"
-      when ".md", ".markdown" then "markdown"
-      when ".html", ".htm" then "html"
-      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
-      else "text"
-      end
+      Ragdoll::DocumentConverter.new.determine_document_type(file_path)
     end
     def self.determine_document_type_from_content_type(content_type)
@@ -377,6 +132,8 @@ module Ragdoll
       when "text/markdown" then "markdown"
       when "text/html" then "html"
       when %r{^image/} then "image"
+      when %r{^audio/} then "audio"
+      when %r{^video/} then "video"
       else "text"
       end
     end
@@ -396,70 +153,215 @@ module Ragdoll
       when ".svg" then "image/svg+xml"
       when ".ico" then "image/x-icon"
       when ".tiff", ".tif" then "image/tiff"
+      when ".mp3" then "audio/mpeg"
+      when ".wav" then "audio/wav"
+      when ".m4a" then "audio/mp4"
+      when ".flac" then "audio/flac"
+      when ".ogg" then "audio/ogg"
+      when ".mp4" then "video/mp4"
+      when ".mov" then "video/quicktime"
+      when ".avi" then "video/x-msvideo"
+      when ".webm" then "video/webm"
       else "application/octet-stream"
       end
     end
     private
+    # Instance-level helper: asks a fresh Ragdoll::DocumentConverter to
+    # classify file_path and returns whatever type it reports.
+    def determine_document_type(file_path)
+      converter = Ragdoll::DocumentConverter.new
+      converter.determine_document_type(file_path)
+    end
+    # Build the metadata hash for the current file: the basic file attributes
+    # merged with whatever the extractor for document_type contributes.
+    # Types without a dedicated extractor get the basic attributes only.
+    def extract_metadata_for_type(document_type)
+      metadata = basic_file_metadata
+      type_specific =
+        case document_type
+        when "pdf"   then extract_pdf_metadata
+        when "docx"  then extract_docx_metadata
+        when "image" then extract_image_metadata
+        when "audio" then extract_audio_metadata
+        when "video" then extract_video_metadata
+        else {}
+        end
+      metadata.merge!(type_specific)
+    end
+    # Attributes recorded for every file: the base name and lower-cased
+    # extension always; size, content hash and modification time only when
+    # the file exists on disk.
+    def basic_file_metadata
+      on_disk = {}
+      if File.exist?(@file_path)
+        on_disk = {
+          file_size: File.size(@file_path),
+          file_hash: calculate_file_hash(@file_path),
+          file_modified_at: File.mtime(@file_path)
+        }
+      end
+      on_disk.merge(
+        original_filename: File.basename(@file_path),
+        file_extension: File.extname(@file_path).downcase
+      )
+    end
+    # Read the PDF info entries and page count via PDF::Reader. Info entries
+    # are stored under pdf_-prefixed keys; a non-blank PDF title is also
+    # exposed as :title.
+    #
+    # Returns {} when the file does not exist or when extraction raises (a
+    # warning is printed in the latter case).
+    def extract_pdf_metadata
+      return {} unless File.exist?(@file_path)
+      begin
+        metadata = {}
+        PDF::Reader.open(@file_path) do |reader|
+          info = reader.info
+          if info
+            {
+              Title: :pdf_title,
+              Author: :pdf_author,
+              Subject: :pdf_subject,
+              Creator: :pdf_creator,
+              Producer: :pdf_producer,
+              CreationDate: :pdf_creation_date,
+              ModDate: :pdf_modification_date
+            }.each do |info_key, meta_key|
+              metadata[meta_key] = info[info_key] if info[info_key]
+            end
+          end
+          metadata[:page_count] = reader.page_count
+        end
+        # Use PDF title as main title if available. A blank title is skipped:
+        # an empty string is truthy and would otherwise win over a caller's
+        # `metadata[:title] || <fallback>` expression.
+        pdf_title = metadata[:pdf_title]
+        metadata[:title] = pdf_title unless pdf_title.to_s.strip.empty?
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract PDF metadata: #{e.message}"
+        {}
+      end
+    end
+    # Read the DOCX core properties plus paragraph and table counts. Core
+    # properties are stored under docx_-prefixed keys; a non-blank document
+    # title is also exposed as :title.
+    #
+    # Returns {} when the file does not exist or when extraction raises (a
+    # warning is printed in the latter case).
+    def extract_docx_metadata
+      return {} unless File.exist?(@file_path)
+      begin
+        metadata = {}
+        # NOTE(review): the document is opened without a block; confirm
+        # whether it needs an explicit close to release the underlying file.
+        doc = Docx::Document.open(@file_path)
+        props = doc.core_properties
+        if props
+          {
+            docx_title: :title,
+            docx_author: :creator,
+            docx_subject: :subject,
+            docx_description: :description,
+            docx_keywords: :keywords,
+            docx_created: :created,
+            docx_modified: :modified,
+            docx_last_modified_by: :last_modified_by
+          }.each do |meta_key, property|
+            value = props.public_send(property)
+            metadata[meta_key] = value if value
+          end
+        end
+        metadata[:paragraph_count] = doc.paragraphs.count
+        metadata[:table_count] = doc.tables.count
+        # Use DOCX title as main title if available. A blank title is skipped:
+        # an empty string is truthy and would otherwise win over a caller's
+        # `metadata[:title] || <fallback>` expression.
+        docx_title = metadata[:docx_title]
+        metadata[:title] = docx_title unless docx_title.to_s.strip.empty?
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract DOCX metadata: #{e.message}"
+        {}
+      end
+    end
+    # Read dimensions, format, MIME type and color count from the first image
+    # returned by Magick::Image.read. Returns {} when the file does not exist
+    # or when reading raises (a warning is printed in the latter case).
+    def extract_image_metadata
+      return {} unless File.exist?(@file_path)
+      begin
+        image = Magick::Image.read(@file_path).first
+        {
+          width: image.columns,
+          height: image.rows,
+          image_format: image.format,
+          mime_type: image.mime_type,
+          number_colors: image.number_colors
+        }
+      rescue StandardError => e
+        puts "Warning: Failed to extract image metadata: #{e.message}"
+        {}
+      end
+    end
+    # Basic audio file metadata: a fixed media type plus the file extension,
+    # lower-cased and without its leading dot, as file_type. Lower-casing
+    # keeps "song.MP3" and "song.mp3" under the same file_type.
+    # In production, you might use audio analysis libraries
+    def extract_audio_metadata
+      {
+        media_type: "audio",
+        file_type: File.extname(@file_path).downcase.sub(".", "")
+      }
+    end
+    # Basic video file metadata: a fixed media type plus the file extension,
+    # lower-cased and without its leading dot, as file_type. Lower-casing
+    # keeps "clip.MOV" and "clip.mov" under the same file_type.
+    # In production, you might use video analysis libraries
+    def extract_video_metadata
+      {
+        media_type: "video",
+        file_type: File.extname(@file_path).downcase.sub(".", "")
+      }
+    end
     # Extract a meaningful title from the file path as a fallback
-    # @param file_path [String] the full file path
-    # @return [String] a cleaned title derived from the filename
     def extract_title_from_filepath(file_path = @file_path)
       filename = File.basename(file_path, File.extname(file_path))
       # Clean up common patterns in filenames to make them more readable
       title = filename
                .gsub(/[-_]+/, ' ')           # Replace hyphens and underscores with spaces
                .gsub(/([a-z])([A-Z])/, '\1 \2') # Add space before capital letters (camelCase)
                .gsub(/\s+/, ' ')             # Normalize multiple spaces
                .strip
       # Capitalize words for better readability
       title.split(' ').map(&:capitalize).join(' ')
     end
-    # Parse YAML front matter from markdown content
-    # @param content [String] the full content of the markdown file
-    # @return [Array] returns [front_matter_hash, body_content] or [nil, original_content]
-    def parse_yaml_front_matter(content)
-      # Check if content starts with YAML front matter delimiter
-      return [nil, content] unless content.start_with?("---\n")
-      # Find the closing delimiter
-      lines = content.lines
-      closing_index = nil
-      lines.each_with_index do |line, index|
-        next if index == 0 # Skip the opening ---
-        if line.strip == "---"
-          closing_index = index
-          break
-        end
-      end
+    # SHA256 hex digest of the file at file_path, for duplicate detection.
+    # On any StandardError a warning is emitted (through Rails.logger when
+    # Rails is defined, and always through puts) and nil is returned.
+    def calculate_file_hash(file_path)
+      require 'digest'
+      digest = Digest::SHA256.file(file_path)
+      digest.hexdigest
+    rescue StandardError => error
+      detail = "Failed to calculate file hash for #{file_path}: #{error.message}"
+      Rails.logger.warn detail if defined?(Rails)
+      puts "Warning: #{detail}"
+      nil
+    end
-      # No closing delimiter found
-      return [nil, content] unless closing_index
+    # SHA256 hex digest of a content string, for duplicate detection.
+    # On any StandardError a warning is emitted (through Rails.logger when
+    # Rails is defined, and always through puts) and nil is returned.
+    def calculate_content_hash(content)
+      require 'digest'
+      Digest::SHA256.new.update(content).hexdigest
+    rescue StandardError => error
+      detail = "Failed to calculate content hash: #{error.message}"
+      Rails.logger.warn detail if defined?(Rails)
+      puts "Warning: #{detail}"
+      nil
+    end
-      # Extract YAML content and body
-      yaml_lines = lines[1...closing_index]
-      body_lines = lines[(closing_index + 1)..-1]
+    # Detect file encoding for text files.
+    #
+    # Inspects up to the first 1000 bytes of the file and returns the name of
+    # the first encoding under which those bytes are valid: 'UTF-8', then
+    # 'ISO-8859-1', then 'Windows-1252'. Returns nil when the file does not
+    # exist, and 'UTF-8' for an empty file or on any error.
+    def detect_file_encoding(file_path)
+      return nil unless File.exist?(file_path)
-      yaml_content = yaml_lines.join
-      body_content = body_lines&.join || ""
+      # Read a raw byte sample to detect encoding
+      sample = File.read(file_path, 1000, encoding: 'ASCII-8BIT')
+      return 'UTF-8' if sample.nil? || sample.empty?
-      # Parse YAML
-      begin
-        # Allow Time objects for date fields in YAML front matter
-        front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
-        # Convert string keys to symbols for consistency
-        front_matter = front_matter.transform_keys(&:to_sym) if front_matter.is_a?(Hash)
-        [front_matter, body_content.strip]
-      rescue YAML::SyntaxError, Psych::DisallowedClass => e
-        # If YAML parsing fails, return original content
-        Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
-        [nil, content]
+      # The bytes must be judged as UTF-8 directly: binary data is always
+      # "valid", and transcoding with replacement always succeeds. A full-size
+      # sample may stop in the middle of a multi-byte character, so up to 3
+      # trailing bytes may be ignored.
+      max_trim = sample.bytesize < 1000 ? 0 : 3
+      utf8 = (0..max_trim).any? do |trim|
+        sample.byteslice(0, sample.bytesize - trim).force_encoding('UTF-8').valid_encoding?
       end
+      return 'UTF-8' if utf8
+      # Try common single-byte encodings on copies so the sample is not retagged
+      fallback = %w[ISO-8859-1 Windows-1252].find do |encoding|
+        sample.dup.force_encoding(encoding).valid_encoding?
+      end
+      fallback || 'UTF-8' # Default fallback
+    rescue StandardError
+      'UTF-8'
     end
   end
 end