RubyGems - universal_document_processor - Versions diffs - 1.0.0 - Mend

universal_document_processor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +7 -0
data/AI_USAGE_GUIDE.md +404 -0
data/CHANGELOG.md +67 -0
data/GEM_RELEASE_GUIDE.md +288 -0
data/Gemfile +27 -0
data/LICENSE +21 -0
data/README.md +726 -0
data/Rakefile +36 -0
data/lib/universal_document_processor/ai_agent.rb +491 -0
data/lib/universal_document_processor/document.rb +225 -0
data/lib/universal_document_processor/processors/archive_processor.rb +290 -0
data/lib/universal_document_processor/processors/base_processor.rb +58 -0
data/lib/universal_document_processor/processors/character_validator.rb +283 -0
data/lib/universal_document_processor/processors/excel_processor.rb +219 -0
data/lib/universal_document_processor/processors/image_processor.rb +172 -0
data/lib/universal_document_processor/processors/pdf_processor.rb +105 -0
data/lib/universal_document_processor/processors/powerpoint_processor.rb +214 -0
data/lib/universal_document_processor/processors/text_processor.rb +360 -0
data/lib/universal_document_processor/processors/word_processor.rb +137 -0
data/lib/universal_document_processor/utils/file_detector.rb +83 -0
data/lib/universal_document_processor/utils/japanese_filename_handler.rb +205 -0
data/lib/universal_document_processor/version.rb +3 -0
data/lib/universal_document_processor.rb +223 -0
metadata +198 -0

data/lib/universal_document_processor/processors/word_processor.rb ADDED Viewed

@@ -0,0 +1,137 @@
+module UniversalDocumentProcessor
+  module Processors
+    class WordProcessor < BaseProcessor
+      def extract_text
+        with_error_handling do
+          if @file_path.end_with?('.docx')
+            extract_docx_text
+          else
+            # Fallback for .doc files
+            fallback_text_extraction
+          end
+        end
+      end
+      def extract_metadata
+        with_error_handling do
+          if @file_path.end_with?('.docx')
+            extract_docx_metadata
+          else
+            super
+          end
+        end
+      end
+      def extract_images
+        with_error_handling do
+          return [] unless @file_path.end_with?('.docx')
+          images = []
+          doc = Docx::Document.open(@file_path)
+          # Extract embedded images
+          doc.doc_xml.xpath('//w:drawing//a:blip').each_with_index do |blip, index|
+            embed_id = blip['r:embed']
+            if embed_id
+              images << {
+                index: index + 1,
+                embed_id: embed_id,
+                type: 'embedded'
+              }
+            end
+          end
+          images
+        end
+      end
+      def extract_tables
+        with_error_handling do
+          return [] unless @file_path.end_with?('.docx')
+          tables = []
+          doc = Docx::Document.open(@file_path)
+          doc.tables.each_with_index do |table, table_index|
+            table_data = {
+              index: table_index + 1,
+              rows: table.rows.length,
+              columns: table.column_count,
+              content: []
+            }
+            table.rows.each do |row|
+              row_data = row.cells.map(&:text)
+              table_data[:content] << row_data
+            end
+            tables << table_data
+          end
+          tables
+        end
+      end
+      def supported_operations
+        super + [:extract_images, :extract_tables, :extract_styles, :extract_comments]
+      end
+      private
+      def extract_docx_text
+        doc = Docx::Document.open(@file_path)
+        text_content = []
+        # Extract paragraphs
+        doc.paragraphs.each do |paragraph|
+          text_content << paragraph.text unless paragraph.text.strip.empty?
+        end
+        # Extract table content
+        doc.tables.each do |table|
+          table.rows.each do |row|
+            row_text = row.cells.map(&:text).join(' | ')
+            text_content << row_text unless row_text.strip.empty?
+          end
+        end
+        text_content.join("\n")
+      end
+      def extract_docx_metadata
+        doc = Docx::Document.open(@file_path)
+        core_properties = doc.core_properties
+        super.merge({
+          title: core_properties.title,
+          author: core_properties.creator,
+          subject: core_properties.subject,
+          description: core_properties.description,
+          keywords: core_properties.keywords,
+          created_at: core_properties.created,
+          modified_at: core_properties.modified,
+          last_modified_by: core_properties.last_modified_by,
+          revision: core_properties.revision,
+          word_count: count_words(extract_docx_text),
+          paragraph_count: doc.paragraphs.length,
+          table_count: doc.tables.length
+        })
+      rescue => e
+        super
+      end
+      def count_words(text)
+        text.split(/\s+/).length
+      rescue
+        0
+      end
+      def fallback_text_extraction
+        # Use Yomu for .doc files or as fallback
+        Yomu.new(@file_path).text
+      rescue => e
+        "Unable to extract text from Word document: #{e.message}"
+      end
+    end
+  end
+end

data/lib/universal_document_processor/utils/file_detector.rb ADDED Viewed

@@ -0,0 +1,83 @@
+module UniversalDocumentProcessor
+  module Utils
+    class FileDetector
+      MIME_TYPE_MAPPINGS = {
+        'pdf' => 'application/pdf',
+        'doc' => 'application/msword',
+        'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+        'xls' => 'application/vnd.ms-excel',
+        'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        'ppt' => 'application/vnd.ms-powerpoint',
+        'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+        'txt' => 'text/plain',
+        'rtf' => 'application/rtf',
+        'html' => 'text/html',
+        'htm' => 'text/html',
+        'xml' => 'application/xml',
+        'csv' => 'text/csv',
+        'json' => 'application/json',
+        'jpg' => 'image/jpeg',
+        'jpeg' => 'image/jpeg',
+        'png' => 'image/png',
+        'gif' => 'image/gif',
+        'bmp' => 'image/bmp',
+        'tiff' => 'image/tiff',
+        'tif' => 'image/tiff',
+        'zip' => 'application/zip',
+        'rar' => 'application/x-rar-compressed',
+        '7z' => 'application/x-7z-compressed'
+      }.freeze
+      def self.detect(file_path)
+        # First try Marcel for accurate MIME detection
+        mime_type = Marcel::MimeType.for(Pathname.new(file_path))
+        return mime_type if mime_type && mime_type != 'application/octet-stream'
+        # Fallback to extension-based detection
+        extension = File.extname(file_path).downcase.gsub('.', '')
+        MIME_TYPE_MAPPINGS[extension] || 'application/octet-stream'
+      end
+      def self.supported?(file_path)
+        mime_type = detect(file_path)
+        supported_mime_types.include?(mime_type)
+      end
+      def self.supported_mime_types
+        MIME_TYPE_MAPPINGS.values + [
+          'application/octet-stream',
+          'text/plain',
+          'text/html',
+          'application/xml'
+        ]
+      end
+      def self.format_category(file_path)
+        mime_type = detect(file_path)
+        case mime_type
+        when /pdf/
+          :pdf
+        when /word/, /document/
+          :document
+        when /excel/, /spreadsheet/
+          :spreadsheet
+        when /powerpoint/, /presentation/
+          :presentation
+        when /image/
+          :image
+        when /text/, /plain/
+          :text
+        when /zip/, /archive/, /compressed/
+          :archive
+        else
+          :unknown
+        end
+      end
+      def self.extension_from_mime(mime_type)
+        MIME_TYPE_MAPPINGS.key(mime_type) || 'bin'
+      end
+    end
+  end
+end

data/lib/universal_document_processor/utils/japanese_filename_handler.rb ADDED Viewed

@@ -0,0 +1,205 @@
+module UniversalDocumentProcessor
+  module Utils
+    class JapaneseFilenameHandler
+      # Japanese filename character ranges
+      HIRAGANA_RANGE = /[\u{3040}-\u{309F}]/
+      KATAKANA_RANGE = /[\u{30A0}-\u{30FF}]/
+      KANJI_RANGE = /[\u{4E00}-\u{9FAF}]/
+      FULLWIDTH_RANGE = /[\u{FF00}-\u{FFEF}]/
+      # Combined Japanese character pattern
+      JAPANESE_CHARS = /[\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FAF}\u{FF00}-\u{FFEF}]/
+      # Valid filename characters (including Japanese)
+      VALID_FILENAME_CHARS = /\A[\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FAF}\u{FF00}-\u{FFEF}\w\s\-_.()@#$%&+=!~]*\z/
+      def self.contains_japanese?(filename)
+        return false unless filename.is_a?(String)
+        # Ensure UTF-8 encoding for regex matching
+        normalized = filename.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
+        normalized.match?(JAPANESE_CHARS)
+      rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
+        false
+      end
+      def self.normalize_filename(filename)
+        return filename unless filename.is_a?(String)
+        # Ensure UTF-8 encoding
+        normalized = filename.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
+        # Handle different encoding scenarios
+        if normalized.encoding != Encoding::UTF_8
+          normalized = normalized.force_encoding('UTF-8')
+        end
+        normalized
+      end
+      def self.safe_filename(filename)
+        normalized = normalize_filename(filename)
+        # Replace problematic characters while preserving Japanese
+        safe = normalized.gsub(/[<>:"|?*]/, '_')
+        # Handle Windows reserved names
+        safe = safe.gsub(/^(CON|PRN|AUX|NUL|COM[1-9]|LPT[1-9])$/i, '_\1')
+        # Ensure not too long (Windows has 255 char limit, but we'll be conservative)
+        if safe.bytesize > 200
+          extension = File.extname(safe)
+          basename = File.basename(safe, extension)
+          # Truncate basename but keep extension
+          while (basename + extension).bytesize > 200 && basename.length > 1
+            basename = basename[0...-1]
+          end
+          safe = basename + extension
+        end
+        safe
+      end
+      def self.validate_filename(filename)
+        issues = []
+        return { valid: false, issues: ['Filename is nil or empty'] } if filename.nil? || filename.empty?
+        normalized = normalize_filename(filename)
+        # Check encoding validity
+        unless normalized.valid_encoding?
+          issues << 'Filename contains invalid encoding sequences'
+        end
+        # Check for null bytes
+        if normalized.include?("\x00")
+          issues << 'Filename contains null bytes'
+        end
+        # Check for control characters
+        if normalized.match?(/[\x00-\x1F\x7F]/)
+          issues << 'Filename contains control characters'
+        end
+        # Check length
+        if normalized.bytesize > 255
+          issues << 'Filename is too long (over 255 bytes)'
+        end
+        # Check for Windows reserved names
+        basename = File.basename(normalized, File.extname(normalized))
+        if basename.match?(/^(CON|PRN|AUX|NUL|COM[1-9]|LPT[1-9])$/i)
+          issues << 'Filename uses Windows reserved name'
+        end
+        {
+          valid: issues.empty?,
+          issues: issues,
+          contains_japanese: contains_japanese?(normalized),
+          normalized_filename: normalized,
+          safe_filename: safe_filename(filename)
+        }
+      end
+      def self.extract_japanese_parts(filename)
+        return {} unless contains_japanese?(filename)
+        {
+          hiragana: filename.scan(HIRAGANA_RANGE),
+          katakana: filename.scan(KATAKANA_RANGE),
+          kanji: filename.scan(KANJI_RANGE),
+          fullwidth: filename.scan(FULLWIDTH_RANGE),
+          japanese_count: filename.scan(JAPANESE_CHARS).length
+        }
+      end
+      def self.create_safe_temp_filename(original_filename, prefix = 'doc')
+        validation = validate_filename(original_filename)
+        if validation[:valid]
+          # Use the normalized filename if it's valid
+          validation[:normalized_filename]
+        else
+          # Create a safe temporary filename
+          extension = File.extname(original_filename)
+          timestamp = Time.now.strftime('%Y%m%d_%H%M%S')
+          japanese_parts = extract_japanese_parts(original_filename)
+          if japanese_parts[:japanese_count] > 0
+            # Include some Japanese context if possible
+            safe_japanese = japanese_parts[:hiragana].first(3).join +
+                           japanese_parts[:katakana].first(3).join +
+                           japanese_parts[:kanji].first(3).join
+            "#{prefix}_#{safe_japanese}_#{timestamp}#{extension}"
+          else
+            "#{prefix}_#{timestamp}#{extension}"
+          end
+        end
+      end
+      def self.analyze_filename_encoding(filename)
+        encodings_to_try = ['UTF-8', 'Shift_JIS', 'EUC-JP', 'ISO-8859-1', 'Windows-1252']
+        results = {}
+        encodings_to_try.each do |encoding|
+          begin
+            if filename.encoding.name == encoding
+              # Already in this encoding
+              results[encoding] = {
+                valid: filename.valid_encoding?,
+                convertible: true,
+                contains_japanese: contains_japanese?(filename.dup.force_encoding('UTF-8'))
+              }
+            else
+              # Try to convert to this encoding
+              converted = filename.encode(encoding)
+              # For Japanese detection, always use UTF-8 version
+              utf8_version = converted.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
+              results[encoding] = {
+                valid: converted.valid_encoding?,
+                convertible: true,
+                contains_japanese: contains_japanese?(utf8_version)
+              }
+            end
+          rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
+            results[encoding] = {
+              valid: false,
+              convertible: false,
+              contains_japanese: false
+            }
+          end
+        end
+        {
+          original_encoding: filename.encoding.name,
+          analysis: results,
+          recommended_encoding: find_best_encoding(results)
+        }
+      end
+      private
+      def self.find_best_encoding(analysis_results)
+        # Prefer UTF-8 if valid
+        return 'UTF-8' if analysis_results['UTF-8']&.dig(:valid)
+        # Then try Japanese encodings if they contain Japanese
+        ['Shift_JIS', 'EUC-JP'].each do |encoding|
+          result = analysis_results[encoding]
+          if result&.dig(:valid) && result&.dig(:contains_japanese)
+            return encoding
+          end
+        end
+        # Fall back to any valid encoding
+        analysis_results.each do |encoding, result|
+          return encoding if result&.dig(:valid)
+        end
+        'UTF-8' # Default fallback
+      end
+    end
+  end
+end

data/lib/universal_document_processor/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module UniversalDocumentProcessor
+  VERSION = "1.0.0"
+end

data/lib/universal_document_processor.rb ADDED Viewed

@@ -0,0 +1,223 @@
+require 'active_support/all'
+require 'marcel'
+require 'nokogiri'
+require 'zip'
+# Optional dependencies - only require if available
+begin
+  require 'pdf-reader'
+rescue LoadError
+  # PDF processing will use fallback
+end
+begin
+  require 'prawn'
+rescue LoadError
+  # PDF generation will not be available
+end
+begin
+  require 'docx'
+rescue LoadError
+  # Word processing will use fallback
+end
+begin
+  require 'roo'
+rescue LoadError
+  # Excel processing will use fallback
+end
+begin
+  require 'mini_magick'
+rescue LoadError
+  # Image processing will use fallback
+end
+begin
+  require 'yomu'
+rescue LoadError
+  # Universal text extraction will use basic fallback
+end
+require_relative 'universal_document_processor/version'
+require_relative 'universal_document_processor/document'
+require_relative 'universal_document_processor/processors/base_processor'
+require_relative 'universal_document_processor/processors/pdf_processor'
+require_relative 'universal_document_processor/processors/word_processor'
+require_relative 'universal_document_processor/processors/excel_processor'
+require_relative 'universal_document_processor/processors/powerpoint_processor'
+require_relative 'universal_document_processor/processors/image_processor'
+require_relative 'universal_document_processor/processors/archive_processor'
+require_relative 'universal_document_processor/processors/text_processor'
+require_relative 'universal_document_processor/processors/character_validator'
+require_relative 'universal_document_processor/utils/file_detector'
+require_relative 'universal_document_processor/utils/japanese_filename_handler'
+require_relative 'universal_document_processor/ai_agent'
+module UniversalDocumentProcessor
+  class Error < StandardError; end
+  class UnsupportedFormatError < Error; end
+  class ProcessingError < Error; end
+  class DependencyMissingError < Error; end
+  # Main entry point for document processing
+  def self.process(file_path_or_io, options = {})
+    Document.new(file_path_or_io, options).process
+  end
+  # Extract text from any document
+  def self.extract_text(file_path_or_io, options = {})
+    Document.new(file_path_or_io, options).extract_text
+  end
+  # Get document metadata
+  def self.get_metadata(file_path_or_io, options = {})
+    Document.new(file_path_or_io, options).metadata
+  end
+  # Analyze text for invalid characters and encoding issues
+  def self.analyze_text_quality(text)
+    Processors::CharacterValidator.analyze_text(text)
+  end
+  # Validate file encoding and character issues
+  def self.validate_file(file_path)
+    Processors::CharacterValidator.validate_file_encoding(file_path)
+  end
+  # Clean text by removing invalid characters
+  def self.clean_text(text, options = {})
+    Processors::CharacterValidator.clean_text(text, options)
+  end
+  # Validate Japanese text specifically
+  def self.validate_japanese_text(text)
+    Processors::CharacterValidator.validate_japanese_text(text)
+  end
+  # Check if text contains Japanese characters
+  def self.japanese_text?(text)
+    Processors::CharacterValidator.is_japanese_text?(text)
+  end
+  # Japanese filename support methods
+  def self.japanese_filename?(filename)
+    Utils::JapaneseFilenameHandler.contains_japanese?(filename)
+  end
+  def self.validate_filename(filename)
+    Utils::JapaneseFilenameHandler.validate_filename(filename)
+  end
+  def self.safe_filename(filename)
+    Utils::JapaneseFilenameHandler.safe_filename(filename)
+  end
+  def self.normalize_filename(filename)
+    Utils::JapaneseFilenameHandler.normalize_filename(filename)
+  end
+  # AI-powered document analysis methods
+  def self.ai_analyze(file_path, options = {})
+    document_result = process(file_path, options)
+    ai_agent = AIAgent.new(options)
+    ai_agent.analyze_document(document_result, options[:query])
+  end
+  def self.ai_summarize(file_path, length: :medium, options: {})
+    document_result = process(file_path, options)
+    ai_agent = AIAgent.new(options)
+    ai_agent.summarize_document(document_result, length: length)
+  end
+  def self.ai_extract_info(file_path, categories = nil, options = {})
+    document_result = process(file_path, options)
+    ai_agent = AIAgent.new(options)
+    ai_agent.extract_key_information(document_result, categories)
+  end
+  def self.ai_translate(file_path, target_language, options = {})
+    document_result = process(file_path, options)
+    ai_agent = AIAgent.new(options)
+    ai_agent.translate_document(document_result, target_language)
+  end
+  def self.ai_classify(file_path, options = {})
+    document_result = process(file_path, options)
+    ai_agent = AIAgent.new(options)
+    ai_agent.classify_document(document_result)
+  end
+  def self.ai_insights(file_path, options = {})
+    document_result = process(file_path, options)
+    ai_agent = AIAgent.new(options)
+    ai_agent.generate_insights(document_result)
+  end
+  def self.ai_action_items(file_path, options = {})
+    document_result = process(file_path, options)
+    ai_agent = AIAgent.new(options)
+    ai_agent.extract_action_items(document_result)
+  end
+  def self.ai_compare(file_paths, comparison_type = :content, options = {})
+    document_results = file_paths.map { |path| process(path, options) }
+    ai_agent = AIAgent.new(options)
+    ai_agent.compare_documents(document_results, comparison_type)
+  end
+  def self.create_ai_agent(options = {})
+    AIAgent.new(options)
+  end
+  # Convert document to different format
+  def self.convert(file_path_or_io, target_format, options = {})
+    Document.new(file_path_or_io, options).convert_to(target_format)
+  end
+  # Batch process multiple documents
+  def self.batch_process(file_paths, options = {})
+    file_paths.map do |file_path|
+      begin
+        process(file_path, options)
+      rescue => e
+        { file: file_path, error: e.message, success: false }
+      end
+    end
+  end
+  # Check if a dependency is available
+  def self.dependency_available?(dependency)
+    case dependency.to_sym
+    when :pdf_reader
+      defined?(PDF::Reader)
+    when :docx
+      defined?(Docx)
+    when :roo
+      defined?(Roo)
+    when :mini_magick
+      defined?(MiniMagick)
+    when :yomu
+      defined?(Yomu)
+    when :prawn
+      defined?(Prawn)
+    else
+      false
+    end
+  end
+  # Get list of available features based on installed dependencies
+  def self.available_features
+    features = [:text_processing, :html_processing, :xml_processing, :csv_processing, :json_processing, :archive_processing]
+    features << :pdf_processing if dependency_available?(:pdf_reader)
+    features << :word_processing if dependency_available?(:docx)
+    features << :excel_processing if dependency_available?(:roo)
+    features << :image_processing if dependency_available?(:mini_magick)
+    features << :universal_text_extraction if dependency_available?(:yomu)
+    features << :pdf_generation if dependency_available?(:prawn)
+    features
+  end
+end