universal_document_processor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,283 @@
1
module UniversalDocumentProcessor
  module Processors
    # Validates and repairs text character data: detects encoding problems,
    # control characters, null bytes and Unicode replacement characters,
    # produces character statistics, and performs script detection with
    # special handling for Japanese text (hiragana/katakana/kanji/fullwidth).
    class CharacterValidator
      # Control characters that are never legitimate in document text.
      # Deliberately allows TAB (\x09), LF (\x0A) and CR (\x0D).
      INVALID_CONTROL_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/
      REPLACEMENT_CHAR = "\uFFFD".freeze # Unicode replacement character (marks undecodable bytes)
      NULL_BYTE = "\x00".freeze

      # Full character-level analysis of +text+.
      #
      # Returns an empty Hash for nil/empty input; otherwise a Hash with the
      # encoding name, validity flags, a list of detected issues, a cleaned
      # copy of the text, per-category character counts and Japanese analysis.
      def self.analyze_text(text)
        return {} if text.nil? || text.empty?

        {
          encoding: text.encoding.name,
          valid_encoding: text.valid_encoding?,
          has_invalid_chars: has_invalid_characters?(text),
          has_control_chars: has_control_characters?(text),
          has_null_bytes: has_null_bytes?(text),
          has_replacement_chars: has_replacement_characters?(text),
          has_non_printable: has_non_printable_characters?(text),
          character_issues: detect_character_issues(text),
          cleaned_text: clean_text(text),
          statistics: character_statistics(text),
          japanese_analysis: validate_japanese_text(text)
        }
      end

      # True when the text has invalid byte sequences for its encoding, or
      # already contains U+FFFD markers from an earlier lossy decode.
      def self.has_invalid_characters?(text)
        !text.valid_encoding? || text.include?(REPLACEMENT_CHAR)
      end

      # True when the text contains disallowed control characters.
      def self.has_control_characters?(text)
        text.match?(INVALID_CONTROL_CHARS)
      end

      # True when the text contains NUL bytes.
      def self.has_null_bytes?(text)
        text.include?(NULL_BYTE)
      end

      # True when the text contains U+FFFD replacement characters.
      def self.has_replacement_characters?(text)
        text.include?(REPLACEMENT_CHAR)
      end

      # True when the text contains characters that are neither printable
      # (per Unicode) nor common whitespace.
      def self.has_non_printable_characters?(text)
        # \s already covers \t\n\r; they are listed explicitly for clarity.
        text.match?(/[^\p{Print}\s\t\n\r]/)
      end

      # Scans +text+ and returns an Array of issue Hashes, each with :type,
      # :message and :severity (plus :positions/:characters/:patterns where
      # applicable). Returns [] for clean text.
      def self.detect_character_issues(text)
        issues = []

        # Invalid byte sequences for the declared encoding.
        unless text.valid_encoding?
          issues << {
            type: 'invalid_encoding',
            message: "Text contains invalid #{text.encoding.name} sequences",
            severity: 'high'
          }
        end

        # Embedded NUL bytes (often a sign of binary data).
        if has_null_bytes?(text)
          null_positions = find_character_positions(text, NULL_BYTE)
          issues << {
            type: 'null_bytes',
            message: "Text contains #{null_positions.length} null bytes",
            positions: null_positions,
            severity: 'high'
          }
        end

        # Disallowed control characters, reported as \xNN escapes.
        if has_control_characters?(text)
          control_chars = text.scan(INVALID_CONTROL_CHARS).uniq
          issues << {
            type: 'control_characters',
            message: "Text contains control characters: #{control_chars.map { |c| "\\x#{c.ord.to_s(16).upcase}" }.join(', ')}",
            characters: control_chars,
            severity: 'medium'
          }
        end

        # U+FFFD markers left behind by a previous lossy conversion.
        if has_replacement_characters?(text)
          replacement_positions = find_character_positions(text, REPLACEMENT_CHAR)
          issues << {
            type: 'replacement_characters',
            message: "Text contains #{replacement_positions.length} replacement characters (corrupted data)",
            positions: replacement_positions,
            severity: 'medium'
          }
        end

        # Heuristic patterns that often indicate corrupted extraction.
        suspicious_patterns = detect_suspicious_patterns(text)
        unless suspicious_patterns.empty?
          issues << {
            type: 'suspicious_patterns',
            message: "Text contains suspicious character patterns",
            patterns: suspicious_patterns,
            severity: 'low'
          }
        end

        issues
      end

      # Returns a cleaned copy of +text+. All steps are on by default and can
      # be disabled individually via +options+:
      #   :remove_null_bytes         - strip NUL bytes (default true)
      #   :remove_control_chars      - replace control chars (default true)
      #   :control_char_replacement  - replacement string (default ' ')
      #   :remove_replacement_chars  - strip U+FFFD (default false)
      #   :normalize_whitespace      - collapse runs of whitespace, incl.
      #                                newlines, to single spaces and strip
      #                                the ends (default true)
      #   :force_encoding            - re-encode to UTF-8 dropping bad bytes
      #                                (default false)
      def self.clean_text(text, options = {})
        cleaned = text.dup

        # Remove null bytes
        cleaned.gsub!(NULL_BYTE, '') if options[:remove_null_bytes] != false

        # Remove or replace control characters
        if options[:remove_control_chars] != false
          cleaned.gsub!(INVALID_CONTROL_CHARS, options[:control_char_replacement] || ' ')
        end

        # Handle replacement characters (opt-in: they may mark data loss the
        # caller wants to keep visible)
        if options[:remove_replacement_chars]
          cleaned.gsub!(REPLACEMENT_CHAR, '')
        end

        # Normalize whitespace
        if options[:normalize_whitespace] != false
          cleaned.gsub!(/\s+/, ' ')
          cleaned.strip!
        end

        # Ensure valid encoding by dropping undecodable bytes
        if options[:force_encoding] && !cleaned.valid_encoding?
          cleaned = cleaned.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
        end

        cleaned
      end

      # Per-category character counts for +text+. Note the script ranges
      # overlap (e.g. kanji_chars is a subrange of chinese_chars), so the
      # counts are not mutually exclusive.
      def self.character_statistics(text)
        {
          total_chars: text.length,
          printable_chars: text.count("\u{20}-\u{7E}\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}"),
          control_chars: text.scan(INVALID_CONTROL_CHARS).length,
          whitespace_chars: text.count(" \t\n\r"),
          null_bytes: text.count(NULL_BYTE),
          replacement_chars: text.count(REPLACEMENT_CHAR),
          unicode_chars: text.count("\u{80}-\u{FFFF}"),
          ascii_chars: text.count("\u{00}-\u{7F}"),
          # Japanese character statistics
          japanese_chars: count_japanese_characters(text),
          hiragana_chars: text.count("\u{3040}-\u{309F}"),
          katakana_chars: text.count("\u{30A0}-\u{30FF}"),
          kanji_chars: text.count("\u{4E00}-\u{9FAF}"),
          fullwidth_chars: text.count("\u{FF00}-\u{FFEF}"),
          # Other Asian scripts
          chinese_chars: text.count("\u{4E00}-\u{9FFF}"),
          korean_chars: text.count("\u{AC00}-\u{D7A3}")
        }
      end

      # Tries a fixed list of likely encodings against the file and returns
      # the first one that decodes cleanly, together with the content and its
      # analysis. Falls back to a BINARY read (valid: false) when none work.
      def self.validate_file_encoding(file_path)
        encodings_to_try = ['UTF-8', 'ISO-8859-1', 'Windows-1252', 'Shift_JIS', 'EUC-JP', 'ASCII']

        encodings_to_try.each do |encoding|
          begin
            content = File.read(file_path, encoding: encoding)
            if content.valid_encoding?
              return {
                detected_encoding: encoding,
                valid: true,
                content: content,
                analysis: analyze_text(content)
              }
            end
          rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
            next
          end
        end

        # If no encoding works, read as binary and analyze
        {
          detected_encoding: 'BINARY',
          valid: false,
          content: File.read(file_path, encoding: 'BINARY'),
          analysis: { has_invalid_chars: true }
        }
      end

      # Repairs +text+ using one of three strategies:
      #   :conservative - strip null bytes and control characters only
      #   :aggressive   - strip every non-printable character
      #   :replace      - like conservative, but also force-valid UTF-8
      # Any other strategy returns the text unchanged.
      def self.repair_text(text, strategy = :conservative)
        case strategy
        when :conservative
          # Only remove clearly invalid characters
          clean_text(text, remove_null_bytes: true, remove_control_chars: true)
        when :aggressive
          # Remove all non-printable characters
          text.gsub(/[^\p{Print}\s]/, '')
        when :replace
          # Replace invalid characters with safe alternatives
          clean_text(text,
            remove_null_bytes: true,
            remove_control_chars: true,
            control_char_replacement: ' ',
            force_encoding: true
          )
        else
          text
        end
      end

      # Returns the Japanese script families present in +text+ as an Array of
      # strings drawn from: 'hiragana', 'katakana', 'kanji', 'fullwidth'.
      def self.detect_japanese_script(text)
        scripts = []
        scripts << 'hiragana' if text.match?(/[\u{3040}-\u{309F}]/)
        scripts << 'katakana' if text.match?(/[\u{30A0}-\u{30FF}]/)
        scripts << 'kanji' if text.match?(/[\u{4E00}-\u{9FAF}]/)
        scripts << 'fullwidth' if text.match?(/[\u{FF00}-\u{FFEF}]/)
        scripts
      end

      # Heuristic: text is considered Japanese when more than 10% of its
      # non-whitespace characters fall in Japanese ranges.
      def self.is_japanese_text?(text)
        japanese_chars = count_japanese_characters(text)
        total_chars = text.gsub(/\s/, '').length
        return false if total_chars == 0

        (japanese_chars.to_f / total_chars) > 0.1
      end

      # Total count of hiragana + katakana + kanji + fullwidth characters.
      def self.count_japanese_characters(text)
        hiragana = text.count("\u{3040}-\u{309F}")
        katakana = text.count("\u{30A0}-\u{30FF}")
        kanji = text.count("\u{4E00}-\u{9FAF}")
        fullwidth = text.count("\u{FF00}-\u{FFEF}")

        hiragana + katakana + kanji + fullwidth
      end

      # Japanese-specific summary: { japanese: false } for non-Japanese text,
      # otherwise scripts present, character count and whether the text mixes
      # Japanese with Latin script (common and legitimate, e.g. "Hello 世界").
      def self.validate_japanese_text(text)
        return { japanese: false } unless is_japanese_text?(text)

        {
          japanese: true,
          scripts: detect_japanese_script(text),
          character_count: count_japanese_characters(text),
          mixed_with_latin: text.match?(/[\p{Latin}]/) && text.match?(/[\u{3040}-\u{30FF}\u{4E00}-\u{9FAF}]/),
          valid_japanese: true # Japanese characters are always valid
        }
      end

      # Character indices of every occurrence of +char+ in +text+.
      def self.find_character_positions(text, char)
        positions = []
        text.chars.each_with_index do |c, index|
          positions << index if c == char
        end
        positions
      end

      # Heuristic corruption signals: very long runs of one character,
      # excessive whitespace runs, and Latin letters immediately followed by
      # Cyrillic/Arabic/Hebrew (Japanese+Latin is NOT flagged — "Company株式会社"
      # is normal).
      def self.detect_suspicious_patterns(text)
        patterns = []

        # Long sequences of the same character
        if text.match?(/(.)\1{20,}/)
          patterns << 'long_repetition'
        end

        # Excessive whitespace
        if text.match?(/\s{50,}/)
          patterns << 'excessive_whitespace'
        end

        # Mixed scripts that might indicate corruption (but allow common combinations)
        if text.match?(/[\p{Latin}][\p{Cyrillic}\p{Arabic}\p{Hebrew}]/)
          patterns << 'mixed_scripts'
        end

        patterns
      end

      # FIX: the original used a bare `private` marker, which has no effect on
      # `def self.` singleton methods — these helpers were accidentally public.
      # private_class_method actually hides them, matching the author's intent.
      private_class_method :find_character_positions, :detect_suspicious_patterns
    end
  end
end
@@ -0,0 +1,219 @@
1
module UniversalDocumentProcessor
  module Processors
    # Spreadsheet processor built on the Roo gem. Each public method re-opens
    # the workbook via Roo::Spreadsheet.open and iterates its sheets.
    #
    # NOTE(review): `workbook.sheet(sheet_name)` appears to switch Roo's
    # *default* sheet in place — all subsequent first_row/last_row/cell calls
    # implicitly target that sheet, so the call order inside every loop below
    # is load-bearing. Confirm against the Roo API before restructuring.
    #
    # Error handling and base metadata come from BaseProcessor (not visible in
    # this file): every method wraps its work in `with_error_handling`.
    class ExcelProcessor < BaseProcessor
      # Plain-text dump of the whole workbook: a "=== Sheet: name ===" banner
      # per sheet, then each row with data as " | "-joined cell strings.
      # Rows whose cells are all nil/empty are skipped; sheets are separated
      # by a blank line. Returns a single newline-joined String.
      def extract_text
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          text_content = []

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            text_content << "=== Sheet: #{sheet_name} ==="

            # Get all rows with data (last_row is nil for an empty sheet)
            if workbook.last_row
              (workbook.first_row..workbook.last_row).each do |row|
                row_data = []
                (workbook.first_column..workbook.last_column).each do |col|
                  cell_value = workbook.cell(row, col)
                  row_data << cell_value.to_s if cell_value
                end
                text_content << row_data.join(' | ') unless row_data.all?(&:empty?)
              end
            end

            text_content << "" # Add blank line between sheets
          end

          text_content.join("\n")
        end
      end

      # Merges workbook-level metadata into the BaseProcessor metadata Hash:
      # sheet count/names, per-sheet dimensions, aggregate row/column totals,
      # and formula/chart presence flags.
      def extract_metadata
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)

          sheet_info = {}
          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            # Dimensions default to 0 for empty sheets (Roo returns nil).
            sheet_info[sheet_name] = {
              rows: workbook.last_row || 0,
              columns: workbook.last_column || 0,
              first_row: workbook.first_row || 0,
              first_column: workbook.first_column || 0
            }
          end

          super.merge({
            sheet_count: workbook.sheets.length,
            sheet_names: workbook.sheets,
            sheet_info: sheet_info,
            total_rows: sheet_info.values.sum { |info| info[:rows] },
            # Widest sheet across the workbook, 0 when there are no sheets.
            total_columns: sheet_info.values.map { |info| info[:columns] }.max || 0,
            has_formulas: detect_formulas(workbook),
            has_charts: detect_charts(workbook)
          })
        end
      end

      # One table Hash per non-empty sheet:
      #   { sheet_name:, rows:, columns:, data: [[String,...],...], headers: [...] }
      # Headers come from the sheet's first row (nil cells become "Column N").
      # NOTE: :data includes the header row as well — row extraction starts at
      # first_row, not first_row + 1.
      def extract_tables
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          tables = []

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            next unless workbook.last_row

            table_data = {
              sheet_name: sheet_name,
              rows: workbook.last_row,
              columns: workbook.last_column,
              data: [],
              headers: []
            }

            # Extract headers (first row)
            if workbook.first_row
              (workbook.first_column..workbook.last_column).each do |col|
                header = workbook.cell(workbook.first_row, col)
                table_data[:headers] << (header ? header.to_s : "Column #{col}")
              end
            end

            # Extract all data (nil cells become empty strings)
            (workbook.first_row..workbook.last_row).each do |row|
              row_data = []
              (workbook.first_column..workbook.last_column).each do |col|
                cell_value = workbook.cell(row, col)
                row_data << (cell_value ? cell_value.to_s : "")
              end
              table_data[:data] << row_data
            end

            tables << table_data
          end

          tables
        end
      end

      # Scans every cell of every sheet and returns an Array of Hashes for
      # cells that have a formula: { sheet:, row:, column:, formula:, value: }.
      # Guarded with respond_to?(:formula) because not every Roo backend
      # (e.g. CSV) exposes formulas. O(rows x columns) per sheet.
      def extract_formulas
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          formulas = []

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            next unless workbook.last_row

            (workbook.first_row..workbook.last_row).each do |row|
              (workbook.first_column..workbook.last_column).each do |col|
                if workbook.respond_to?(:formula) && workbook.formula(row, col)
                  formulas << {
                    sheet: sheet_name,
                    row: row,
                    column: col,
                    formula: workbook.formula(row, col),
                    value: workbook.cell(row, col)
                  }
                end
              end
            end
          end

          formulas
        end
      end

      # Placeholder: always returns []. Chart extraction would require parsing
      # the underlying XLSX parts, which Roo does not expose.
      def extract_charts
        with_error_handling do
          # Chart extraction would require more complex parsing
          # This is a placeholder for future implementation
          []
        end
      end

      # Extends BaseProcessor's operation list with spreadsheet-specific ones.
      # NOTE(review): :extract_pivot_tables is advertised here but no such
      # method is defined in this class — confirm it exists in BaseProcessor.
      def supported_operations
        super + [:extract_tables, :extract_formulas, :extract_charts, :extract_pivot_tables]
      end

      # CSV export. With +sheet_name+ returns that sheet's CSV String; without
      # it returns a Hash of { sheet_name => csv_string } for every sheet.
      def to_csv(sheet_name = nil)
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)

          if sheet_name
            workbook.sheet(sheet_name)
            workbook.to_csv
          else
            # Convert all sheets to CSV
            csv_data = {}
            workbook.sheets.each do |name|
              workbook.sheet(name)
              csv_data[name] = workbook.to_csv
            end
            csv_data
          end
        end
      end

      # JSON export: { sheet_name => [ {header => value, ...}, ... ] }.
      # The first row supplies the keys; data rows start at first_row + 1.
      # Empty sheets are skipped entirely (no key in the result).
      # NOTE(review): this overrides Object#to_json with zero arity, so
      # embedding an ExcelProcessor inside a structure serialized by the json
      # gem (which passes a generator state argument) would raise — confirm
      # callers only invoke it directly.
      def to_json
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          json_data = {}

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            sheet_data = []

            next unless workbook.last_row

            # Get headers
            headers = []
            (workbook.first_column..workbook.last_column).each do |col|
              header = workbook.cell(workbook.first_row, col)
              headers << (header ? header.to_s : "Column #{col}")
            end

            # Get data rows (raw cell values, not stringified)
            ((workbook.first_row + 1)..workbook.last_row).each do |row|
              row_hash = {}
              (workbook.first_column..workbook.last_column).each_with_index do |col, index|
                cell_value = workbook.cell(row, col)
                row_hash[headers[index]] = cell_value
              end
              sheet_data << row_hash
            end

            json_data[sheet_name] = sheet_data
          end

          json_data.to_json
        end
      end

      private

      # True when any cell in any sheet carries a formula. Short-circuits on
      # the first hit via #any?. The method-level rescue maps *any*
      # StandardError (unsupported format, Roo internals) to false rather
      # than failing metadata extraction.
      def detect_formulas(workbook)
        workbook.sheets.any? do |sheet_name|
          workbook.sheet(sheet_name)
          next false unless workbook.last_row

          (workbook.first_row..workbook.last_row).any? do |row|
            (workbook.first_column..workbook.last_column).any? do |col|
              workbook.respond_to?(:formula) && workbook.formula(row, col)
            end
          end
        end
      rescue
        false
      end

      # Placeholder: always false. See extract_charts.
      def detect_charts(workbook)
        # Chart detection would require more complex parsing
        # This is a placeholder for future implementation
        false
      end
    end
  end
end
@@ -0,0 +1,172 @@
1
module UniversalDocumentProcessor
  module Processors
    # Image processor built on MiniMagick (ImageMagick bindings). Provides
    # metadata/EXIF extraction, dominant-color analysis, resizing, format
    # conversion and thumbnailing. OCR and face detection are placeholders.
    #
    # Error handling comes from BaseProcessor (not visible in this file):
    # every public method wraps its work in `with_error_handling`.
    class ImageProcessor < BaseProcessor
      # Images carry no extractable text; returns a descriptive label built
      # from the file name. Could be extended with OCR (see extract_text_ocr).
      def extract_text
        with_error_handling do
          # Images don't contain extractable text by default
          # This could be extended with OCR functionality
          "Image file: #{File.basename(@file_path)}"
        end
      end

      # Merges image-level metadata (dimensions, format, colorspace,
      # resolution, compression, quality, EXIF, color profile, transparency)
      # into the BaseProcessor metadata Hash.
      def extract_metadata
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)

          super.merge({
            width: image.width,
            height: image.height,
            format: image.type,
            colorspace: image.colorspace,
            resolution: extract_resolution(image),
            # String-key lookups delegate to ImageMagick's identify -format
            # attribute queries; they return nil when the attribute is absent.
            compression: image['compression'],
            quality: image['quality'],
            exif_data: extract_exif_data(image),
            color_profile: extract_color_profile(image),
            has_transparency: has_transparency?(image)
          })
        end
      end

      # Dominant-color extraction: shells out to ImageMagick's `convert` with
      # a 10-color quantized histogram and parses lines shaped like
      # "count: (r,g,b) #HEX". Returns [{count:, rgb:, hex:}, ...] sorted by
      # descending frequency.
      #
      # NOTE(review): relies on the legacy IM6 `convert` binary and on
      # MiniMagick::Image#run_command, which newer MiniMagick versions have
      # removed — verify against the pinned minimagick version. The
      # method-level rescue swallows every StandardError and returns []
      # (the rescued exception `e` is unused).
      def extract_colors
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)

          # Get dominant colors using ImageMagick's histogram
          colors = []
          histogram_output = image.run_command('convert', @file_path, '-colors', '10', '-depth', '8', '-format', '%c', 'histogram:info:-')

          histogram_output.split("\n").each do |line|
            if line.match(/(\d+):\s+\(([^)]+)\)\s+(#\w+)/)
              count = $1.to_i
              rgb = $2
              hex = $3
              colors << {
                count: count,
                rgb: rgb,
                hex: hex
              }
            end
          end

          colors.sort_by { |c| -c[:count] }
        end
      rescue => e
        []
      end

      # Resizes to fit within width x height (ImageMagick geometry — aspect
      # ratio preserved, never enlarged beyond the box). Writes to
      # +output_path+ and returns that path when given; otherwise returns the
      # image bytes as a binary String.
      def resize(width, height, output_path = nil)
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)
          image.resize "#{width}x#{height}"

          if output_path
            image.write(output_path)
            output_path
          else
            # Return as blob
            image.to_blob
          end
        end
      end

      # Converts the image to +target_format+ (e.g. :png, "JPEG" — downcased
      # before use). Same output convention as #resize: path when
      # +output_path+ is given, binary blob otherwise.
      def convert_format(target_format, output_path = nil)
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)
          image.format(target_format.to_s.downcase)

          if output_path
            image.write(output_path)
            output_path
          else
            # Return as blob
            image.to_blob
          end
        end
      end

      # Square-bounded thumbnail (default 150x150, aspect preserved).
      # Same output convention as #resize.
      def create_thumbnail(size = 150, output_path = nil)
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)
          image.resize "#{size}x#{size}"

          if output_path
            image.write(output_path)
            output_path
          else
            image.to_blob
          end
        end
      end

      # Placeholder: always returns []. Face detection would need OpenCV or
      # an external vision API.
      def extract_faces
        with_error_handling do
          # Placeholder for face detection
          # Would require additional libraries like opencv or face detection APIs
          []
        end
      end

      # Placeholder: returns an explanatory String. Real OCR would require
      # tesseract (e.g. the rtesseract gem).
      def extract_text_ocr
        with_error_handling do
          # Placeholder for OCR functionality
          # Would require tesseract or similar OCR library
          "OCR not implemented - would require tesseract gem"
        end
      end

      # Extends BaseProcessor's operation list with image-specific operations.
      def supported_operations
        super + [:extract_colors, :resize, :convert_format, :create_thumbnail, :extract_faces, :extract_text_ocr]
      end

      private

      # DPI/PPI resolution as { x:, y:, units: }; all-nil Hash when the
      # image carries no resolution metadata (method-level rescue).
      def extract_resolution(image)
        {
          x: image.resolution[0],
          y: image.resolution[1],
          units: image['units']
        }
      rescue
        { x: nil, y: nil, units: nil }
      end

      # Extracts a fixed allowlist of common EXIF tags, returned with the
      # "exif:" prefix stripped. Tags missing from the image are omitted;
      # any lookup failure yields {} (method-level rescue).
      def extract_exif_data(image)
        exif = {}

        # Common EXIF tags
        exif_tags = %w[
          exif:DateTime exif:DateTimeOriginal exif:DateTimeDigitized
          exif:Make exif:Model exif:Software
          exif:ExposureTime exif:FNumber exif:ISO exif:Flash
          exif:FocalLength exif:WhiteBalance
          exif:GPSLatitude exif:GPSLongitude exif:GPSAltitude
        ]

        exif_tags.each do |tag|
          value = image[tag]
          exif[tag.gsub('exif:', '')] = value if value
        end

        exif
      rescue
        {}
      end

      # Colorspace name and ICC profile description (nil when absent);
      # {} on any lookup failure.
      def extract_color_profile(image)
        {
          profile: image['colorspace'],
          icc_profile: image['icc:description']
        }
      rescue
        {}
      end

      # Heuristic transparency check: ImageMagick's matte flag, or the file
      # type containing "png" (PNG supports an alpha channel — this does NOT
      # prove the image actually uses it). false on any failure.
      def has_transparency?(image)
        image['matte'] == 'True' || image.type.downcase.include?('png')
      rescue
        false
      end
    end
  end
end