RubyGems - universal_document_processor - Versions diffs - 1.0.0 - Mend

universal_document_processor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +7 -0
data/AI_USAGE_GUIDE.md +404 -0
data/CHANGELOG.md +67 -0
data/GEM_RELEASE_GUIDE.md +288 -0
data/Gemfile +27 -0
data/LICENSE +21 -0
data/README.md +726 -0
data/Rakefile +36 -0
data/lib/universal_document_processor/ai_agent.rb +491 -0
data/lib/universal_document_processor/document.rb +225 -0
data/lib/universal_document_processor/processors/archive_processor.rb +290 -0
data/lib/universal_document_processor/processors/base_processor.rb +58 -0
data/lib/universal_document_processor/processors/character_validator.rb +283 -0
data/lib/universal_document_processor/processors/excel_processor.rb +219 -0
data/lib/universal_document_processor/processors/image_processor.rb +172 -0
data/lib/universal_document_processor/processors/pdf_processor.rb +105 -0
data/lib/universal_document_processor/processors/powerpoint_processor.rb +214 -0
data/lib/universal_document_processor/processors/text_processor.rb +360 -0
data/lib/universal_document_processor/processors/word_processor.rb +137 -0
data/lib/universal_document_processor/utils/file_detector.rb +83 -0
data/lib/universal_document_processor/utils/japanese_filename_handler.rb +205 -0
data/lib/universal_document_processor/version.rb +3 -0
data/lib/universal_document_processor.rb +223 -0
metadata +198 -0

data/lib/universal_document_processor/document.rb ADDED Viewed

@@ -0,0 +1,225 @@
+module UniversalDocumentProcessor
+  class Document
+    attr_reader :file_path, :content_type, :file_size, :options, :filename_validation
+    def initialize(file_path_or_io, options = {})
+      @file_path = file_path_or_io.is_a?(String) ? normalize_file_path(file_path_or_io) : save_temp_file(file_path_or_io)
+      @options = options
+      @content_type = detect_content_type
+      @file_size = File.size(@file_path)
+      @filename_validation = validate_filename_encoding
+    end
+    def process
+      {
+        file_path: @file_path,
+        content_type: @content_type,
+        file_size: @file_size,
+        text_content: extract_text,
+        metadata: metadata,
+        images: extract_images,
+        tables: extract_tables,
+        filename_info: filename_info,
+        processed_at: Time.current
+      }
+    end
+    def extract_text
+      processor.extract_text
+    rescue => e
+      fallback_text_extraction
+    end
+    def metadata
+      processor.extract_metadata
+    rescue => e
+      basic_metadata
+    end
+    def extract_images
+      processor.respond_to?(:extract_images) ? processor.extract_images : []
+    rescue => e
+      []
+    end
+    def extract_tables
+      processor.respond_to?(:extract_tables) ? processor.extract_tables : []
+    rescue => e
+      []
+    end
+    def convert_to(target_format)
+      case target_format.to_sym
+      when :pdf
+        convert_to_pdf
+      when :text, :txt
+        extract_text
+      when :html
+        convert_to_html
+      when :json
+        process.to_json
+      else
+        raise UnsupportedFormatError, "Conversion to #{target_format} not supported"
+      end
+    end
+    def supported_formats
+      %w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv jpg jpeg png gif bmp tiff zip rar 7z]
+    end
+    def supported?
+      supported_formats.include?(file_extension.downcase)
+    end
+    def japanese_filename?
+      Utils::JapaneseFilenameHandler.contains_japanese?(File.basename(@file_path))
+    end
+    def filename_info
+      {
+        original_filename: File.basename(@file_path),
+        contains_japanese: japanese_filename?,
+        validation: @filename_validation,
+        japanese_parts: Utils::JapaneseFilenameHandler.extract_japanese_parts(File.basename(@file_path))
+      }
+    end
+    # AI-powered analysis methods
+    def ai_analyze(query = nil, options = {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.analyze_document(process, query)
+    end
+    def ai_summarize(length: :medium, options: {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.summarize_document(process, length: length)
+    end
+    def ai_extract_info(categories = nil, options = {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.extract_key_information(process, categories)
+    end
+    def ai_translate(target_language, options = {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.translate_document(process, target_language)
+    end
+    def ai_classify(options = {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.classify_document(process)
+    end
+    def ai_insights(options = {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.generate_insights(process)
+    end
+    def ai_action_items(options = {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.extract_action_items(process)
+    end
+    def ai_chat(message, options = {})
+      ai_agent = create_ai_agent(options)
+      ai_agent.chat(message, process)
+    end
+    def create_ai_agent(options = {})
+      AIAgent.new(options.merge(@options))
+    end
+    private
+    def processor
+      @processor ||= create_processor
+    end
+    def create_processor
+      case @content_type
+      when /pdf/
+        Processors::PdfProcessor.new(@file_path, @options)
+      when /word/, /document/
+        Processors::WordProcessor.new(@file_path, @options)
+      when /excel/, /spreadsheet/
+        Processors::ExcelProcessor.new(@file_path, @options)
+      when /powerpoint/, /presentation/
+        Processors::PowerpointProcessor.new(@file_path, @options)
+      when /image/
+        Processors::ImageProcessor.new(@file_path, @options)
+      when /zip/, /archive/, /compressed/
+        Processors::ArchiveProcessor.new(@file_path, @options)
+      when /text/, /plain/
+        Processors::TextProcessor.new(@file_path, @options)
+      else
+        # Fallback to base processor with universal extraction
+        Processors::BaseProcessor.new(@file_path, @options)
+      end
+    end
+    def detect_content_type
+      Utils::FileDetector.detect(@file_path)
+    end
+    def file_extension
+      File.extname(@file_path).gsub('.', '')
+    end
+    def save_temp_file(io)
+      # Try to get original filename from IO if available
+      original_filename = io.respond_to?(:original_filename) ? io.original_filename : nil
+      extension = original_filename ? File.extname(original_filename) : ".#{file_extension}"
+      # Create safe temporary filename
+      if original_filename && Utils::JapaneseFilenameHandler.contains_japanese?(original_filename)
+        safe_name = Utils::JapaneseFilenameHandler.create_safe_temp_filename(original_filename, 'temp')
+        temp_file = Tempfile.new([File.basename(safe_name, extension), extension])
+      else
+        temp_file = Tempfile.new(['document', extension])
+      end
+      temp_file.binmode
+      temp_file.write(io.read)
+      temp_file.close
+      temp_file.path
+    end
+    def fallback_text_extraction
+      begin
+        Yomu.new(@file_path).text
+      rescue => e
+        "Unable to extract text: #{e.message}"
+      end
+    end
+    def basic_metadata
+      {
+        filename: File.basename(@file_path),
+        file_size: @file_size,
+        content_type: @content_type,
+        created_at: File.ctime(@file_path),
+        modified_at: File.mtime(@file_path),
+        japanese_filename: japanese_filename?,
+        filename_encoding: @filename_validation
+      }
+    end
+    def normalize_file_path(file_path)
+      Utils::JapaneseFilenameHandler.normalize_filename(file_path)
+    end
+    def validate_filename_encoding
+      Utils::JapaneseFilenameHandler.validate_filename(File.basename(@file_path))
+    end
+    def convert_to_pdf
+      # Implementation for PDF conversion
+      raise NotImplementedError, "PDF conversion not yet implemented"
+    end
+    def convert_to_html
+      # Implementation for HTML conversion
+      raise NotImplementedError, "HTML conversion not yet implemented"
+    end
+  end
+end

data/lib/universal_document_processor/processors/archive_processor.rb ADDED Viewed

@@ -0,0 +1,290 @@
+module UniversalDocumentProcessor
+  module Processors
+    class ArchiveProcessor < BaseProcessor
+      def extract_text
+        with_error_handling do
+          files_list = list_files
+          text_content = ["=== Archive Contents ==="]
+          files_list.each do |file_info|
+            text_content << "#{file_info[:path]} (#{file_info[:size]} bytes)"
+          end
+          # Try to extract text from text files within the archive
+          text_files = extract_text_files
+          unless text_files.empty?
+            text_content << "\n=== Text File Contents ==="
+            text_files.each do |file_path, content|
+              text_content << "\n--- #{file_path} ---"
+              text_content << content[0..1000] # Limit to first 1000 chars
+              text_content << "..." if content.length > 1000
+            end
+          end
+          text_content.join("\n")
+        end
+      end
+      def extract_metadata
+        with_error_handling do
+          files_list = list_files
+          super.merge({
+            archive_type: detect_archive_type,
+            total_files: files_list.length,
+            total_uncompressed_size: files_list.sum { |f| f[:size] },
+            file_types: analyze_file_types(files_list),
+            directory_structure: build_directory_structure(files_list),
+            has_executable_files: has_executable_files?(files_list),
+            largest_file: find_largest_file(files_list),
+            compression_ratio: calculate_compression_ratio
+          })
+        end
+      end
+      def list_files
+        with_error_handling do
+          case detect_archive_type
+          when :zip
+            list_zip_files
+          when :rar
+            list_rar_files
+          when :seven_zip
+            list_7z_files
+          else
+            []
+          end
+        end
+      end
+      def extract_file(file_path, output_path = nil)
+        with_error_handling do
+          case detect_archive_type
+          when :zip
+            extract_zip_file(file_path, output_path)
+          when :rar
+            extract_rar_file(file_path, output_path)
+          when :seven_zip
+            extract_7z_file(file_path, output_path)
+          else
+            raise UnsupportedFormatError, "Unsupported archive format"
+          end
+        end
+      end
+      def extract_all(output_directory)
+        with_error_handling do
+          case detect_archive_type
+          when :zip
+            extract_all_zip(output_directory)
+          when :rar
+            extract_all_rar(output_directory)
+          when :seven_zip
+            extract_all_7z(output_directory)
+          else
+            raise UnsupportedFormatError, "Unsupported archive format"
+          end
+        end
+      end
+      def supported_operations
+        super + [:list_files, :extract_file, :extract_all, :analyze_security]
+      end
+      private
+      def detect_archive_type
+        extension = File.extname(@file_path).downcase
+        case extension
+        when '.zip'
+          :zip
+        when '.rar'
+          :rar
+        when '.7z'
+          :seven_zip
+        else
+          # Try to detect by file signature
+          File.open(@file_path, 'rb') do |file|
+            signature = file.read(4)
+            case signature
+            when "PK\x03\x04", "PK\x05\x06", "PK\x07\x08"
+              :zip
+            when "Rar!"
+              :rar
+            when "7z\xBC\xAF"
+              :seven_zip
+            else
+              :unknown
+            end
+          end
+        end
+      end
+      def list_zip_files
+        files = []
+        Zip::File.open(@file_path) do |zip|
+          zip.each do |entry|
+            files << {
+              path: entry.name,
+              size: entry.size,
+              compressed_size: entry.compressed_size,
+              is_directory: entry.directory?,
+              modified_time: entry.time,
+              crc: entry.crc
+            }
+          end
+        end
+        files
+      end
+      def list_rar_files
+        # RAR support would require external library or system command
+        # This is a placeholder implementation
+        []
+      end
+      def list_7z_files
+        # 7z support would require external library or system command
+        # This is a placeholder implementation
+        []
+      end
+      def extract_zip_file(file_path, output_path)
+        Zip::File.open(@file_path) do |zip|
+          entry = zip.find_entry(file_path)
+          if entry
+            if output_path
+              entry.extract(output_path)
+              output_path
+            else
+              entry.get_input_stream.read
+            end
+          else
+            raise ProcessingError, "File not found in archive: #{file_path}"
+          end
+        end
+      end
+      def extract_rar_file(file_path, output_path)
+        # RAR extraction would require external library
+        raise NotImplementedError, "RAR extraction not implemented"
+      end
+      def extract_7z_file(file_path, output_path)
+        # 7z extraction would require external library
+        raise NotImplementedError, "7z extraction not implemented"
+      end
+      def extract_all_zip(output_directory)
+        FileUtils.mkdir_p(output_directory)
+        Zip::File.open(@file_path) do |zip|
+          zip.each do |entry|
+            output_path = File.join(output_directory, entry.name)
+            FileUtils.mkdir_p(File.dirname(output_path))
+            entry.extract(output_path) unless File.exist?(output_path)
+          end
+        end
+        output_directory
+      end
+      def extract_all_rar(output_directory)
+        raise NotImplementedError, "RAR extraction not implemented"
+      end
+      def extract_all_7z(output_directory)
+        raise NotImplementedError, "7z extraction not implemented"
+      end
+      def extract_text_files
+        text_files = {}
+        return text_files unless detect_archive_type == :zip
+        Zip::File.open(@file_path) do |zip|
+          zip.each do |entry|
+            next if entry.directory?
+            # Check if it's a text file
+            if text_file?(entry.name)
+              begin
+                content = entry.get_input_stream.read
+                # Try to decode as UTF-8
+                text_files[entry.name] = content.force_encoding('UTF-8')
+              rescue
+                # Skip files that can't be read as text
+              end
+            end
+          end
+        end
+        text_files
+      end
+      def text_file?(filename)
+        text_extensions = %w[.txt .md .readme .log .csv .json .xml .html .css .js .rb .py .java .c .cpp .h]
+        extension = File.extname(filename).downcase
+        text_extensions.include?(extension) || File.basename(filename).downcase.match?(/readme|license|changelog/)
+      end
+      def analyze_file_types(files_list)
+        type_counts = Hash.new(0)
+        files_list.each do |file_info|
+          next if file_info[:is_directory]
+          extension = File.extname(file_info[:path]).downcase
+          type_counts[extension.empty? ? 'no_extension' : extension] += 1
+        end
+        type_counts
+      end
+      def build_directory_structure(files_list)
+        structure = {}
+        files_list.each do |file_info|
+          path_parts = file_info[:path].split('/')
+          current = structure
+          path_parts.each_with_index do |part, index|
+            current[part] ||= {}
+            current = current[part]
+            if index == path_parts.length - 1 && !file_info[:is_directory]
+              current[:_file_info] = file_info
+            end
+          end
+        end
+        structure
+      end
+      def has_executable_files?(files_list)
+        executable_extensions = %w[.exe .bat .sh .cmd .com .scr .msi .deb .rpm .dmg .app]
+        files_list.any? do |file_info|
+          extension = File.extname(file_info[:path]).downcase
+          executable_extensions.include?(extension)
+        end
+      end
+      def find_largest_file(files_list)
+        files_list.reject { |f| f[:is_directory] }.max_by { |f| f[:size] }
+      end
+      def calculate_compression_ratio
+        return 0 unless detect_archive_type == :zip
+        total_size = 0
+        compressed_size = 0
+        Zip::File.open(@file_path) do |zip|
+          zip.each do |entry|
+            next if entry.directory?
+            total_size += entry.size
+            compressed_size += entry.compressed_size
+          end
+        end
+        return 0 if total_size == 0
+        ((total_size - compressed_size).to_f / total_size * 100).round(2)
+      rescue
+        0
+      end
+    end
+  end
+end

data/lib/universal_document_processor/processors/base_processor.rb ADDED Viewed

@@ -0,0 +1,58 @@
+module UniversalDocumentProcessor
+  module Processors
+    class BaseProcessor
+      attr_reader :file_path, :options
+      def initialize(file_path, options = {})
+        @file_path = file_path
+        @options = options
+      end
+      def extract_text
+        # Fallback to universal text extraction
+        Yomu.new(@file_path).text
+      rescue => e
+        raise ProcessingError, "Failed to extract text: #{e.message}"
+      end
+      def extract_metadata
+        # Basic file metadata
+        {
+          filename: File.basename(@file_path),
+          file_size: File.size(@file_path),
+          content_type: Marcel::MimeType.for(Pathname.new(@file_path)),
+          created_at: File.ctime(@file_path),
+          modified_at: File.mtime(@file_path)
+        }
+      rescue => e
+        raise ProcessingError, "Failed to extract metadata: #{e.message}"
+      end
+      def extract_images
+        []
+      end
+      def extract_tables
+        []
+      end
+      def supported_operations
+        [:extract_text, :extract_metadata]
+      end
+      protected
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+      end
+      def with_error_handling
+        validate_file
+        yield
+      rescue => e
+        raise ProcessingError, "Processing failed: #{e.message}"
+      end
+    end
+  end
+end