RubyGems - universal_document_processor - Versions diffs - 1.0.5 → 1.1.1 - Mend

universal_document_processor 1.0.5 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/CHANGELOG.md +13 -0
data/README.md +237 -2
data/lib/universal_document_processor/ai_agent.rb +48 -49
data/lib/universal_document_processor/document.rb +130 -13
data/lib/universal_document_processor/processors/archive_processor.rb +26 -0
data/lib/universal_document_processor/processors/base_processor.rb +17 -0
data/lib/universal_document_processor/processors/excel_processor.rb +30 -0
data/lib/universal_document_processor/processors/pdf_processor.rb +21 -1
data/lib/universal_document_processor/processors/text_processor.rb +21 -0
data/lib/universal_document_processor/processors/word_processor.rb +30 -0
data/lib/universal_document_processor/version.rb +1 -1
data/lib/universal_document_processor.rb +10 -0
metadata +1 -6
data/debug_test.rb +0 -35
data/test_ai_dependency.rb +0 -80
data/test_core_functionality.rb +0 -280
data/test_performance_memory.rb +0 -271
data/test_published_gem.rb +0 -349

data/lib/universal_document_processor/document.rb CHANGED Viewed

@@ -2,29 +2,62 @@ module UniversalDocumentProcessor
   class Document
     attr_reader :file_path, :content_type, :file_size, :options, :filename_validation
+    class LargeFileError < StandardError; end # raised by #initialize when the file is larger than MAX_FILE_SIZE
+    class FileValidationError < StandardError; end # raised by #initialize when the file is missing or unreadable
+    MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB, expressed in bytes; compared against File.size in #initialize
     def initialize(file_path_or_io, options = {})
       @file_path = file_path_or_io.is_a?(String) ? normalize_file_path(file_path_or_io) : save_temp_file(file_path_or_io)
       @options = options
+      # 1. Check file existence and readability
+      unless File.exist?(@file_path) && File.readable?(@file_path)
+        raise FileValidationError, "File is missing or unreadable: #{@file_path}"
+      end
       @content_type = detect_content_type
       @file_size = File.size(@file_path)
+      # 2. Large file safeguard
+      if @file_size > MAX_FILE_SIZE
+        raise LargeFileError, "File size #{@file_size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+      end
       @filename_validation = validate_filename_encoding
+      # 3. Encoding validation and cleaning for text files
+      if @content_type =~ /text|plain/
+        validation = UniversalDocumentProcessor.validate_file(@file_path)
+        unless validation[:valid]
+          @cleaned_text_content = UniversalDocumentProcessor.clean_text(validation[:content], {
+            remove_null_bytes: true,
+            remove_control_chars: true,
+            normalize_whitespace: true
+          })
+        else
+          @cleaned_text_content = nil
+        end
+      end
     end
     def process
-      {
-        file_path: @file_path,
-        content_type: @content_type,
-        file_size: @file_size,
-        text_content: extract_text,
-        metadata: metadata,
-        images: extract_images,
-        tables: extract_tables,
-        filename_info: filename_info,
-        processed_at: Time.current
-      }
+      begin
+        {
+          file_path: @file_path,
+          content_type: @content_type,
+          file_size: @file_size,
+          text_content: extract_text,
+          metadata: metadata,
+          images: extract_images,
+          tables: extract_tables,
+          filename_info: filename_info,
+          processed_at: Time.current
+        }
+      rescue LargeFileError, FileValidationError => e
+        { error: e.class.name, message: e.message, file_path: @file_path }
+      rescue => e
+        { error: 'ProcessingError', message: e.message, file_path: @file_path }
+      end
     end
     def extract_text
+      # Use cleaned text if available (from encoding validation)
+      return @cleaned_text_content if defined?(@cleaned_text_content) && @cleaned_text_content
       processor.extract_text
     rescue => e
       fallback_text_extraction
@@ -253,13 +286,97 @@ module UniversalDocumentProcessor
     end
     def convert_to_pdf
-      # Implementation for PDF conversion
-      raise NotImplementedError, "PDF conversion not yet implemented"
+      ensure_prawn_available!
+      output_path = @file_path.gsub(File.extname(@file_path), '.pdf')
+      Prawn::Document.generate(output_path) do |pdf|
+        # Add title
+        pdf.font_size 18
+        pdf.text "Document: #{File.basename(@file_path)}", style: :bold
+        pdf.move_down 20
+        # Add metadata section
+        pdf.font_size 12
+        pdf.text "Document Information", style: :bold
+        pdf.move_down 10
+        metadata_info = metadata
+        pdf.text "File Size: #{format_file_size(@file_size)}"
+        pdf.text "Content Type: #{@content_type}"
+        pdf.text "Created: #{metadata_info[:created_at]}" if metadata_info[:created_at]
+        pdf.text "Modified: #{metadata_info[:modified_at]}" if metadata_info[:modified_at]
+        pdf.move_down 20
+        # Add content section
+        pdf.text "Content", style: :bold
+        pdf.move_down 10
+        text_content = extract_text
+        if text_content && !text_content.strip.empty?
+          pdf.font_size 10
+          pdf.text text_content
+        else
+          pdf.text "No text content available for this document."
+        end
+        # Add tables if available
+        tables = extract_tables
+        unless tables.empty?
+          pdf.start_new_page
+          pdf.font_size 12
+          pdf.text "Tables", style: :bold
+          pdf.move_down 10
+          tables.each_with_index do |table, index|
+            pdf.text "Table #{index + 1}", style: :bold
+            pdf.move_down 5
+            if table[:content] && !table[:content].empty?
+              # Format table data for Prawn
+              table_data = table[:content].first(20) # Limit to first 20 rows
+              pdf.table(table_data, header: true) do
+                row(0).font_style = :bold
+                cells.size = 8
+                cells.padding = 3
+              end
+            end
+            pdf.move_down 15
+          end
+        end
+      end
+      output_path
+    rescue => e
+      raise ProcessingError, "Failed to create PDF: #{e.message}"
     end
     def convert_to_html
       # HTML output is not supported yet: every call raises NotImplementedError.
       message = "HTML conversion not yet implemented"
       raise NotImplementedError, message
     end
+    private
+    def ensure_prawn_available!
+      unless defined?(Prawn)
+        raise DependencyMissingError, "PDF creation requires the 'prawn' gem. Install it with: gem install prawn -v '~> 2.4'"
+      end
+    end
+    def format_file_size(bytes)
+      return "0 B" if bytes == 0
+      units = ['B', 'KB', 'MB', 'GB']
+      size = bytes.to_f
+      unit_index = 0
+      while size >= 1024 && unit_index < units.length - 1
+        size /= 1024
+        unit_index += 1
+      end
+      "#{size.round(2)} #{units[unit_index]}"
+    end
   end
 end

data/lib/universal_document_processor/processors/archive_processor.rb CHANGED Viewed

@@ -91,6 +91,32 @@ module UniversalDocumentProcessor
         super + [:list_files, :extract_file, :extract_all, :analyze_security]
       end
+      # Class method to create a zip file from a list of files or a directory
+      def self.create_zip(output_zip_path, files_or_directory)
+        require 'zip'
+        files = []
+        if files_or_directory.is_a?(String) && File.directory?(files_or_directory)
+          # Recursively collect all files in the directory
+          Dir[File.join(files_or_directory, '**', '**')].each do |file|
+            files << file unless File.directory?(file)
+          end
+          base_dir = files_or_directory
+        elsif files_or_directory.is_a?(Array)
+          files = files_or_directory
+          base_dir = nil
+        else
+          raise ArgumentError, 'files_or_directory must be a directory path or an array of file paths'
+        end
+        Zip::File.open(output_zip_path, Zip::File::CREATE) do |zipfile|
+          files.each do |file|
+            entry_name = base_dir ? file.sub(/^#{Regexp.escape(base_dir)}\/?/, '') : File.basename(file)
+            zipfile.add(entry_name, file)
+          end
+        end
+        output_zip_path
+      end
       private
       def detect_archive_type

data/lib/universal_document_processor/processors/base_processor.rb CHANGED Viewed

@@ -3,6 +3,8 @@ module UniversalDocumentProcessor
     class BaseProcessor
       attr_reader :file_path, :options
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
       def initialize(file_path, options = {})
         @file_path = file_path
         @options = options
@@ -11,6 +13,17 @@ module UniversalDocumentProcessor
       def extract_text
         # Fallback to universal text extraction
         if defined?(Yomu)
+          # Encoding validation for text files
+          if File.extname(@file_path) =~ /\.(txt|csv|tsv|md|json|xml|html|htm)$/i
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
+          end
           Yomu.new(@file_path).text
         else
           raise ProcessingError, "Universal text extraction requires the 'yomu' gem. Install it with: gem install yomu -v '~> 0.2'"
@@ -49,6 +62,10 @@ module UniversalDocumentProcessor
       def validate_file
         raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
         raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
       end
       def with_error_handling

data/lib/universal_document_processor/processors/excel_processor.rb CHANGED Viewed

@@ -6,11 +6,32 @@ require 'csv'
 module UniversalDocumentProcessor
   module Processors
     class ExcelProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
       def extract_text
+        validate_file
         with_error_handling do
           if @file_path.end_with?('.csv')
+            # Encoding validation for CSV
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_csv_text
           elsif @file_path.end_with?('.tsv')
+            # Encoding validation for TSV
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_tsv_text
           elsif @file_path.end_with?('.xlsx')
             extract_xlsx_text_builtin
@@ -208,6 +229,15 @@ module UniversalDocumentProcessor
       private
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
       # CSV Processing Methods
       def extract_csv_text
         content = File.read(@file_path, encoding: 'UTF-8')

data/lib/universal_document_processor/processors/pdf_processor.rb CHANGED Viewed

@@ -1,12 +1,23 @@
 module UniversalDocumentProcessor
   module Processors
     class PdfProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
       def extract_text
         ensure_pdf_reader_available!
+        validate_file
         with_error_handling do
           reader = PDF::Reader.new(@file_path)
           text = reader.pages.map(&:text).join("\n")
+          # Encoding validation for extracted text
+          validation = UniversalDocumentProcessor.validate_file(@file_path)
+          unless validation[:valid]
+            return UniversalDocumentProcessor.clean_text(validation[:content], {
+              remove_null_bytes: true,
+              remove_control_chars: true,
+              normalize_whitespace: true
+            })
+          end
           text.strip.empty? ? "No text content found in PDF" : text
         end
       rescue => e
@@ -104,6 +115,15 @@ module UniversalDocumentProcessor
         end
       end
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
       def extract_form_fields(reader)
         # Extract PDF form fields if present
         []

data/lib/universal_document_processor/processors/text_processor.rb CHANGED Viewed

@@ -1,7 +1,10 @@
 module UniversalDocumentProcessor
   module Processors
     class TextProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
       def extract_text
+        validate_file
         with_error_handling do
           case detect_text_format
           when :rtf
@@ -15,6 +18,15 @@ module UniversalDocumentProcessor
           when :json
             extract_json_text
           else
+            # Encoding validation for plain text
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_plain_text
           end
         end
@@ -81,6 +93,15 @@ module UniversalDocumentProcessor
       private
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
       def detect_text_format
         extension = File.extname(@file_path).downcase
         case extension

data/lib/universal_document_processor/processors/word_processor.rb CHANGED Viewed

@@ -1,11 +1,32 @@
 module UniversalDocumentProcessor
   module Processors
     class WordProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
       def extract_text
+        validate_file
         with_error_handling do
           if @file_path.end_with?('.docx')
+            # Encoding validation for docx (if possible)
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_docx_text
           elsif @file_path.end_with?('.doc')
+            # Encoding validation for doc (if possible)
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             # Built-in .doc file processing
             fallback_text_extraction
           else
@@ -90,6 +111,15 @@ module UniversalDocumentProcessor
       private
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
       def ensure_docx_available!
         unless defined?(Docx)
           raise DependencyMissingError, "DOCX processing requires the 'docx' gem. Install it with: gem install docx -v '~> 0.8'"

data/lib/universal_document_processor/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module UniversalDocumentProcessor
-  VERSION = "1.0.5"
+  VERSION = "1.1.1"
 end

data/lib/universal_document_processor.rb CHANGED Viewed

@@ -206,6 +206,16 @@ module UniversalDocumentProcessor
     Document.new(file_path_or_io, options).convert_to(target_format)
   end
+  # Create PDF from any supported document
+  def self.create_pdf(file_path, options = {})
+    Document.new(file_path, options).convert_to(:pdf)
+  end
+  # Check if PDF creation is available
+  def self.pdf_creation_available?
+    defined?(Prawn)
+  end
   # Batch process multiple documents
   def self.batch_process(file_paths, options = {})
     file_paths.map do |file_path|

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: universal_document_processor
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.1.1
 platform: ruby
 authors:
 - Vikas Patil
@@ -201,7 +201,6 @@ files:
 - README.md
 - Rakefile
 - USER_GUIDE.md
-- debug_test.rb
 - lib/universal_document_processor.rb
 - lib/universal_document_processor/ai_agent.rb
 - lib/universal_document_processor/document.rb
@@ -217,10 +216,6 @@ files:
 - lib/universal_document_processor/utils/file_detector.rb
 - lib/universal_document_processor/utils/japanese_filename_handler.rb
 - lib/universal_document_processor/version.rb
-- test_ai_dependency.rb
-- test_core_functionality.rb
-- test_performance_memory.rb
-- test_published_gem.rb
 homepage: https://github.com/vpatil160/universal_document_processor
 licenses:
 - MIT

data/debug_test.rb DELETED Viewed

@@ -1,35 +0,0 @@
-#!/usr/bin/env ruby
-# Add lib directory to load path
-$LOAD_PATH.unshift File.expand_path('lib', __dir__)
-# Load the gem
-require 'universal_document_processor'
-require 'tempfile'
-# Create a simple text file
-txt_file = Tempfile.new(['test', '.txt'])
-txt_file.write("This is a sample text file.\nIt has multiple lines.\nUsed for testing.")
-txt_file.close
-puts "Testing text file: #{txt_file.path}"
-begin
-  puts "Processing file..."
-  result = UniversalDocumentProcessor.process(txt_file.path)
-  puts "Result keys: #{result.keys}"
-  puts "Result type: #{result.class}"
-  if result.is_a?(Hash)
-    result.each do |key, value|
-      puts "#{key}: #{value.class} - #{value.to_s[0..100]}..."
-    end
-  end
-rescue => e
-  puts "Error: #{e.class} - #{e.message}"
-  puts e.backtrace.first(5)
-end
-txt_file.unlink

data/test_ai_dependency.rb DELETED Viewed

@@ -1,80 +0,0 @@
-#!/usr/bin/env ruby
-# Add lib directory to load path
-$LOAD_PATH.unshift File.expand_path('lib', __dir__)
-# Load the gem
-require 'universal_document_processor'
-puts "Testing AI Dependency Handling"
-puts "=" * 50
-# Test 1: Check AI availability without API key
-puts "\n1. Testing AI availability without API key:"
-ai_available = UniversalDocumentProcessor.ai_available?
-puts "   AI Available: #{ai_available}"
-# Test 2: Create AI agent without API key
-puts "\n2. Creating AI agent without API key:"
-agent = UniversalDocumentProcessor.create_ai_agent
-puts "   Agent created: #{agent.class}"
-puts "   AI enabled: #{agent.ai_enabled}"
-puts "   AI available: #{agent.ai_available?}"
-# Test 3: Try to use AI methods without API key
-puts "\n3. Testing AI methods without API key:"
-# Create a sample text file
-require 'tempfile'
-sample_file = Tempfile.new(['test', '.txt'])
-sample_file.write("This is a test document for AI processing.")
-sample_file.close
-begin
-  result = UniversalDocumentProcessor.ai_analyze(sample_file.path)
-  puts "   ERROR: Should have raised an exception!"
-rescue UniversalDocumentProcessor::DependencyMissingError => e
-  puts "   ✓ Correctly raised DependencyMissingError: #{e.message}"
-rescue => e
-  puts "   ✗ Unexpected error: #{e.class} - #{e.message}"
-end
-# Test 4: Check available features
-puts "\n4. Available features:"
-features = UniversalDocumentProcessor.available_features
-puts "   Features: #{features.join(', ')}"
-puts "   AI processing included: #{features.include?(:ai_processing)}"
-# Test 5: Check optional dependencies
-puts "\n5. Optional dependencies:"
-optional_deps = UniversalDocumentProcessor.optional_dependencies
-puts "   Optional dependencies: #{optional_deps.keys.join(', ')}"
-missing_deps = UniversalDocumentProcessor.missing_dependencies
-puts "   Missing dependencies: #{missing_deps.join(', ')}"
-# Test 6: Installation instructions
-puts "\n6. Installation instructions:"
-instructions = UniversalDocumentProcessor.installation_instructions
-puts instructions
-# Test 7: Test with API key if provided
-if ENV['OPENAI_API_KEY'] && !ENV['OPENAI_API_KEY'].empty?
-  puts "\n7. Testing with API key:"
-  ai_available_with_key = UniversalDocumentProcessor.ai_available?
-  puts "   AI Available with key: #{ai_available_with_key}"
-  agent_with_key = UniversalDocumentProcessor.create_ai_agent
-  puts "   Agent AI enabled: #{agent_with_key.ai_enabled}"
-else
-  puts "\n7. Skipping API key test (OPENAI_API_KEY not set)"
-end
-# Clean up
-sample_file.unlink
-puts "\n" + "=" * 50
-puts "AI Dependency Test Complete!"
-puts "✓ AI features are properly optional"
-puts "✓ Clear error messages when dependencies missing"
-puts "✓ Graceful degradation when features unavailable"