RubyGems - universal_document_processor - Versions diffs - 1.0.3 → 1.0.5 - Mend

universal_document_processor 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

checksums.yaml +4 -4
data/ISSUES_ANALYSIS.md +295 -0
data/PERFORMANCE.md +492 -0
data/USER_GUIDE.md +597 -0
data/debug_test.rb +35 -0
data/lib/universal_document_processor/document.rb +5 -1
data/lib/universal_document_processor/processors/base_processor.rb +5 -1
data/lib/universal_document_processor/processors/pdf_processor.rb +17 -0
data/lib/universal_document_processor/version.rb +1 -1
data/test_ai_dependency.rb +80 -0
data/test_core_functionality.rb +280 -0
data/test_performance_memory.rb +271 -0
data/test_published_gem.rb +349 -0
metadata +20 -6

data/test_ai_dependency.rb ADDED

@@ -0,0 +1,80 @@
+#!/usr/bin/env ruby
+# Add lib directory to load path
+$LOAD_PATH.unshift File.expand_path('lib', __dir__)
+# Load the gem
+require 'universal_document_processor'
# Smoke check for the gem's optional AI support. Each numbered step prints
# what the public API reports. The steps are labelled "without API key", but
# the script does not clear OPENAI_API_KEY itself, so run it with the
# variable unset for the labels to hold.
puts "Testing AI Dependency Handling"
puts "=" * 50

# Step 1: AI availability as reported by the gem
puts "\n1. Testing AI availability without API key:"
ai_available = UniversalDocumentProcessor.ai_available?
puts "   AI Available: #{ai_available}"

# Step 2: an agent can still be constructed
puts "\n2. Creating AI agent without API key:"
agent = UniversalDocumentProcessor.create_ai_agent
puts "   Agent created: #{agent.class}"
puts "   AI enabled: #{agent.ai_enabled}"
puts "   AI available: #{agent.ai_available?}"

# Step 3: calling an AI method is expected to raise DependencyMissingError
puts "\n3. Testing AI methods without API key:"
require 'tempfile'
# Remembered so the closing summary reflects what actually happened here.
missing_dependency_reported = false
sample_file = Tempfile.new(['test', '.txt'])
begin
  sample_file.write("This is a test document for AI processing.")
  sample_file.close
  UniversalDocumentProcessor.ai_analyze(sample_file.path)
  puts "   ERROR: Should have raised an exception!"
rescue UniversalDocumentProcessor::DependencyMissingError => e
  missing_dependency_reported = true
  puts "   ✓ Correctly raised DependencyMissingError: #{e.message}"
rescue => e
  puts "   ✗ Unexpected error: #{e.class} - #{e.message}"
ensure
  # The sample file is only needed for this step; remove it on every path.
  sample_file.close!
end

# Step 4: feature list
puts "\n4. Available features:"
features = UniversalDocumentProcessor.available_features
puts "   Features: #{features.join(', ')}"
puts "   AI processing included: #{features.include?(:ai_processing)}"

# Step 5: optional and missing dependencies
puts "\n5. Optional dependencies:"
optional_deps = UniversalDocumentProcessor.optional_dependencies
puts "   Optional dependencies: #{optional_deps.keys.join(', ')}"
missing_deps = UniversalDocumentProcessor.missing_dependencies
puts "   Missing dependencies: #{missing_deps.join(', ')}"

# Step 6: installation instructions
puts "\n6. Installation instructions:"
puts UniversalDocumentProcessor.installation_instructions

# Step 7: repeat the availability probes when a key is provided
if ENV['OPENAI_API_KEY'] && !ENV['OPENAI_API_KEY'].empty?
  puts "\n7. Testing with API key:"
  ai_available_with_key = UniversalDocumentProcessor.ai_available?
  puts "   AI Available with key: #{ai_available_with_key}"
  agent_with_key = UniversalDocumentProcessor.create_ai_agent
  puts "   Agent AI enabled: #{agent_with_key.ai_enabled}"
else
  puts "\n7. Skipping API key test (OPENAI_API_KEY not set)"
end

puts "\n" + "=" * 50
puts "AI Dependency Test Complete!"
if missing_dependency_reported
  puts "✓ AI features are properly optional"
  puts "✓ Clear error messages when dependencies missing"
  puts "✓ Graceful degradation when features unavailable"
else
  # Do not claim success when step 3 did not see the expected error.
  puts "✗ AI call did not raise DependencyMissingError (see step 3 above)"
  exit 1
end

data/test_core_functionality.rb ADDED

@@ -0,0 +1,280 @@
+#!/usr/bin/env ruby
+# Add lib directory to load path
+$LOAD_PATH.unshift File.expand_path('lib', __dir__)
+# Load the gem
+require 'universal_document_processor'
+require 'tempfile'
# Banner, then the two tallies the rest of the script increments:
# checks attempted and checks passed.
puts "Testing Core Functionality", "=" * 50
test_count = passed_count = 0
# Runs one named check and prints a numbered PASS/FAIL line for it.
#
# The check passes when the block returns without raising. Any StandardError
# marks it as failed and its message is printed; set DEBUG in the environment
# to also print the first backtrace frame.
#
# @param description [String] label printed after the check number
# @yield the assertions that make up the check
# @return [Boolean] true when the block finished without raising
def test(description)
  # Number the checks 1, 2, 3... in call order. Using the call-site line
  # number here would print labels such as "152." that do not correspond to
  # any test number.
  @test_sequence = (@test_sequence || 0) + 1
  print "#{@test_sequence}. #{description}... "
  yield
  puts "✓ PASS"
  true
rescue => e
  puts "✗ FAIL: #{e.message}"
  puts "   #{e.backtrace.first}" if ENV['DEBUG'] && e.backtrace
  false
end
# Build one closed temp file per input format exercised by the checks below.
puts "\nCreating sample files..."

# Writes +body+ to a fresh "test<ext>" temp file, closes it, and returns it.
write_sample = lambda do |extension, body|
  Tempfile.new(['test', extension]).tap do |handle|
    handle.write(body)
    handle.close
  end
end

# Plain text
txt_file = write_sample.call('.txt', "This is a sample text file.\nIt has multiple lines.\nUsed for testing.")
# Comma-separated values
csv_file = write_sample.call('.csv', "Name,Age,City\nJohn,25,New York\nJane,30,Los Angeles\nBob,35,Chicago")
# Tab-separated values
tsv_file = write_sample.call('.tsv', "Name\tAge\tCity\nJohn\t25\tNew York\nJane\t30\tLos Angeles\nBob\t35\tChicago")
# JSON
json_file = write_sample.call('.json', '{"name": "Test Document", "type": "sample", "data": [1, 2, 3, 4, 5]}')
# XML
xml_file = write_sample.call('.xml', <<~XML)
  <?xml version="1.0" encoding="UTF-8"?>
  <document>
    <title>Sample XML Document</title>
    <content>This is a sample XML file for testing.</content>
  </document>
XML

puts "Sample files created successfully!"
# Run tests
puts "\nRunning Core Tests:"
puts "-" * 30
# Core checks. Each one increments test_count, runs its assertions through
# `test` (which prints PASS/FAIL and returns a boolean), and increments
# passed_count when the check passed.

# Test 1: Version number
test_count += 1
passed = test("Version number is defined") do
  version = UniversalDocumentProcessor::VERSION
  raise "Version is nil" if version.nil?
  # Anchored at the start so text before the digits is rejected; the end is
  # left open so suffixes such as ".pre" are still accepted.
  raise "Version format invalid" unless version.match?(/\A\d+\.\d+\.\d+/)
end
passed_count += 1 if passed

# Test 2: Text file processing
test_count += 1
passed = test("Text file processing") do
  result = UniversalDocumentProcessor.process(txt_file.path)
  raise "Result is not a hash" unless result.is_a?(Hash)
  raise "Missing text key" unless result.has_key?(:text)
  raise "Missing metadata key" unless result.has_key?(:metadata)
  raise "Text content incorrect" unless result[:text].include?("sample text file")
  raise "Format incorrect" unless result[:metadata][:format] == "txt"
end
passed_count += 1 if passed

# Test 3: Text extraction
test_count += 1
passed = test("Text extraction method") do
  text = UniversalDocumentProcessor.extract_text(txt_file.path)
  raise "Text is not a string" unless text.is_a?(String)
  raise "Text content missing" unless text.include?("sample text file")
end
passed_count += 1 if passed

# Test 4: Metadata extraction
test_count += 1
passed = test("Metadata extraction") do
  metadata = UniversalDocumentProcessor.get_metadata(txt_file.path)
  raise "Metadata is not a hash" unless metadata.is_a?(Hash)
  raise "Format missing" unless metadata[:format] == "txt"
  # to_i turns a missing (nil) size into 0 so the failure reads
  # "File size missing" instead of a NoMethodError on nil.
  raise "File size missing" unless metadata[:file_size].to_i > 0
end
passed_count += 1 if passed

# Test 5: CSV processing
test_count += 1
passed = test("CSV file processing") do
  result = UniversalDocumentProcessor.process(csv_file.path)
  raise "Result is not a hash" unless result.is_a?(Hash)
  raise "Missing tables key" unless result.has_key?(:tables)
  raise "Format incorrect" unless result[:metadata][:format] == "csv"
  raise "Delimiter incorrect" unless result[:metadata][:delimiter] == "comma"
  raise "No tables found" unless result[:tables].length > 0
end
passed_count += 1 if passed

# Test 6: TSV processing
test_count += 1
passed = test("TSV file processing") do
  result = UniversalDocumentProcessor.process(tsv_file.path)
  raise "Result is not a hash" unless result.is_a?(Hash)
  raise "Missing tables key" unless result.has_key?(:tables)
  raise "Format incorrect" unless result[:metadata][:format] == "tsv"
  raise "Delimiter incorrect" unless result[:metadata][:delimiter] == "tab"
  raise "No tables found" unless result[:tables].length > 0
end
passed_count += 1 if passed

# Test 7: JSON processing
test_count += 1
passed = test("JSON file processing") do
  result = UniversalDocumentProcessor.process(json_file.path)
  raise "Result is not a hash" unless result.is_a?(Hash)
  raise "Format incorrect" unless result[:metadata][:format] == "json"
  raise "Text missing" unless result[:text].include?("Test Document")
end
passed_count += 1 if passed

# Test 8: XML processing
test_count += 1
passed = test("XML file processing") do
  result = UniversalDocumentProcessor.process(xml_file.path)
  raise "Result is not a hash" unless result.is_a?(Hash)
  raise "Format incorrect" unless result[:metadata][:format] == "xml"
  raise "Text missing" unless result[:text].include?("Sample XML Document")
end
passed_count += 1 if passed

# Test 9: Batch processing
test_count += 1
passed = test("Batch processing") do
  files = [txt_file.path, csv_file.path, json_file.path]
  results = UniversalDocumentProcessor.batch_process(files)
  raise "Results not array" unless results.is_a?(Array)
  raise "Wrong number of results" unless results.length == 3
  results.each do |result|
    raise "Missing text or error key" unless result.has_key?(:text) || result.has_key?(:error)
  end
end
passed_count += 1 if passed

# Test 10: Available features
test_count += 1
passed = test("Available features check") do
  features = UniversalDocumentProcessor.available_features
  raise "Features not array" unless features.is_a?(Array)
  raise "Missing text processing" unless features.include?(:text_processing)
  raise "Missing CSV processing" unless features.include?(:csv_processing)
  raise "Missing TSV processing" unless features.include?(:tsv_processing)
end
passed_count += 1 if passed

# Test 11: Dependency checking
test_count += 1
passed = test("Dependency availability check") do
  # The dependency may or may not be installed; only the return type is checked.
  pdf_available = UniversalDocumentProcessor.dependency_available?(:pdf_reader)
  raise "Dependency check failed" unless [true, false].include?(pdf_available)
end
passed_count += 1 if passed

# Test 12: Text quality analysis
test_count += 1
passed = test("Text quality analysis") do
  analysis = UniversalDocumentProcessor.analyze_text_quality("Clean text")
  raise "Analysis not hash" unless analysis.is_a?(Hash)
  raise "Missing valid_characters" unless analysis.has_key?(:valid_characters)
  raise "Missing invalid_characters" unless analysis.has_key?(:invalid_characters)
end
passed_count += 1 if passed

# Test 13: Text cleaning
test_count += 1
passed = test("Text cleaning") do
  dirty_text = "Clean\x00text"
  clean_text = UniversalDocumentProcessor.clean_text(dirty_text)
  raise "Cleaning failed" if clean_text.include?("\x00")
end
passed_count += 1 if passed

# Test 14: Japanese text detection
test_count += 1
passed = test("Japanese text detection") do
  english = "This is English"
  japanese = "これは日本語"
  raise "English detected as Japanese" if UniversalDocumentProcessor.japanese_text?(english)
  raise "Japanese not detected" unless UniversalDocumentProcessor.japanese_text?(japanese)
end
passed_count += 1 if passed

# Test 15: Optional dependencies info
test_count += 1
passed = test("Optional dependencies information") do
  optional_deps = UniversalDocumentProcessor.optional_dependencies
  raise "Optional deps not hash" unless optional_deps.is_a?(Hash)
  raise "Missing pdf-reader" unless optional_deps.has_key?('pdf-reader')
  missing_deps = UniversalDocumentProcessor.missing_dependencies
  raise "Missing deps not array" unless missing_deps.is_a?(Array)
  instructions = UniversalDocumentProcessor.installation_instructions
  raise "Instructions not string" unless instructions.is_a?(String)
end
passed_count += 1 if passed

# Test 16: AI availability check. The "not available" expectation only
# applies when no key is configured, so it is waived when OPENAI_API_KEY is
# set instead of reporting a spurious failure.
test_count += 1
passed = test("AI availability check") do
  api_key_configured = !ENV['OPENAI_API_KEY'].to_s.empty?
  ai_available = UniversalDocumentProcessor.ai_available?
  raise "AI should not be available without key" if ai_available && !api_key_configured
end
passed_count += 1 if passed

# Test 17: Error handling for unsupported format
test_count += 1
passed = test("Error handling for unsupported format") do
  unsupported_file = Tempfile.new(['test', '.unknown'])
  unsupported_file.write("test content")
  unsupported_file.close
  begin
    UniversalDocumentProcessor.process(unsupported_file.path)
    raise "Should have raised UnsupportedFormatError"
  rescue UniversalDocumentProcessor::UnsupportedFormatError
    # Expected error
  rescue => e
    raise "Wrong error type: #{e.class}"
  ensure
    unsupported_file.unlink
  end
end
passed_count += 1 if passed
# Remove the sample files that still exist on disk.
puts "\nCleaning up temporary files..."
[txt_file, csv_file, tsv_file, json_file, xml_file]
  .select { |sample| File.exist?(sample.path) }
  .each(&:unlink)

# Summary: totals, success percentage, and an exit status of 0 only when
# every check passed.
failed_count = test_count - passed_count
success_rate = ((passed_count.to_f / test_count) * 100).round(1)
puts "\n" + "=" * 50
puts "Test Results:"
puts "  Total tests: #{test_count}"
puts "  Passed: #{passed_count}"
puts "  Failed: #{failed_count}"
puts "  Success rate: #{success_rate}%"

all_passed = failed_count.zero?
puts(all_passed ? "\n🎉 All tests passed! Core functionality is working correctly." : "\n❌ Some tests failed. Please check the issues above.")
exit(all_passed ? 0 : 1)

data/test_performance_memory.rb ADDED

@@ -0,0 +1,271 @@
+#!/usr/bin/env ruby
+# Performance and Memory Usage Analysis for Universal Document Processor
+# This test checks if we need to add performance guidelines and memory usage documentation
# Title banner followed by a 70-character rule.
puts "🚀 Performance & Memory Analysis - Universal Document Processor", "=" * 70
+$LOAD_PATH.unshift File.expand_path('lib', __dir__)
+require 'universal_document_processor'
+require 'tempfile'
+require 'benchmark'
# Reports the memory currently used by this process, in kilobytes.
#
# Sources, by platform:
# - Windows: the "Mem Usage" column of `tasklist` CSV output
# - systems with /proc: the VmRSS line of /proc/self/status
# - anything else: `ps -o rss=`
#
# This is a best-effort probe for the benchmark output, so any failure
# yields 0 instead of raising.
#
# @return [Integer] memory in KB, or 0 when it cannot be determined
def get_memory_usage
  if RUBY_PLATFORM.match?(/mswin|mingw/)
    # `2>nul` is only valid under cmd.exe; on a POSIX shell it would create
    # a file literally named "nul", hence the platform check.
    parse_tasklist_memory(`tasklist /FI "PID eq #{Process.pid}" /FO CSV 2>nul`)
  elsif File.readable?('/proc/self/status')
    File.read('/proc/self/status')[/^VmRSS:\s+(\d+)\s+kB/, 1].to_i
  else
    `ps -o rss= -p #{Process.pid} 2>/dev/null`.to_i
  end
rescue StandardError
  0
end

# Extracts the memory figure (KB) from `tasklist /FO CSV` output.
#
# The first line is the header and the second the process row. Fields are
# double-quoted, and the memory field itself contains a thousands separator
# (for example "123,456 K"), so the row must be split on the quotes, not on
# commas.
#
# @param csv_output [String, nil] raw command output
# @return [Integer] memory in KB, or 0 when no process row is present
def parse_tasklist_memory(csv_output)
  process_row = csv_output.to_s.lines[1]
  return 0 unless process_row

  memory_field = process_row.scan(/"([^"]*)"/).flatten[4]
  # Keep digits only: drops the separator (",", "." or space) and the unit.
  memory_field.to_s.delete('^0-9').to_i
end
# Formats a kilobyte count for display.
#
# Values whose magnitude exceeds 1024 KB are shown in MB with one decimal;
# everything else is shown in KB. The magnitude is used because callers pass
# memory deltas, which can be negative.
#
# @param kb [Numeric] size or delta in kilobytes
# @return [String] for example "512 KB", "2.0 MB" or "-2.0 MB"
def format_memory(kb)
  return "#{kb} KB" unless kb.abs > 1024

  "#{(kb / 1024.0).round(1)} MB"
end
# Writes generated content to a closed "perf_test*.txt" temp file and prints
# a one-line summary of what was created.
#
# @param size_description [String] label used in the printed summary
# @param content_generator [#call] returns the content to write
# @return [Array(Tempfile, Integer)] the temp file and its size in bytes
def create_test_file(size_description, content_generator)
  tempfile = Tempfile.new(['perf_test', '.txt'])
  tempfile.write(content_generator.call)
  tempfile.close

  bytes_on_disk = File.size(tempfile.path)
  puts "  📁 Created #{size_description}: #{format_memory(bytes_on_disk / 1024)} (#{tempfile.path})"
  [tempfile, bytes_on_disk]
end
issues_found = []
performance_concerns = []
puts "\n📊 PERFORMANCE TESTING"
puts "-" * 50

# Processes the file at +path+ once, prints the elapsed time and the change
# in memory, and returns [elapsed_seconds, memory_delta_kb].
profile_processing = lambda do |path|
  memory_before = get_memory_usage
  elapsed = Benchmark.realtime { UniversalDocumentProcessor.process(path) }
  memory_delta = get_memory_usage - memory_before
  puts "  ⏱️  Processing time: #{(elapsed * 1000).round(2)} ms"
  puts "  🧠 Memory change: #{format_memory(memory_delta)}"
  [elapsed, memory_delta]
end

# Scenario 1: small file; its time is the baseline for the speed ratios below
puts "\n1️⃣ Small File Performance (Baseline)"
small_file, = create_test_file("small file", -> { "Hello World!\n" * 100 })
baseline_time, = profile_processing.call(small_file.path)
small_file.unlink

# Scenario 2: medium file
puts "\n2️⃣ Medium File Performance (1MB)"
medium_file, = create_test_file("medium file", -> { "This is a test line with some content.\n" * 25000 })
medium_time, = profile_processing.call(medium_file.path)
puts "  📈 Speed ratio: #{(medium_time / baseline_time).round(1)}x slower than baseline"
performance_concerns << "Medium files (1MB) take #{medium_time.round(2)} seconds to process" if medium_time > 2.0
medium_file.unlink

# Scenario 3: large file; flagged for slow processing or heavy memory growth
puts "\n3️⃣ Large File Performance (5MB)"
large_file, = create_test_file("large file", -> { "This is a longer test line with more content to simulate real documents.\n" * 75000 })
large_time, large_memory_delta = profile_processing.call(large_file.path)
puts "  📈 Speed ratio: #{(large_time / baseline_time).round(1)}x slower than baseline"
performance_concerns << "Large files (5MB) take #{large_time.round(2)} seconds to process" if large_time > 10.0
# get_memory_usage reports KB, so 100000 is roughly 100MB
performance_concerns << "Large files use #{format_memory(large_memory_delta)} of memory" if large_memory_delta > 100000
large_file.unlink
puts "\n💾 MEMORY USAGE TESTING"
puts "-" * 50
# Test 4: Memory usage with multiple files
puts "\n4️⃣ Batch Processing Memory Test"
# Keep the Tempfile objects themselves, not only their paths. A Tempfile
# deletes its file when the object is garbage collected, so holding just the
# path lets the files disappear before or while the batch is processed.
batch_tempfiles = []
file_sizes = []
5.times do |i|
  file, size = create_test_file("batch file #{i+1}", -> { "Batch processing test content line #{i}.\n" * 5000 })
  batch_tempfiles << file
  file_sizes << size
end
files = batch_tempfiles.map(&:path)
total_file_size = file_sizes.sum
total_file_size_kb = total_file_size / 1024
puts "  📦 Total file size: #{format_memory(total_file_size_kb)}"

start_memory = get_memory_usage
time_taken = Benchmark.realtime do
  UniversalDocumentProcessor.batch_process(files)
end
end_memory = get_memory_usage
memory_used = end_memory - start_memory
# Memory growth relative to the combined on-disk size of the batch
memory_ratio = memory_used.to_f / total_file_size_kb
puts "  ⏱️  Batch processing time: #{(time_taken * 1000).round(2)} ms"
puts "  🧠 Memory used: #{format_memory(memory_used)}"
puts "  📊 Memory efficiency: #{memory_ratio.round(2)}x file size"
if memory_used > total_file_size_kb * 3  # More than 3x file size
  performance_concerns << "Batch processing uses #{memory_ratio.round(1)}x the file size in memory"
end
# Cleanup
batch_tempfiles.each(&:unlink)
# Scenario 5: one large CSV (a header plus 10,000 generated rows)
puts "\n5️⃣ Structured Data Processing Performance"
csv_header = "Name,Age,Email,Department,Salary,Location,Phone"
csv_rows = (0...10000).map do |i|
  "User#{i},#{20+i%50},user#{i}@example.com,Dept#{i%10},#{30000+i*10},City#{i%100},555-#{i.to_s.rjust(4, '0')}"
end
csv_file = Tempfile.new(['large', '.csv'])
csv_file.write([csv_header, *csv_rows].join("\n"))
csv_file.close
csv_size_label = format_memory(File.size(csv_file.path) / 1024)
puts "  📊 Large CSV size: #{csv_size_label}"

csv_memory_before = get_memory_usage
csv_time = Benchmark.realtime { UniversalDocumentProcessor.process(csv_file.path) }
csv_memory_after = get_memory_usage
puts "  ⏱️  CSV processing time: #{(csv_time * 1000).round(2)} ms"
puts "  🧠 Memory change: #{format_memory(csv_memory_after - csv_memory_before)}"
performance_concerns << "Large CSV files (#{csv_size_label}) take #{csv_time.round(2)} seconds" if csv_time > 5.0
csv_file.unlink

# Scenario 6: text mixing Japanese, emoji and accented Latin characters
puts "\n6️⃣ Unicode Content Performance"
unicode_file = Tempfile.new(['unicode', '.txt'])
unicode_file.write("これは日本語のテストです。🌟 This includes emoji and special characters: áéíóú, ñ, ç, ü\n" * 5000)
unicode_file.close

unicode_memory_before = get_memory_usage
unicode_time = Benchmark.realtime { UniversalDocumentProcessor.process(unicode_file.path) }
unicode_memory_after = get_memory_usage
puts "  ⏱️  Unicode processing time: #{(unicode_time * 1000).round(2)} ms"
puts "  🧠 Memory change: #{format_memory(unicode_memory_after - unicode_memory_before)}"
unicode_file.unlink
# Final report: lists the concerns collected above, prints documentation
# suggestions, and exits with 0 (no concerns), 2 (one or two concerns) or
# 1 (three or more).
puts "\n" + "=" * 70
puts "🎯 PERFORMANCE & MEMORY ANALYSIS RESULTS"
puts "=" * 70

puts "\n📈 PERFORMANCE CONCERNS FOUND:"
if performance_concerns.empty?
  puts "✅ No significant performance issues detected!"
  puts "   The gem performs well within reasonable limits."
else
  performance_concerns.each.with_index(1) do |concern, position|
    puts "⚠️  #{position}. #{concern}"
  end
end

puts "\n📚 DOCUMENTATION RECOMMENDATIONS:"
puts "\n4️⃣ Performance Guidelines Needed:"
guidelines_needed = []
# Timing-related concerns all contain the word "seconds".
if performance_concerns.any? { |c| c.include?("seconds") }
  guidelines_needed << "Processing time expectations for different file sizes"
  guidelines_needed << "Recommended file size limits for real-time processing"
end
# Memory-related concerns all contain the word "memory".
if performance_concerns.any? { |c| c.include?("memory") }
  guidelines_needed << "Memory usage patterns and optimization tips"
  guidelines_needed << "Best practices for batch processing large files"
end
# These two are always suggested, so the list is never empty and no
# "nothing to add" branch is needed.
guidelines_needed << "Performance comparison between different file formats"
guidelines_needed << "Optimization tips for production environments"
puts "📋 Suggested documentation additions:"
guidelines_needed.each.with_index(1) do |guideline, position|
  puts "   #{position}. #{guideline}"
end

puts "\n5️⃣ Memory Usage Documentation Needed:"
memory_docs_needed = [
  "Expected memory usage patterns (typically 2-3x file size)",
  "Memory-efficient processing tips for large files",
  "Batch processing memory considerations",
  "When to process files individually vs. in batches"
]
puts "📋 Suggested memory usage documentation:"
memory_docs_needed.each.with_index(1) do |doc, position|
  puts "   #{position}. #{doc}"
end

puts "\n💡 SPECIFIC RECOMMENDATIONS:"
puts "1. Add a PERFORMANCE.md file with benchmarks and guidelines"
puts "2. Include memory usage examples in README"
puts "3. Add performance tips to method documentation"
puts "4. Consider adding a performance_info method to the gem"
puts "5. Document recommended file size limits for different use cases"

puts "\n🎯 CONCLUSION:"
if performance_concerns.length > 2
  puts "❌ Performance documentation is NEEDED - several concerns found"
  exit 1
elsif performance_concerns.length > 0
  puts "⚠️  Performance documentation would be HELPFUL - some concerns found"
  exit 2
else
  puts "✅ Performance is good, but documentation would still be valuable for users"
  exit 0
end